# Data Preprocessing

## Loading datasets and creating a merged dataframe

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the three raw CSV files into DataFrames (customers, products, transactions).
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [None]:
# Convert the date columns from strings to datetime dtypes.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# 'TransactionDate' holds a combined "<date> <time>" string. Parse it ONCE as a full
# timestamp and derive both parts from that value. Parsing the bare time string with
# pd.to_datetime would silently attach the date on which the notebook is run (and emit
# a format-inference warning), making the column differ from one day to the next.
# The guard keeps the cell safe to re-run: once 'TransactionTime' exists the raw
# combined string is gone, so deriving it again would overwrite every time with 00:00:00.
if 'TransactionTime' not in transactions.columns:
    transaction_timestamp = pd.to_datetime(transactions['TransactionDate'])
    # Midnight-normalised datetime64 values (date part only).
    transactions['TransactionDate'] = transaction_timestamp.dt.normalize()
    # datetime.time objects (time-of-day only, no fabricated date).
    transactions['TransactionTime'] = transaction_timestamp.dt.time

  transactions['TransactionTime']=pd.to_datetime(transactions['TransactionTime'])


In [None]:
# Join every transaction with its customer and product details. Both frames being
# merged carry a 'Price' column, so pandas suffixes them; keep the transaction-side
# value under the plain name 'Price' and discard the product-side duplicate.
merged_df = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
    .drop(columns='Price_y')
    .rename(columns={'Price_x': 'Price'})
)
merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,TransactionTime,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25,1,300.68,300.68,2025-01-26 12:38:23,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27,1,300.68,300.68,2025-01-26 22:23:54,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25,1,300.68,300.68,2025-01-26 07:38:55,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26,2,601.36,300.68,2025-01-26 22:55:37,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21,3,902.04,300.68,2025-01-26 15:10:10,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics


## Adding metrics

In [None]:
# Total units sold per product, min-max scaled to [0, 1] as a per-product demand score.
product_demand = (
    merged_df.groupby('ProductID')['Quantity']
    .sum()
    .reset_index()
    .rename(columns={'Quantity': 'TotalDemand'})
)
scaler = MinMaxScaler()
# fit_transform returns an (n, 1) array; flatten it so the column assignment is explicit.
product_demand['DemandScore'] = scaler.fit_transform(product_demand[['TotalDemand']]).ravel()
product_demand = product_demand.merge(products[['ProductID', 'ProductName']], on='ProductID', how='left')

# Attach the score to every transaction row. Any 'DemandScore' left over from an
# earlier run of this cell is dropped first; otherwise the merge would produce
# 'DemandScore_x' / 'DemandScore_y' and later cells selecting 'DemandScore' would fail.
merged_df = (
    merged_df
    .drop(columns=['DemandScore'], errors='ignore')
    .merge(product_demand[['ProductID', 'DemandScore']], on='ProductID', how='left')
)

In [None]:
# Units sold per product, rescaled linearly so the least-sold product scores 0
# and the best-selling product scores 100.
product_quantities = (
    merged_df
    .groupby('ProductID')['Quantity']
    .sum()
    .reset_index()
)
quantity_totals = product_quantities['Quantity']
min_quantity, max_quantity = quantity_totals.min(), quantity_totals.max()
quantity_range = max_quantity - min_quantity
product_quantities['SellingScore'] = 100 * (quantity_totals - min_quantity) / quantity_range

In [None]:
# Total revenue per product plus a min-max scaled [0, 1] revenue score.
product_revenue = (
    merged_df.groupby('ProductID')['TotalValue']
    .sum()
    .reset_index()
    .rename(columns={'TotalValue': 'TotalRevenue'})
)
# fit_transform returns an (n, 1) array; flatten it so the column assignment is explicit.
product_revenue['RevenueScore'] = MinMaxScaler().fit_transform(product_revenue[['TotalRevenue']]).ravel()

# Attach both product-level values to every transaction row. Columns left over from
# an earlier run of this cell are dropped first so that re-running does not create
# '_x' / '_y' suffixed duplicates.
revenue_columns = ['TotalRevenue', 'RevenueScore']
merged_df = (
    merged_df
    .drop(columns=revenue_columns, errors='ignore')
    .merge(product_revenue[['ProductID'] + revenue_columns], on='ProductID', how='left')
)

In [None]:
# Total spending per customer, bucketed into spending quartiles.
customer_revenue = (
    merged_df.groupby('CustomerID')['TotalValue']
    .sum()
    .reset_index()
    .rename(columns={'TotalValue': 'TotalSpending'})
)
# q=4 splits customers into four equal-sized groups, labelled from lowest to highest spend.
customer_revenue['CustomerTier'] = pd.qcut(
    customer_revenue['TotalSpending'], q=4, labels=['Low', 'Medium', 'High', 'VIP']
)

# Attach the tier to every transaction row. A 'CustomerTier' left over from an earlier
# run of this cell is dropped first so that re-running does not create
# 'CustomerTier_x' / 'CustomerTier_y'.
merged_df = (
    merged_df
    .drop(columns=['CustomerTier'], errors='ignore')
    .merge(customer_revenue[['CustomerID', 'CustomerTier']], on='CustomerID', how='left')
)

In [None]:
# Display the column labels of merged_df after the metric columns have been merged in.
merged_df.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price', 'TransactionTime', 'CustomerName',
       'Region', 'SignupDate', 'ProductName', 'Category', 'DemandScore',
       'TotalRevenue', 'RevenueScore', 'CustomerTier'],
      dtype='object')

# Model Building

## Importing libraries

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

## Finding similarity

In [None]:
# NOTE(review): customer_features (with one-hot encoded CustomerTier / Region) is built
# here but is not read by the similarity computation below, so tier and region do not
# currently influence the lookalike scores — confirm whether they were meant to.
customer_features = merged_df[['CustomerID', 'TotalRevenue', 'CustomerTier', 'Region', 'DemandScore', 'RevenueScore']]

customer_features = pd.get_dummies(customer_features, columns=['CustomerTier', 'Region'])

# One row per customer, aggregated over that customer's transactions.
# NOTE(review): 'TotalRevenue' is a product-level total merged onto each transaction,
# so its per-customer sum reflects the revenue of the products bought rather than the
# customer's own spending — confirm this is the intended feature.
customer_profile = merged_df.groupby('CustomerID').agg({
    'TotalRevenue': 'sum',
    'DemandScore': 'mean',
    'RevenueScore': 'mean',
    'Quantity': 'sum'
}).reset_index()

# Z-score every feature so that no single scale dominates the cosine similarity.
profile_values = customer_profile.drop(columns='CustomerID')
# A constant feature has zero standard deviation; dividing by it would yield NaN
# (0 / 0) and make cosine_similarity raise. Dividing by 1 instead leaves that
# already-centred column at 0 and does not change any other column.
profile_std = profile_values.std().replace(0, 1)
customer_profile_normalized = (profile_values - profile_values.mean()) / profile_std

# Row/column i of the matrix corresponds to row position i of customer_profile.
similarity_matrix = cosine_similarity(customer_profile_normalized)

def recommend_similar_customers(customer_id, customer_profile, similarity_matrix, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Identifier to look up in ``customer_profile['CustomerID']``.
    customer_profile : pandas.DataFrame
        Frame with a ``CustomerID`` column. Row *position* ``i`` must correspond
        to row ``i`` of ``similarity_matrix``; the frame's index labels are ignored.
    similarity_matrix : 2-D array-like
        Pairwise similarity scores, addressed by row position of ``customer_profile``.
    top_n : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    list of tuple
        ``(customer_id, score)`` pairs sorted by descending score, excluding
        ``customer_id`` itself. Scores are plain Python floats.

    Raises
    ------
    ValueError
        If ``customer_id`` does not occur in ``customer_profile['CustomerID']``.
    """
    customer_ids = customer_profile['CustomerID'].to_numpy()

    matching_positions = np.flatnonzero(customer_ids == customer_id)
    if matching_positions.size == 0:
        raise ValueError(f"Customer ID {customer_id} not found in the dataset.")

    # Address the matrix by row POSITION. The DataFrame's index label only equals the
    # position for a default RangeIndex, so using the label would silently read the
    # wrong row (or go out of bounds) for a filtered or re-indexed profile.
    similarity_scores = similarity_matrix[int(matching_positions[0])]

    # float() keeps the scores independent of the NumPy scalar type, so printing or
    # str()-serialising the result looks the same under every NumPy version.
    similar_customers = [
        (other_id, float(score))
        for other_id, score in zip(customer_ids, similarity_scores)
        if other_id != customer_id
    ]

    # sorted() is stable: customers with equal scores keep their profile order.
    return sorted(similar_customers, key=lambda pair: pair[1], reverse=True)[:top_n]

## Creating a CSV file for the first 20 customers

In [None]:
# Build the lookalike table: for each of the first 20 customers in Customers.csv,
# the top 3 most similar customers with their similarity scores.
first_20_customers = customers['CustomerID'].head(20)
lookalike_map = {}

for customer_id in first_20_customers:
    try:
        recommendations = recommend_similar_customers(
            customer_id, customer_profile, similarity_matrix, top_n=3
        )
    except ValueError:
        # Raised when the customer has no row in customer_profile; report and skip.
        print(f"Customer ID {customer_id} not found in the dataset.")
    else:
        # Store plain Python floats. The list is serialised with str() below, and
        # NumPy >= 2 renders its scalars as "np.float64(0.98...)", which would leak
        # into the CSV; float() keeps the text as "0.98..." under every NumPy version.
        lookalike_map[customer_id] = [
            (similar_id, float(score)) for similar_id, score in recommendations
        ]

# Passing `columns` keeps both headers present even if no customer could be matched.
lookalike_df = pd.DataFrame(
    [
        {
            'cust_id': cust_id,
            'similar_customer_ids': str(similar)
        }
        for cust_id, similar in lookalike_map.items()
    ],
    columns=['cust_id', 'similar_customer_ids'],
)

In [None]:
# Write the lookalike table to disk; index=False leaves the DataFrame index out of the file.
lookalike_df.to_csv('Niranjana_J_Lookalike.csv', index=False)

print("Niranjana_J_Lookalike.csv has been created successfully.")

Niranjana_J_Lookalike.csv has been created successfully.


In [None]:
# Read the exported file back in and display it to check what was written.
new_df=pd.read_csv('Niranjana_J_Lookalike.csv')
new_df

Unnamed: 0,cust_id,similar_customer_ids
0,C0001,"[('C0064', 0.9866876047020321), ('C0183', 0.97..."
1,C0002,"[('C0031', 0.9952676542464393), ('C0036', 0.99..."
2,C0003,"[('C0160', 0.9670787404153591), ('C0086', 0.96..."
3,C0004,"[('C0045', 0.9771513739602529), ('C0113', 0.97..."
4,C0005,"[('C0080', 0.9842578026282008), ('C0085', 0.97..."
5,C0006,"[('C0026', 0.9949204802838021), ('C0135', 0.97..."
6,C0007,"[('C0078', 0.9732522289917703), ('C0085', 0.96..."
7,C0008,"[('C0108', 0.9974711312538116), ('C0179', 0.96..."
8,C0009,"[('C0092', 0.9760803052781241), ('C0186', 0.95..."
9,C0010,"[('C0142', 0.9852296901232482), ('C0055', 0.97..."
