In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [28]:
# Load the merged dataset CSV into a DataFrame.
merged_dataset_path = "Resources/Merged_dataset.csv"
df = pd.read_csv(merged_dataset_path)

In [29]:
# Show the column labels of the loaded dataset.
df.columns

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'DayOfWeek', 'Monthly'],
      dtype='object')

In [30]:
# Build one row per customer: total spend, number of transactions, total
# quantity, the first Region seen for the customer, and the most frequent
# Category (first entry of the mode when several categories tie).
customer_features = (
    df.groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        TotalTransactions=('TransactionID', 'count'),
        TotalQuantity=('Quantity', 'sum'),
        Region=('Region', 'first'),
        PreferredCategory=('Category', lambda categories: categories.mode()[0]),
    )
    .reset_index()
)

print(customer_features.head())


  CustomerID  TotalValue  TotalTransactions  TotalQuantity         Region  \
0      C0001     3354.52                  5             12  South America   
1      C0002     1862.74                  4             10           Asia   
2      C0003     2725.38                  4             14  South America   
3      C0004     5354.88                  8             23  South America   
4      C0005     2034.24                  3              7           Asia   

  PreferredCategory  
0       Electronics  
1          Clothing  
2        Home Decor  
3             Books  
4       Electronics  


In [31]:
from sklearn.preprocessing import MinMaxScaler

# Numeric per-customer columns that are rescaled here.
numerical_features = ['TotalValue', 'TotalTransactions', 'TotalQuantity']

# Fit the scaler on the numeric columns and write the scaled values back
# into the same columns of customer_features.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values


In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the customers' numeric feature rows.
feature_matrix = customer_features[numerical_features]
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by ID.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)


Final: Lookalike Recommendations

1. Filter First 20 Customers as Input

In [33]:
# IDs of the 20 input customers: 'C0001' through 'C0020'.
target_customers = [f'C{number:04d}' for number in range(1, 21)]

# Keep only the feature rows that belong to those customers.
is_target = customer_features['CustomerID'].isin(target_customers)
first_20_customers = customer_features[is_target]

2. Compute Pairwise Similarity Across All Customers

In [34]:
# Pairwise cosine similarity over every customer's numeric feature row.
all_customer_vectors = customer_features[numerical_features]
similarity_matrix = cosine_similarity(all_customer_vectors)

In [35]:
# Wrap the similarity matrix in a DataFrame keyed by CustomerID on both
# axes, so a customer's scores can be read as similarity_df[customer_id].
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

3. Find the Top 3 Similar Customers for Each of the 20 Input Customers

In [36]:
# Prepare Lookalike.csv data: one row per target customer, holding the
# customer's ID followed by (lookalike ID, score) for its three most similar
# other customers, best match first.
lookalike_data = []
for customer in target_customers:
    # Remove the customer's own entry by label instead of assuming it sorts
    # to position 0. Slicing positions 1..3 after a sort is wrong whenever
    # the self-score is not the strict maximum (e.g. another customer ties
    # at the top score, or the customer's feature row is all zeros): a real
    # match gets skipped and the customer can appear as its own lookalike.
    other_scores = similarity_df[customer].drop(labels=customer)

    # Three highest scores in descending order; ties keep their original
    # order, so the result is deterministic.
    top_similar = other_scores.nlargest(3)

    # Flatten to [customer, id1, score1, id2, score2, id3, score3].
    row = [customer]
    for lookalike_id, score in zip(top_similar.index, top_similar.to_numpy()):
        row.extend([lookalike_id, score])
    lookalike_data.append(row)



In [37]:
# Column layout of the export: the customer ID followed by three
# (lookalike ID, score) pairs.
lookalike_columns = [
    'CustomerID',
    'lookalike1', 'score1',
    'lookalike2', 'score2',
    'lookalike3', 'score3',
]
lookalike_df = pd.DataFrame(lookalike_data, columns=lookalike_columns)

# Write the table to CSV without the DataFrame index.
lookalike_df.to_csv('Shubham_Mishra_Lookalike.csv', index=False)