In [4]:
import pandas as pd
# Load the three source tables from CSV files in the working directory.
# The generator is unpacked in the same order as the file names listed below.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [6]:
# Enrich every transaction with its customer attributes and then its product
# attributes. Both joins use pandas' default join type (inner), so a
# transaction whose CustomerID or ProductID has no match is dropped.
# A `Price` column exists on both sides of the second join, which is why the
# result carries the suffixed columns `Price_x` and `Price_y`.
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)


In [7]:
# Plain-text preview of the first five rows of the merged frame
# (five is also the default row count of DataFrame.head).
print(data.head(n=5))

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [8]:
# One row per customer:
#   * ProductName - every purchased product name joined into a single
#     space-separated string (one entry per transaction row, so repeat
#     purchases repeat the name),
#   * TotalValue  - the customer's summed transaction value.
# The customer master data (name, region, signup date) is then joined back on.
customer_profiles = (
    data
    .groupby('CustomerID')
    .agg({
        'ProductName': lambda names: ' '.join(names),
        'TotalValue': 'sum',
    })
    .reset_index()
    .merge(customers, on='CustomerID')
)

print(customer_profiles.head())


  CustomerID                                        ProductName  TotalValue  \
0      C0001  HomeSense Wall Art TechPro Headphones ActiveWe...     3354.52   
1      C0002  BookWorld Cookware Set BookWorld Rug ComfortLi...     1862.74   
2      C0003  ActiveWear T-Shirt ActiveWear Rug ActiveWear C...     2725.38   
3      C0004  BookWorld Bluetooth Speaker TechPro Rug TechPr...     5354.88   
4      C0005  TechPro Smartwatch ActiveWear Cookware Set Com...     2034.24   

         CustomerName         Region  SignupDate  
0    Lawrence Carroll  South America  2022-07-10  
1      Elizabeth Lutz           Asia  2022-02-13  
2      Michael Rivera  South America  2024-03-07  
3  Kathleen Rodriguez  South America  2022-10-09  
4         Laura Weber           Asia  2022-08-15  


In [9]:
# Show the column labels of the per-customer profile frame.
print(customer_profiles.columns)


Index(['CustomerID', 'ProductName', 'TotalValue', 'CustomerName', 'Region',
       'SignupDate'],
      dtype='object')


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn each customer's concatenated product-name string into a TF-IDF vector
# (vectorizer left at its default settings). Row i of `tfidf_matrix`
# corresponds to row i of `customer_profiles`.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(customer_profiles['ProductName'])

# Pairwise cosine similarity between all customer vectors; entry [i, j]
# compares customer row i with customer row j.
similarity_matrix = cosine_similarity(tfidf_matrix)

# Quick look at the top-left 5x5 corner of the similarity matrix.
print(similarity_matrix[0:5, 0:5])


[[1.         0.05039581 0.20372759 0.35351253 0.42482621]
 [0.05039581 1.         0.4355352  0.50105143 0.50000067]
 [0.20372759 0.4355352  1.         0.445044   0.60062276]
 [0.35351253 0.50105143 0.445044   1.         0.48920844]
 [0.42482621 0.50000067 0.60062276 0.48920844 1.        ]]


In [11]:
# Build the lookalike map for the first customers in `customer_profiles`:
#     {customer_id: [(similar_customer_id, score), ...]}
# where each list holds the TOP_N most similar *other* customers, best first,
# and `score` is the cosine similarity rounded to three decimals.
TOP_N = 3                # lookalikes kept per customer
FIRST_N_CUSTOMERS = 20   # only the first 20 profile rows are processed

# Positional list of IDs: row i of `similarity_matrix` belongs to
# customer_ids[i]. Using a list avoids label-based Series lookups, which only
# coincide with positions while the frame has a default 0..n-1 index.
customer_ids = customer_profiles['CustomerID'].tolist()

lookalikes = {}

for idx, customer_id in enumerate(customer_ids[:FIRST_N_CUSTOMERS]):
    # Drop the customer itself by index rather than by assuming it sorts
    # first: another customer with an identical profile ties at the top
    # score and could otherwise push the customer into its own list.
    candidates = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Stable sort, highest similarity first; ties keep row order.
    candidates.sort(key=lambda pair: -pair[1])

    lookalikes[customer_id] = [
        # float(...) converts the NumPy scalar to a built-in float so that
        # printing/saving shows `0.817` rather than `np.float64(0.817)` on
        # NumPy 2.
        (customer_ids[other_idx], float(round(score, 3)))
        for other_idx, score in candidates[:TOP_N]
    ]

for cust_id, similar in lookalikes.items():
    print(f"Customer {cust_id}: {similar}")


Customer C0001: [('C0197', 0.817), ('C0026', 0.737), ('C0100', 0.699)]
Customer C0002: [('C0133', 0.884), ('C0173', 0.766), ('C0109', 0.66)]
Customer C0003: [('C0164', 0.723), ('C0085', 0.658), ('C0181', 0.648)]
Customer C0004: [('C0118', 0.74), ('C0008', 0.719), ('C0075', 0.709)]
Customer C0005: [('C0128', 0.76), ('C0096', 0.746), ('C0014', 0.743)]
Customer C0006: [('C0187', 0.756), ('C0191', 0.675), ('C0139', 0.63)]
Customer C0007: [('C0181', 0.742), ('C0031', 0.686), ('C0118', 0.659)]
Customer C0008: [('C0057', 0.852), ('C0143', 0.777), ('C0075', 0.725)]
Customer C0009: [('C0093', 0.7), ('C0156', 0.699), ('C0062', 0.699)]
Customer C0010: [('C0058', 0.763), ('C0092', 0.715), ('C0040', 0.69)]
Customer C0011: [('C0084', 0.677), ('C0094', 0.644), ('C0091', 0.64)]
Customer C0012: [('C0136', 0.814), ('C0076', 0.74), ('C0148', 0.692)]
Customer C0013: [('C0040', 0.764), ('C0102', 0.723), ('C0153', 0.701)]
Customer C0014: [('C0128', 0.959), ('C0005', 0.743), ('C0012', 0.657)]
Customer C0015:

In [13]:
import csv

# Save the lookalike map to CSV: one row per customer, with the whole list of
# (similar_customer_id, score) tuples written as a single text cell.
with open('Mannu_patel_Lookalike.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['cust_id', 'lookalikes'])
    for cust_id, similar in lookalikes.items():
        # csv.writer stringifies the list with str(), which embeds the repr
        # of every element. Converting scores to built-in floats keeps the
        # cell as `[('C0197', 0.817), ...]` even when the scores are NumPy
        # scalars (whose repr is `np.float64(0.817)` on NumPy 2).
        plain_pairs = [(other_id, float(score)) for other_id, score in similar]
        writer.writerow([cust_id, plain_pairs])
