In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three input tables from CSV files in the current working directory.
# The files are read in the order customers, products, transactions.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [3]:
# Enrich every transaction with its customer and product attributes.
# Left joins keep each transaction row even when its CustomerID / ProductID
# has no match in the lookup table (the joined columns are then NaN).
# Column names shared by both sides get pandas' default _x / _y suffixes.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [4]:
# Preview the first five merged rows to sanity-check the joins.
merged_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
# Aggregate transaction data for each customer (one feature row per CustomerID)
def _most_frequent(values):
    """Return the most frequent non-null value in ``values``, or NaN if none exists.

    ``Series.mode()`` ignores NaN and returns its results sorted, so ties
    resolve deterministically to the first value in sort order. When every
    entry is NaN (possible here because the left merges leave Region/Category
    empty for unmatched keys) the mode is an empty Series, and indexing it
    with ``[0]`` would raise instead of yielding a missing value.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',         # summed over all of the customer's transactions
    'Quantity': 'sum',           # summed over all of the customer's transactions
    'Region': _most_frequent,    # Most frequent region
    'Category': _most_frequent,  # Most frequent category
}).reset_index()

In [7]:
# One-hot encode the categorical features, keeping an indicator column for
# EVERY level. drop_first=True is meant for regression (it avoids collinear
# columns); for similarity it encodes the dropped baseline level as all zeros,
# so two customers sharing the baseline Region/Category would get no credit
# for that match while customers sharing any other level would.
# dtype=float keeps the resulting feature matrix purely numeric.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region', 'Category'],
    drop_first=False,
    dtype=float,
)

In [8]:
# Standardise the numeric features so neither TotalValue nor Quantity
# dominates the similarity purely because of its scale.
numerical_cols = ['TotalValue', 'Quantity']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_cols])
customer_features[numerical_cols] = scaled_values

In [9]:
# Separate the identifier column from the numeric feature columns.
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.drop(columns=['CustomerID'])

# Pairwise cosine similarity between all rows of the feature matrix;
# entry [i, j] compares the customer in row i with the customer in row j.
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
# Build lookalike recommendations: for each customer, the N_LOOKALIKES OTHER
# customers with the highest cosine similarity, stored as (CustomerID, score)
# pairs in descending score order.
N_LOOKALIKES = 3
# Plain array of IDs so lookups below are positional and do not depend on the
# Series index happening to be 0..n-1.
id_values = customer_ids.to_numpy()

lookalike_data = {}
for idx, customer_id in enumerate(id_values):
    scores = similarity_matrix[idx]
    # Rank every row by descending similarity. The stable sort makes ties
    # deterministic (lower row position first).
    ranked = np.argsort(-scores, kind='stable')
    # Remove the customer's own row explicitly. Simply skipping the first
    # ranked entry is wrong whenever another customer ties with (or, through
    # floating-point rounding, edges out) the self-similarity: the customer
    # would then be listed as their own lookalike and a real neighbour lost.
    neighbours = ranked[ranked != idx][:N_LOOKALIKES]
    lookalike_data[customer_id] = [(id_values[i], scores[i]) for i in neighbours]

In [11]:
# Keep only the recommendations for customers C0001 through C0020.
# The target IDs are built once as a set; building the list inside the
# comprehension's condition would recreate and linearly scan it per customer.
target_ids = {f"C{i:04d}" for i in range(1, 21)}
lookalike_filtered = {cid: lookalike_data[cid] for cid in customer_ids if cid in target_ids}

In [12]:
# Flatten the {customer: [(similar_id, score), ...]} mapping into one row per
# (customer, lookalike) pair and save the results to Lookalike.csv
lookalike_list = [
    [cid, similar_id, score]
    for cid, recommendations in lookalike_filtered.items()
    for similar_id, score in recommendations
]
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed and results saved to Lookalike.csv.")

Lookalike model completed and results saved to Lookalike.csv.
