In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


In [3]:
# Load datasets
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Merge dataframes to create a unified dataset
data = pd.merge(transactions_df, customers_df, on="CustomerID", how="inner")
data = pd.merge(data, products_df, on="ProductID", how="inner")

In [4]:
data.drop(columns='Price_x', inplace=True)
data.rename(columns={"Price_y": "Product Price"}, inplace=True)

In [5]:
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category,Product Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
# One-hot encode categorical features
categorical_features = ["Region", "ProductName", "Category"]
encoder = OneHotEncoder(sparse_output=False)
encoded_features = pd.DataFrame(
    encoder.fit_transform(data[categorical_features]),
    columns=encoder.get_feature_names_out(categorical_features)
)

In [7]:
data.columns 

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Product Price'],
      dtype='object')

In [8]:
# Combine encoded features with numerical features
numerical_features = ["Quantity", "TotalValue", "Product Price"]
numerical_data = data[numerical_features].reset_index(drop=True)
combined_features = pd.concat([numerical_data, encoded_features], axis=1)

In [9]:
combined_features.head()

Unnamed: 0,Quantity,TotalValue,Product Price,Region_Asia,Region_Europe,Region_North America,Region_South America,ProductName_ActiveWear Biography,ProductName_ActiveWear Cookbook,ProductName_ActiveWear Cookware Set,...,ProductName_TechPro Rug,ProductName_TechPro Running Shoes,ProductName_TechPro Smartwatch,ProductName_TechPro T-Shirt,ProductName_TechPro Textbook,ProductName_TechPro Vase,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,1,300.68,300.68,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,300.68,300.68,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,300.68,300.68,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2,601.36,300.68,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,3,902.04,300.68,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
# Compute similarity scores for customers
def compute_customer_similarity(data, customer_id):
    customer_index = data[data["CustomerID"] == customer_id].index[0]
    similarity_scores = cosine_similarity(
        [combined_features.iloc[customer_index]], combined_features
    )[0]
    
    # Rank top 3 most similar customers
    top_similar_indices = np.argsort(similarity_scores)[::-1][1:4]
    top_customers = data.iloc[top_similar_indices]["CustomerID"].values
    top_scores = similarity_scores[top_similar_indices]

    return list(zip(top_customers, top_scores))

In [11]:
# Generate lookalike data for the first 20 customers
lookalike_data = {}
for customer_id in customers_df["CustomerID"].iloc[:20]:
    lookalike_data[customer_id] = compute_customer_similarity(data, customer_id)

In [12]:
pd.DataFrame(lookalike_data)

Unnamed: 0,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,C0011,C0012,C0013,C0014,C0015,C0016,C0017,C0018,C0019,C0020
0,"(C0165, 0.9999999999999999)","(C0177, 0.9999962475227601)","(C0012, 0.9999989650378566)","(C0112, 0.9999973815781583)","(C0162, 0.9999999999999999)","(C0171, 0.9999999999999997)","(C0090, 0.999997805520203)","(C0016, 0.9999951313801476)","(C0009, 1.0000000000000002)","(C0132, 0.9999999999999998)","(C0085, 1.0)","(C0012, 1.0)","(C0107, 0.9999993006099945)","(C0046, 0.9999921220914797)","(C0139, 0.9999999999999997)","(C0024, 0.9999974530645106)","(C0179, 0.9999989892179152)","(C0087, 0.9999954317879719)","(C0038, 0.9999695538728994)","(C0117, 1.0000000000000002)"
1,"(C0165, 0.9999999999999999)","(C0017, 0.999996127671267)","(C0158, 0.9999989650378566)","(C0102, 0.9999973815781583)","(C0162, 0.9999988046003704)","(C0071, 0.9999920187872187)","(C0173, 0.9999974874522678)","(C0109, 0.9999951254960043)","(C0103, 0.9999966973287835)","(C0010, 0.9999999999999998)","(C0147, 0.9999943327811323)","(C0104, 0.999999015506147)","(C0107, 0.9999992992035014)","(C0106, 0.9999921220914797)","(C0127, 0.9999979058580748)","(C0156, 0.9999974355755248)","(C0114, 0.9999989892179152)","(C0030, 0.99999411592467)","(C0171, 0.9999695538728994)","(C0050, 0.9999998237490774)"
2,"(C0001, 0.9999999999999999)","(C0106, 0.9999959424261735)","(C0039, 0.9999987815679727)","(C0102, 0.9999973815781583)","(C0136, 0.9999987815679727)","(C0113, 0.999991942185381)","(C0186, 0.9999974874522678)","(C0098, 0.9999951170930266)","(C0154, 0.9999818982751072)","(C0019, 0.9999915072483314)","(C0004, 0.9999942691687541)","(C0187, 0.999999015506147)","(C0099, 0.9999992992035014)","(C0128, 0.9999921220914797)","(C0008, 0.9999976761158293)","(C0016, 0.9999974355755248)","(C0105, 0.9999989838346195)","(C0109, 0.99999411592467)","(C0181, 0.9999695538728994)","(C0068, 0.999998981046303)"


In [13]:
# Save results to Lookalike.csv
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes}
    for cust_id, lookalikes in lookalike_data.items()
])
lookalike_df.to_csv("Anubhav_Prasad_Lookalike.csv", index=False)

print("Lookalike model complete. Results saved to Anubhav_Prasad_Lookalike.csv.")

Lookalike model complete. Results saved to Anubhav_Prasad_Lookalike.csv.
