# import The packages

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Read the data

In [2]:
customers = pd.read_csv('Customers.csv')
transactions= pd.read_csv('Transactions.csv')
products= pd.read_csv('Products.csv')

# Merge

In [3]:
merged = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')

# Feature Engineering

In [4]:
# Aggregate transaction data
agg_features = merged.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_spend=('TotalValue', 'mean'),
    total_transactions=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

# Merge with customer profile data
customer_profiles = customers.merge(agg_features, on='CustomerID', how='left')

# Encode categorical variables
customer_profiles = pd.get_dummies(customer_profiles, columns=['Region'], drop_first=True)

# Fill missing values (if any)
customer_profiles.fillna(0, inplace=True)

# Select only numeric columns for normalization
numeric_columns = customer_profiles.select_dtypes(include=['number']).columns

# Normalize numeric features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(customer_profiles[numeric_columns])

# Replace original columns with normalized values
customer_profiles[numeric_columns] = normalized_features

# Compute Similarities

In [5]:
similarity_matrix = cosine_similarity(customer_profiles[numeric_columns])
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])

In [6]:
similarity_df

CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.000000,0.993771,0.993545,0.980844,0.967662,0.952069,0.938465,0.922757,0.975904,0.989538,...,0.998474,0.997206,0.987682,0.976947,0.996745,0.904844,0.972884,0.953009,0.996036,0.984948
C0002,0.993771,1.000000,0.982452,0.982439,0.951485,0.922103,0.913053,0.945950,0.994122,0.999275,...,0.997591,0.998671,0.972084,0.985519,0.993597,0.864089,0.958917,0.943642,0.999576,0.962107
C0003,0.993545,0.982452,1.000000,0.952428,0.990019,0.978093,0.971230,0.875600,0.959088,0.974640,...,0.987046,0.990758,0.998789,0.947271,0.981177,0.942173,0.992744,0.979723,0.987465,0.994185
C0004,0.980844,0.982439,0.952428,1.000000,0.899988,0.879415,0.854183,0.974892,0.973892,0.984334,...,0.987666,0.977606,0.938947,0.997949,0.993353,0.814683,0.909251,0.877740,0.980030,0.940040
C0005,0.967662,0.951485,0.990019,0.899988,1.000000,0.991948,0.994070,0.802284,0.922415,0.939524,...,0.955355,0.965480,0.994402,0.893820,0.944182,0.970208,0.999670,0.995888,0.959702,0.986995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C0196,0.904844,0.864089,0.942173,0.814683,0.970208,0.990228,0.987844,0.673315,0.811866,0.844432,...,0.880665,0.888703,0.957270,0.795411,0.872003,1.000000,0.964363,0.956487,0.878277,0.962583
C0197,0.972884,0.958917,0.992744,0.909251,0.999670,0.989306,0.990949,0.816626,0.932031,0.947884,...,0.961948,0.971637,0.995869,0.904107,0.951084,0.964363,1.000000,0.995696,0.966416,0.987823
C0198,0.953009,0.943642,0.979723,0.877740,0.995888,0.979341,0.989405,0.785587,0.920676,0.932215,...,0.941793,0.956931,0.983519,0.876313,0.926219,0.956487,0.995696,1.000000,0.951458,0.969218
C0199,0.996036,0.999576,0.987465,0.980030,0.959702,0.932976,0.924247,0.937160,0.990956,0.997744,...,0.998255,0.999748,0.978502,0.981831,0.993959,0.878277,0.966416,0.951458,1.000000,0.969341


# Generate Lookalikes

In [7]:
lookalike_results = {}
customer_ids = customer_profiles['CustomerID'][:20]  #first 20 customer 
for customer_id in customer_ids:
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]  
    lookalike_results[customer_id] = [(similar_id, round(score, 4)) for similar_id, score in similar_customers.items()]

# save the result

In [8]:
lookalike_data = []
for cust_id, lookalikes in lookalike_results.items():
    lookalike_data.append({'cust_id': cust_id, 'lookalikes': lookalikes})

lookalike= pd.DataFrame(lookalike_data)

In [9]:
lookalike

Unnamed: 0,cust_id,lookalikes
0,C0001,"[(C0137, 1.0), (C0152, 1.0), (C0056, 0.9997)]"
1,C0002,"[(C0029, 0.9998), (C0199, 0.9996), (C0031, 0.9..."
2,C0003,"[(C0178, 1.0), (C0035, 0.9997), (C0133, 0.9996)]"
3,C0004,"[(C0021, 1.0), (C0124, 0.9993), (C0173, 0.9993)]"
4,C0005,"[(C0073, 1.0), (C0159, 1.0), (C0112, 0.9998)]"
5,C0006,"[(C0079, 1.0), (C0117, 0.9998), (C0158, 0.9983)]"
6,C0007,"[(C0085, 1.0), (C0120, 0.9998), (C0042, 0.9995)]"
7,C0008,"[(C0161, 0.9992), (C0098, 0.9992), (C0147, 0.9..."
8,C0009,"[(C0077, 0.9996), (C0025, 0.9984), (C0010, 0.9..."
9,C0010,"[(C0029, 0.9998), (C0025, 0.9994), (C0002, 0.9..."


In [11]:
lookalike.to_csv("Mayur_Bhagat_Lookalike.csv",index=False)