In [33]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np



# Load data




In [34]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [42]:
# Merge datasets
product_merged = transactions.merge(products, on='ProductID')
customer_data = product_merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Category': lambda x: x.mode()[0] if len(x) > 0 else None#As tie breaker it picks the first mode
}).reset_index().rename(columns={
    'TotalValue': 'TotalSpend',
    'TransactionID': 'TransactionCount',
    'Category': 'PreferredCategory'
})
customers = customers.merge(customer_data, on='CustomerID', how='left')
print(customers.head())


  CustomerID        CustomerName         Region  SignupDate  TotalSpend_x  \
0      C0001    Lawrence Carroll  South America  2022-07-10       3354.52   
1      C0002      Elizabeth Lutz           Asia  2022-02-13       1862.74   
2      C0003      Michael Rivera  South America  2024-03-07       2725.38   
3      C0004  Kathleen Rodriguez  South America  2022-10-09       5354.88   
4      C0005         Laura Weber           Asia  2022-08-15       2034.24   

   TransactionCount_x PreferredCategory_x  TotalSpend_y  TransactionCount_y  \
0                 5.0         Electronics       3354.52                 5.0   
1                 4.0            Clothing       1862.74                 4.0   
2                 4.0          Home Decor       2725.38                 4.0   
3                 8.0               Books       5354.88                 8.0   
4                 3.0         Electronics       2034.24                 3.0   

  PreferredCategory_y  
0         Electronics  
1            C

# Feature Engineering
    -Encoding the aggrigates

In [36]:
# Feature engineering
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(customers[['Region']]).toarray()
category_encoded = encoder.fit_transform(customers[['PreferredCategory']]).toarray()
scaler = MinMaxScaler()
spent = scaler.fit_transform(customers[['TotalSpend']].fillna(0))



In [37]:
# Combine features into a matrix
features = np.hstack((region_encoded, category_encoded, spent))



# Fitting the seeds with a similarity function

In [38]:
similarity_matrix = cosine_similarity(features)#using cosine similarity


calculating top 3 look alikes for first 20 members i.e C001-C020

In [39]:
predictions = {}
for idx, cust_id in enumerate(customers['CustomerID'][:20]):
    sim_scores = similarity_matrix[idx]
    similar_indices = sim_scores.argsort()[::-1][1:4]  # Exclude self (highest similarity)
    prediction = [(customers['CustomerID'][i], sim_scores[i]) for i in similar_indices]
    predictions[cust_id] = prediction



Storing the prediction values in lookalike.csv

In [43]:
lookalike_df = pd.DataFrame({
    'CustomerID': [cust for cust in predictions.keys()],
    'Lookalike1': [l[0][0] for l in predictions.values()],
    'Score1': [l[0][1] for l in predictions.values()],
    'Lookalike2': [l[1][0] for l in predictions.values()],
    'Score2': [l[1][1] for l in predictions.values()],
    'Lookalike3': [l[2][0] for l in predictions.values()],
    'Score3': [l[2][1] for l in predictions.values()],
})
lookalike_df.to_csv('Lookalike.csv', index=False)



In [47]:
result = pd.read_csv('Lookalike.csv')
result.head(20)

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0181,1.0,C0184,0.999997,C0091,0.999906
1,C0002,C0088,0.999842,C0134,0.998602,C0106,0.997584
2,C0003,C0052,0.999883,C0076,0.999681,C0152,0.999117
3,C0004,C0155,0.999999,C0171,0.999923,C0168,0.999841
4,C0005,C0140,0.999955,C0186,0.999842,C0146,0.999397
5,C0006,C0169,0.999826,C0126,0.999787,C0187,0.999724
6,C0007,C0146,1.0,C0005,0.999377,C0115,0.999365
7,C0008,C0079,0.999999,C0160,0.996453,C0038,0.995106
8,C0009,C0198,0.999997,C0062,0.998806,C0010,0.998544
9,C0010,C0062,0.999987,C0111,0.999861,C0061,0.999624
