In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.preprocessing import StandardScaler


## Attributes for determining similarity between customers
####    -Recency(Time (in days) since last transaction)
####    -Frequency(No of transactions)
####    -Monetary Value(Total money spent)
####    -Region
####    -Tenure(No of days since the customer signed up)
####    -Average order value
####    -Transaction Diversity(no of differient products bought)

# Load data




In [3]:
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")


In [5]:
data = customers.merge(transactions, on='CustomerID')

In [6]:
data['SignupDate'] = pd.to_datetime(data['SignupDate'])
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])

In [7]:
data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64


In [12]:
customer_data = data.groupby('CustomerID').agg(
    DifferentProducts = ('ProductID',pd.Series.nunique),
    Frequency=('TransactionID',pd.Series.nunique),
    TotalSpent=('TotalValue','sum'),
    Region=('Region','first'),
    LastTransaction=('TransactionDate','max'),
).reset_index()

customer_data['AverageOrderValue'] = customer_data['TotalSpent'] / customer_data['Frequency']
tenure_data = (pd.Timestamp.now() - data.groupby('CustomerID')['SignupDate'].first()).dt.days.reset_index()
tenure_data.columns = ['CustomerID', 'Tenure']
customer_data = pd.merge(customer_data, tenure_data, on='CustomerID')

In [13]:
customer_data['Regency'] = (pd.Timestamp.now()-customer_data['LastTransaction']).dt.days
customer_data.drop('LastTransaction',axis = 1,inplace=True)

In [14]:
customer_data.head()

Unnamed: 0,CustomerID,DifferentProducts,Frequency,TotalSpent,Region,AverageOrderValue,Tenure,Regency
0,C0001,5,5,3354.52,South America,670.904,938,91
1,C0002,4,4,1862.74,Asia,465.685,1085,61
2,C0003,4,4,2725.38,South America,681.345,332,161
3,C0004,8,8,5354.88,South America,669.36,847,40
4,C0005,3,3,2034.24,Asia,678.08,902,90


# Feature Engineering
    -Encoding the aggrigates

In [15]:
#Encoding the Region
encoder = OneHotEncoder(sparse_output=False)
values = encoder.fit_transform(customer_data[['Region']])
encoded_df = pd.DataFrame(values,columns=encoder.get_feature_names_out(['Region']))
customer_data = pd.concat([customer_data.drop('Region',axis=1), encoded_df],axis = 1)

In [16]:
customer_data.head()


Unnamed: 0,CustomerID,DifferentProducts,Frequency,TotalSpent,AverageOrderValue,Tenure,Regency,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,5,5,3354.52,670.904,938,91,0.0,0.0,0.0,1.0
1,C0002,4,4,1862.74,465.685,1085,61,1.0,0.0,0.0,0.0
2,C0003,4,4,2725.38,681.345,332,161,0.0,0.0,0.0,1.0
3,C0004,8,8,5354.88,669.36,847,40,0.0,0.0,0.0,1.0
4,C0005,3,3,2034.24,678.08,902,90,1.0,0.0,0.0,0.0


In [18]:
features = customer_data.drop(['CustomerID'],axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Fitting the seeds with a similarity function

In [19]:
similarity_matrix = cosine_similarity(scaled_features)#using cosine similarity


calculating top 3 look alikes for first 20 members i.e C001-C020

In [20]:
predictions = {}
for idx, cust_id in enumerate(customers['CustomerID'][:20]):
    sim_scores = similarity_matrix[idx]
    similar_indices = sim_scores.argsort()[::-1][1:4]  # Exclude self (highest similarity)
    prediction = [(customers['CustomerID'][i], sim_scores[i]) for i in similar_indices]
    predictions[cust_id] = prediction



In [21]:
print(predictions)

{'C0001': [('C0152', 0.9917852894572377), ('C0107', 0.945496697125453), ('C0174', 0.9313937115698868)], 'C0002': [('C0134', 0.9621203308045908), ('C0106', 0.9284638998541984), ('C0159', 0.9110165642083544)], 'C0003': [('C0052', 0.9708910685877357), ('C0129', 0.9063487821595342), ('C0085', 0.8679702447960002)], 'C0004': [('C0113', 0.968062693420116), ('C0099', 0.9505367697834806), ('C0155', 0.9474299166189872)], 'C0005': [('C0159', 0.9813749015934503), ('C0027', 0.9599321513899808), ('C0002', 0.906252003680838)], 'C0006': [('C0158', 0.9071773036414281), ('C0186', 0.9048025855825665), ('C0148', 0.8896970446500475)], 'C0007': [('C0005', 0.9007774081645233), ('C0027', 0.8894934651164457), ('C0040', 0.8718960219841274)], 'C0008': [('C0098', 0.9183453269638685), ('C0156', 0.8864565907074289), ('C0109', 0.8859520168528142)], 'C0009': [('C0121', 0.9738457731322073), ('C0010', 0.9150733918601853), ('C0060', 0.9114952466885554)], 'C0010': [('C0198', 0.9877784034015694), ('C0009', 0.9150733918601

Storing the prediction values in lookalike.csv

In [22]:
lookalike_df = pd.DataFrame({
    'CustomerID': [cust for cust in predictions.keys()],
    'Lookalike1': [l[0][0] for l in predictions.values()],
    'Score1': [l[0][1] for l in predictions.values()],
    'Lookalike2': [l[1][0] for l in predictions.values()],
    'Score2': [l[1][1] for l in predictions.values()],
    'Lookalike3': [l[2][0] for l in predictions.values()],
    'Score3': [l[2][1] for l in predictions.values()],
})
lookalike_df.to_csv('Lookalike.csv', index=False)



In [23]:
result = pd.read_csv('Lookalike.csv')
result.head(20)

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0152,0.991785,C0107,0.945497,C0174,0.931394
1,C0002,C0134,0.96212,C0106,0.928464,C0159,0.911017
2,C0003,C0052,0.970891,C0129,0.906349,C0085,0.86797
3,C0004,C0113,0.968063,C0099,0.950537,C0155,0.94743
4,C0005,C0159,0.981375,C0027,0.959932,C0002,0.906252
5,C0006,C0158,0.907177,C0186,0.904803,C0148,0.889697
6,C0007,C0005,0.900777,C0027,0.889493,C0040,0.871896
7,C0008,C0098,0.918345,C0156,0.886457,C0109,0.885952
8,C0009,C0121,0.973846,C0010,0.915073,C0060,0.911495
9,C0010,C0198,0.987778,C0009,0.915073,C0121,0.908939
