# Look Alike Model
    -Using selected attributes from the given data set (those that characterise customer shopping behaviour), lookalikes are predicted for each customer.

    -Each customer/seed's attributes are scaled to eliminate any bias caused by differing value ranges in the data. A MinMax scaler is employed for this purpose.
    
    -The cosine similarity function gives us the similarity between two customers.

In [32]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity


## Attributes for determining similarity between customers
    -Total amount spent overall

    -Region

    -Average amounts spent on each category

    -Frequency at which the category product is bought

# Load data




In [42]:
# Read the three input tables from the working directory in one pass; the
# tuple order matches the unpacking order on the left-hand side.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Transactions.csv", "Products.csv")
)


## Preparing the data and deriving necessary attributes

In [44]:
# Join customers -> transactions -> products into one row per transaction.
# merge() defaults to an inner join, so customers without any transaction
# do not appear in `data`.
data = (
    customers
    .merge(transactions, on='CustomerID')
    .merge(products, on='ProductID')
)

In [54]:
# Parse both date columns into pandas datetimes.
for date_col in ('SignupDate', 'TransactionDate'):
    data[date_col] = pd.to_datetime(data[date_col])

# The merge left two price columns (Price_x / Price_y); drop Price_x and
# keep Price_y.
data = data.drop(columns='Price_x')

In [55]:
# Preview the first rows of the merged, cleaned transaction-level frame.
data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,TechPro Headphones,Electronics,433.64


In [160]:
# One row per customer: overall spend (sum of TotalValue) and the customer's
# region (first value seen in the group). as_index=False keeps CustomerID as
# a regular column.
customer_data = data.groupby('CustomerID', as_index=False).agg(
    TotalSpent=('TotalValue', 'sum'),
    Region=('Region', 'first'),
)

In [165]:
# Number of transactions per (customer, category) pair ...
purchase_counts = data.groupby(['CustomerID', 'Category']).size()
# ... spread into one column per category, with 0 where a customer never
# bought from that category. CustomerID is restored as a regular column.
categ_freq = purchase_counts.unstack(fill_value=0).reset_index()

In [162]:
# Preview the per-customer purchase counts by category.
categ_freq.head()

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,1,0,3,1
1,C0002,0,2,0,2
2,C0003,0,1,1,2
3,C0004,3,0,2,3
4,C0005,0,0,2,1


In [163]:
# Mean TotalValue per (customer, category), reshaped to one row per customer
# and one column per category. Categories a customer never bought from are
# filled with 0. The result is indexed by CustomerID.
catg_avg = (
    data.groupby(['CustomerID', 'Category'])['TotalValue']
    .mean()
    .unstack()
    .fillna(0)
)

In [166]:
# Preview the per-customer average transaction value by category.
catg_avg.head()

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,114.6,0.0,942.433333,412.62
C0002,0.0,512.73,0.0,418.64
C0003,0.0,122.36,1385.2,608.91
C0004,629.493333,0.0,677.87,703.553333
C0005,0.0,0.0,590.19,853.86


In [None]:
# Attach the per-category aggregates to the customer table. Both aggregate
# frames use the category names as column names, so the second merge applies
# pandas' default suffixes: "<Category>_x" is the average amount (catg_avg)
# and "<Category>_y" is the purchase count (categ_freq).
customer_data = (
    customer_data
    .merge(catg_avg, on='CustomerID')
    .merge(categ_freq, on='CustomerID')
)

In [168]:
# Preview the combined per-customer feature table (before encoding Region).
customer_data.head()

Unnamed: 0,CustomerID,TotalSpent,Region,Books_x,Clothing_x,Electronics_x,Home Decor_x,Books_y,Clothing_y,Electronics_y,Home Decor_y
0,C0001,3354.52,South America,114.6,0.0,942.433333,412.62,1,0,3,1
1,C0002,1862.74,Asia,0.0,512.73,0.0,418.64,0,2,0,2
2,C0003,2725.38,South America,0.0,122.36,1385.2,608.91,0,1,1,2
3,C0004,5354.88,South America,629.493333,0.0,677.87,703.553333,3,0,2,3
4,C0005,2034.24,Asia,0.0,0.0,590.19,853.86,0,0,2,1


# Feature Engineering
    -Encoding the aggregates

### Note that each product category is encoded by how many times the customer bought from it (frequency encoding)
### Region is being encoded using One hot encoding

In [169]:
# One-hot encode the Region column and replace it with the indicator columns.
encoder = OneHotEncoder(sparse_output=False)
values = encoder.fit_transform(customer_data[['Region']])
# Build the encoded frame on customer_data's own index: pd.concat(axis=1)
# aligns rows by index label, not by position, so a mismatched index would
# silently yield NaN-padded rows instead of raising an error.
encoded_df = pd.DataFrame(
    values,
    columns=encoder.get_feature_names_out(['Region']),
    index=customer_data.index,
)
customer_data = pd.concat(
    [customer_data.drop(columns='Region'), encoded_df],
    axis=1,
)

In [170]:
# Preview the feature table after Region has been one-hot encoded.
customer_data.head()


Unnamed: 0,CustomerID,TotalSpent,Books_x,Clothing_x,Electronics_x,Home Decor_x,Books_y,Clothing_y,Electronics_y,Home Decor_y,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,114.6,0.0,942.433333,412.62,1,0,3,1,0.0,0.0,0.0,1.0
1,C0002,1862.74,0.0,512.73,0.0,418.64,0,2,0,2,1.0,0.0,0.0,0.0
2,C0003,2725.38,0.0,122.36,1385.2,608.91,0,1,1,2,0.0,0.0,0.0,1.0
3,C0004,5354.88,629.493333,0.0,677.87,703.553333,3,0,2,3,0.0,0.0,0.0,1.0
4,C0005,2034.24,0.0,0.0,590.19,853.86,0,0,2,1,1.0,0.0,0.0,0.0


## Scaling the aggregates
    -A Min-Max scaler is used for scaling the aggregate data. Each feature is rescaled to the range between its minimum and maximum, assigning 1 to the maximum and 0 to the minimum.

In [171]:
# Every column except the identifier is a model feature.
features = customer_data.drop(columns=['CustomerID'])

# Rescale each feature column with a min-max scaler so that columns with
# large magnitudes (e.g. TotalSpent) do not dominate the similarity.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# Fitting the seeds with a similarity function

In [172]:
# Pairwise cosine similarity between all rows of scaled_features: entry
# [i, j] compares the customer in row i with the customer in row j, in the
# row order of customer_data.
similarity_matrix = cosine_similarity(scaled_features)  # using cosine similarity


## Predicting look alikes
Calculating the top 3 lookalikes for the first 20 customers, i.e. C0001-C0020

In [173]:
# Row i of similarity_matrix corresponds to row i of customer_data (the frame
# the features were built from), NOT to row i of the raw customers table:
# customer_data only holds customers that survived the inner merge with
# transactions, in groupby order. IDs are therefore resolved via
# customer_data so that scores are never attributed to the wrong customer.
customer_ids = customer_data['CustomerID'].to_numpy()
row_of_customer = {cid: row for row, cid in enumerate(customer_ids)}

predictions = {}
for cust_id in customers['CustomerID'][:20]:
    idx = row_of_customer.get(cust_id)
    if idx is None:
        # No transactions means no feature vector; keep the seed in the
        # result with an empty lookalike list rather than guessing.
        predictions[cust_id] = []
        continue
    score = similarity_matrix[idx]
    ranked = score.argsort()[::-1]  # most similar first
    # Remove the seed by position instead of assuming it ranks first: a
    # customer with an identical profile ties with the self-similarity and
    # the tie order of argsort is not guaranteed.
    similar_indices = ranked[ranked != idx][:3]
    prediction = [(customer_ids[i], score[i]) for i in similar_indices]
    predictions[cust_id] = prediction



In [174]:
# Show the mapping seed CustomerID -> list of (lookalike CustomerID, score).
print(predictions)

{'C0001': [('C0091', 0.9555186187196776), ('C0120', 0.9547711520124221), ('C0180', 0.9542143871795834)], 'C0002': [('C0134', 0.9837741875553733), ('C0106', 0.9658858481492147), ('C0159', 0.9390356095065153)], 'C0003': [('C0152', 0.967193439954941), ('C0031', 0.9651540773344783), ('C0163', 0.9544598049674475)], 'C0004': [('C0113', 0.9601084189019174), ('C0118', 0.9295909938306436), ('C0152', 0.9215619151860368)], 'C0005': [('C0007', 0.9875928878133334), ('C0110', 0.9139118415774714), ('C0080', 0.9125126704192681)], 'C0006': [('C0169', 0.9463171631893499), ('C0039', 0.9191938053497919), ('C0158', 0.9129523610580749)], 'C0007': [('C0005', 0.9875928878133334), ('C0080', 0.917968692667288), ('C0110', 0.9145694487873303)], 'C0008': [('C0098', 0.9470422554241033), ('C0024', 0.9322682261583559), ('C0059', 0.9220393507194821)], 'C0009': [('C0111', 0.9260557108495533), ('C0010', 0.922664610323015), ('C0062', 0.9201702410436783)], 'C0010': [('C0111', 0.9474402520034266), ('C0062', 0.9448530632360

Storing the prediction values in Madhav_NLV_Lookalike.csv

In [176]:
# Flatten {seed: [(match, score), ...]} into one (seed, match, score) row per
# lookalike pair.
lookalike = [
    (seed_id, match_id, match_score)
    for seed_id, matches in predictions.items()
    for match_id, match_score in matches
]
lookalike_df = pd.DataFrame(
    lookalike,
    columns=["CustomerID", "LookalikeID", "Score"],
)
# Persist without the DataFrame index so the file has exactly three columns.
lookalike_df.to_csv('Madhav_NLV_Lookalike.csv', index=False)


## Results

In [177]:
# Read the saved lookalike file back in and display its first 20 rows as a
# check on what was written.
result = pd.read_csv('Madhav_NLV_Lookalike.csv')
result.head(20)

Unnamed: 0,CustomerID,LookalikeID,Score
0,C0001,C0091,0.955519
1,C0001,C0120,0.954771
2,C0001,C0180,0.954214
3,C0002,C0134,0.983774
4,C0002,C0106,0.965886
5,C0002,C0159,0.939036
6,C0003,C0152,0.967193
7,C0003,C0031,0.965154
8,C0003,C0163,0.95446
9,C0004,C0113,0.960108
