In [1]:
! pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer,  LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
customers_df = pd.read_csv('dataset/Customers.csv')
products_df = pd.read_csv('dataset/Products.csv')
transactions_df = pd.read_csv('dataset/Transactions.csv')

In [4]:
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [7]:
df = pd.merge(transactions_df, customers_df[['CustomerID', 'Region']], on='CustomerID', how='inner')
df = pd.merge(df, products_df[['ProductID', 'ProductName', 'Category']], on='ProductID', how='inner')
df

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Region,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Asia,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,South America,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
...,...,...,...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86,South America,SoundWave Smartwatch,Electronics
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86,North America,SoundWave Smartwatch,Electronics
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86,North America,SoundWave Smartwatch,Electronics
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86,Asia,SoundWave Smartwatch,Electronics


In [8]:
df[df['CustomerID']=='C0199']

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Region,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Europe,ComfortLiving Bluetooth Speaker,Electronics
439,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,Europe,HomeSense Wall Art,Home Decor
918,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,Europe,ActiveWear Rug,Home Decor
940,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,Europe,BookWorld Bluetooth Speaker,Electronics


In [9]:
df[df['CustomerID']=='C0001']

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,Region,ProductName,Category
135,T00015,C0001,P054,2024-01-19 03:12:55,2,114.6,57.3,South America,SoundWave Cookbook,Books
442,T00932,C0001,P022,2024-09-17 09:01:18,3,412.62,137.54,South America,HomeSense Wall Art,Home Decor
546,T00085,C0001,P096,2024-04-08 00:01:00,2,614.94,307.47,South America,SoundWave Headphones,Electronics
724,T00445,C0001,P083,2024-05-07 03:11:44,2,911.44,455.72,South America,ActiveWear Smartwatch,Electronics
776,T00436,C0001,P029,2024-11-02 17:04:16,3,1300.92,433.64,South America,TechPro Headphones,Electronics


In [10]:
def safe_mode(series):
    mode = series.mode()
    return mode.tolist() if len(mode) > 0 else []

In [11]:
df = df.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'ProductID': pd.Series.nunique,
    'Category': safe_mode,
    'Region': lambda x: x.unique()[0]
}).reset_index()
df.head()

Unnamed: 0,CustomerID,TotalValue,TransactionID,ProductID,Category,Region
0,C0001,3354.52,5,5,[Electronics],South America
1,C0002,1862.74,4,4,"[Clothing, Home Decor]",Asia
2,C0003,2725.38,4,4,[Home Decor],South America
3,C0004,5354.88,8,8,"[Books, Home Decor]",South America
4,C0005,2034.24,3,3,[Electronics],Asia


In [12]:
df.rename(columns={
    'TotalValue': 'TotalSpend',
    'TransactionID': 'NumTransactions',
    'ProductID': 'NumUniqueProducts',
}, inplace=True)
df.head()

Unnamed: 0,CustomerID,TotalSpend,NumTransactions,NumUniqueProducts,Category,Region
0,C0001,3354.52,5,5,[Electronics],South America
1,C0002,1862.74,4,4,"[Clothing, Home Decor]",Asia
2,C0003,2725.38,4,4,[Home Decor],South America
3,C0004,5354.88,8,8,"[Books, Home Decor]",South America
4,C0005,2034.24,3,3,[Electronics],Asia


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         199 non-null    object 
 1   TotalSpend         199 non-null    float64
 2   NumTransactions    199 non-null    int64  
 3   NumUniqueProducts  199 non-null    int64  
 4   Category           199 non-null    object 
 5   Region             199 non-null    object 
dtypes: float64(1), int64(2), object(3)
memory usage: 9.5+ KB


In [14]:
label_encoder = LabelEncoder()
df['Region'] = label_encoder.fit_transform(df['Region'])

In [15]:
df.head()

Unnamed: 0,CustomerID,TotalSpend,NumTransactions,NumUniqueProducts,Category,Region
0,C0001,3354.52,5,5,[Electronics],3
1,C0002,1862.74,4,4,"[Clothing, Home Decor]",0
2,C0003,2725.38,4,4,[Home Decor],3
3,C0004,5354.88,8,8,"[Books, Home Decor]",3
4,C0005,2034.24,3,3,[Electronics],0


In [16]:
mlb = MultiLabelBinarizer()
category_encoded = mlb.fit_transform(df['Category'])
category_labels = mlb.classes_
category_labels

array(['Books', 'Clothing', 'Electronics', 'Home Decor'], dtype=object)

In [17]:
category_df = pd.DataFrame(category_encoded, columns=category_labels, index=df.index)
df = pd.concat([df.drop('Category', axis=1), category_df], axis=1)
df.head()

Unnamed: 0,CustomerID,TotalSpend,NumTransactions,NumUniqueProducts,Region,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,5,5,3,0,0,1,0
1,C0002,1862.74,4,4,0,0,1,0,1
2,C0003,2725.38,4,4,3,0,0,0,1
3,C0004,5354.88,8,8,3,1,0,0,1
4,C0005,2034.24,3,3,0,0,0,1,0


In [18]:
scaler = StandardScaler()
numerical_features = ['TotalSpend', 'NumTransactions', 'NumUniqueProducts']
df[numerical_features] = scaler.fit_transform(df[numerical_features])
df.head()

Unnamed: 0,CustomerID,TotalSpend,NumTransactions,NumUniqueProducts,Region,Books,Clothing,Electronics,Home Decor
0,C0001,-0.061701,-0.011458,0.050047,3,0,0,1,0
1,C0002,-0.877744,-0.467494,-0.424204,0,0,1,0,1
2,C0003,-0.405857,-0.467494,-0.424204,3,0,0,0,1
3,C0004,1.032547,1.35665,1.472798,3,1,0,0,1
4,C0005,-0.783929,-0.92353,-0.898455,0,0,0,1,0


In [19]:
feature_matrix = df.drop(['CustomerID'], axis=1)
feature_matrix.head()

Unnamed: 0,TotalSpend,NumTransactions,NumUniqueProducts,Region,Books,Clothing,Electronics,Home Decor
0,-0.061701,-0.011458,0.050047,3,0,0,1,0
1,-0.877744,-0.467494,-0.424204,0,0,1,0,1
2,-0.405857,-0.467494,-0.424204,3,0,0,0,1
3,1.032547,1.35665,1.472798,3,1,0,0,1
4,-0.783929,-0.92353,-0.898455,0,0,0,1,0


In [20]:
similarity_matrix = cosine_similarity(feature_matrix)

In [21]:
input_customer_id = 'C0001' 
customer_idx = df[df['CustomerID'] == input_customer_id].index[0]
similarities = similarity_matrix[customer_idx]
sorted_indices = similarities.argsort()[::-1][1:4]
similar_customer_ids = df.iloc[sorted_indices]['CustomerID'].tolist()
similarity_scores = similarities[sorted_indices].tolist()

for customer_id, score in zip(similar_customer_ids, similarity_scores):
    print(f"Customer ID: {customer_id}, Similarity Score: {score:.3f}")


Customer ID: C0190, Similarity Score: 0.998
Customer ID: C0048, Similarity Score: 0.996
Customer ID: C0091, Similarity Score: 0.989


In [22]:
lookalike_data = []

for customer_id in df['CustomerID'][:20]:
    customer_idx = df[df['CustomerID'] == customer_id].index[0]
    similarities = similarity_matrix[customer_idx]
    sorted_indices = similarities.argsort()[::-1][1:4] 
    similar_customer_ids = df.iloc[sorted_indices]['CustomerID'].tolist()
    similarity_scores = similarities[sorted_indices].tolist()
    similar_customers_dict = {similar_customer_ids[i]: round(similarity_scores[i], 3) for i in range(len(similar_customer_ids))}
    lookalike_data.append([customer_id, str(similar_customers_dict)])


In [23]:
lookalike_data[:2]

[['C0001', "{'C0190': 0.998, 'C0048': 0.996, 'C0091': 0.989}"],
 ['C0002', "{'C0043': 0.881, 'C0106': 0.87, 'C0178': 0.79}"]]

In [24]:
formatted_data = {}

for customer_id, similar_customers_str in lookalike_data:
    similar_customers_dict = eval(similar_customers_str)
    formatted_data[customer_id] = [{"cust_id": k, "score": v} for k, v in similar_customers_dict.items()]

In [25]:
formatted_data

{'C0001': [{'cust_id': 'C0190', 'score': 0.998},
  {'cust_id': 'C0048', 'score': 0.996},
  {'cust_id': 'C0091', 'score': 0.989}],
 'C0002': [{'cust_id': 'C0043', 'score': 0.881},
  {'cust_id': 'C0106', 'score': 0.87},
  {'cust_id': 'C0178', 'score': 0.79}],
 'C0003': [{'cust_id': 'C0031', 'score': 0.993},
  {'cust_id': 'C0052', 'score': 0.99},
  {'cust_id': 'C0158', 'score': 0.986}],
 'C0004': [{'cust_id': 'C0113', 'score': 0.959},
  {'cust_id': 'C0104', 'score': 0.956},
  {'cust_id': 'C0122', 'score': 0.935}],
 'C0005': [{'cust_id': 'C0186', 'score': 0.997},
  {'cust_id': 'C0007', 'score': 0.987},
  {'cust_id': 'C0146', 'score': 0.948}],
 'C0006': [{'cust_id': 'C0011', 'score': 0.976},
  {'cust_id': 'C0171', 'score': 0.968},
  {'cust_id': 'C0137', 'score': 0.968}],
 'C0007': [{'cust_id': 'C0005', 'score': 0.987},
  {'cust_id': 'C0115', 'score': 0.984},
  {'cust_id': 'C0186', 'score': 0.974}],
 'C0008': [{'cust_id': 'C0124', 'score': 0.951},
  {'cust_id': 'C0109', 'score': 0.93},
  {'c

In [26]:
lookalike_df = pd.DataFrame()
lookalikes = []
for key, val in formatted_data.items():
    item = key + ":" + str(val)
    lookalikes.append(item)
lookalike_df['Lookalikes'] = lookalikes
lookalike_df.head()

Unnamed: 0,Lookalikes
0,"C0001:[{'cust_id': 'C0190', 'score': 0.998}, {..."
1,"C0002:[{'cust_id': 'C0043', 'score': 0.881}, {..."
2,"C0003:[{'cust_id': 'C0031', 'score': 0.993}, {..."
3,"C0004:[{'cust_id': 'C0113', 'score': 0.959}, {..."
4,"C0005:[{'cust_id': 'C0186', 'score': 0.997}, {..."


In [27]:
lookalike_df.to_csv('Lookalikes.csv', index=False)

In [28]:
lookalike_df.head()

Unnamed: 0,Lookalikes
0,"C0001:[{'cust_id': 'C0190', 'score': 0.998}, {..."
1,"C0002:[{'cust_id': 'C0043', 'score': 0.881}, {..."
2,"C0003:[{'cust_id': 'C0031', 'score': 0.993}, {..."
3,"C0004:[{'cust_id': 'C0113', 'score': 0.959}, {..."
4,"C0005:[{'cust_id': 'C0186', 'score': 0.997}, {..."
