Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

Load Dataset

In [2]:
# Directory that holds the three input CSV files. Defaults to the location the
# notebook was originally run from; change this single value to point elsewhere
# instead of editing every path.
DATA_DIR = '/content'

# One frame per input file: customers, products and transactions.
customer = pd.read_csv(f'{DATA_DIR}/Customers.csv')
product = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [3]:
# Preview the first rows of the customer table.
customer.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
# Preview the first rows of the product table.
product.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [5]:
# Preview the first rows of the transaction table.
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


Merge Datasets

In [6]:
# Enrich every transaction with its customer's attributes, then with its
# product's attributes. Both joins are left joins driven by the transactions,
# so every transaction row is kept.
transactions_with_customer = transactions.merge(customer, how='left', on='CustomerID')
data = transactions_with_customer.merge(product, how='left', on='ProductID')


In [7]:
# Inspect the merged frame; 'Price' appears twice, as Price_x and Price_y.
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [8]:
# The merge left two price columns: Price_x (from the transactions) and
# Price_y (from the products). Keep the transaction-side one under its
# original name 'Price'.
# errors='ignore' makes the cell safe to re-run: on a second execution
# 'Price_y' is already gone and the rename is a no-op, instead of raising
# a KeyError.
data = (
    data
    .drop(columns=['Price_y'], errors='ignore')
    .rename(columns={'Price_x': 'Price'})
)

In [9]:
# Confirm that a single 'Price' column remains after the clean-up.
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics


In [10]:
# Column dtypes and non-null counts of the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price            1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 93.9+ KB


In [11]:
# Summary statistics of the numeric columns.
data.describe()

Unnamed: 0,Quantity,TotalValue,Price
count,1000.0,1000.0,1000.0
mean,2.537,689.99556,272.55407
std,1.117981,493.144478,140.73639
min,1.0,16.08,16.08
25%,2.0,295.295,147.95
50%,3.0,588.88,299.93
75%,4.0,1011.66,404.4
max,4.0,1991.04,497.76


In [12]:
# Number of missing values per column.
data.isnull().sum()

Unnamed: 0,0
TransactionID,0
CustomerID,0
ProductID,0
TransactionDate,0
Quantity,0
TotalValue,0
Price,0
CustomerName,0
Region,0
SignupDate,0


Calculate category-wise quantities per customer

In [13]:

# Units bought per (customer, category) pair ...
units_per_customer_category = data.groupby(['CustomerID', 'Category'])['Quantity'].sum()

# ... pivoted to one row per customer and one column per category. A category
# the customer never bought gets 0, and CustomerID becomes a regular column.
category_quantity = units_per_customer_category.unstack(fill_value=0).reset_index()

In [14]:
# Preview the per-customer quantity bought in each category.
category_quantity.head()

Category,CustomerID,Books,Clothing,Electronics,Home Decor
0,C0001,2,0,7,3
1,C0002,0,4,0,6
2,C0003,0,4,4,6
3,C0004,8,0,6,9
4,C0005,0,0,4,3


General Customer features

In [15]:
def _first_mode(values):
    """Return the first entry of ``values.mode()`` for one customer's rows."""
    return values.mode()[0]


# Per-customer aggregates: total spend, total units, number of transactions
# (a count of TransactionID, which keeps that column name) and region.
customer_aggregations = {
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'TransactionID': 'count',
    'Region': _first_mode,
}
per_customer = data.groupby('CustomerID').agg(customer_aggregations)
general_features = per_customer.reset_index()

In [16]:
# Preview the per-customer aggregates; 'TransactionID' holds a transaction count here.
general_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,TransactionID,Region
0,C0001,3354.52,12,5,South America
1,C0002,1862.74,10,4,Asia
2,C0003,2725.38,14,4,South America
3,C0004,5354.88,23,8,South America
4,C0005,2034.24,7,3,Asia


In [17]:
# Attach the per-category quantities to the general aggregates and rename the
# summed 'Quantity' column to 'TotalQuantity'.
customer_features = (
    general_features
    .merge(category_quantity, how='left', on='CustomerID')
    .rename(columns={'Quantity': 'TotalQuantity'})
)

In [18]:
# Preview the combined per-customer feature table.
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,TotalQuantity,TransactionID,Region,Books,Clothing,Electronics,Home Decor
0,C0001,3354.52,12,5,South America,2,0,7,3
1,C0002,1862.74,10,4,Asia,0,4,0,6
2,C0003,2725.38,14,4,South America,0,4,4,6
3,C0004,5354.88,23,8,South America,8,0,6,9
4,C0005,2034.24,7,3,Asia,0,0,4,3


One Hot Encoding on 'Region' feature

In [19]:
# Replace the categorical 'Region' column with one indicator column per region.
categorical_columns = ['Region']
customer_features = pd.get_dummies(customer_features, columns=categorical_columns)


In [20]:
# Preview the features after encoding; 'Region' is now a set of Region_* columns.
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,TotalQuantity,TransactionID,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,3354.52,12,5,2,0,7,3,False,False,False,True
1,C0002,1862.74,10,4,0,4,0,6,True,False,False,False
2,C0003,2725.38,14,4,0,4,4,6,False,False,False,True
3,C0004,5354.88,23,8,8,0,6,9,False,False,False,True
4,C0005,2034.24,7,3,0,0,4,3,True,False,False,False


In [21]:
# Check dtypes before scaling: only the float64/int64 columns are selected for it.
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CustomerID            199 non-null    object 
 1   TotalValue            199 non-null    float64
 2   TotalQuantity         199 non-null    int64  
 3   TransactionID         199 non-null    int64  
 4   Books                 199 non-null    int64  
 5   Clothing              199 non-null    int64  
 6   Electronics           199 non-null    int64  
 7   Home Decor            199 non-null    int64  
 8   Region_Asia           199 non-null    bool   
 9   Region_Europe         199 non-null    bool   
 10  Region_North America  199 non-null    bool   
 11  Region_South America  199 non-null    bool   
dtypes: bool(4), float64(1), int64(6), object(1)
memory usage: 13.3+ KB


Normalize Features

In [22]:
# Only the float64/int64 columns are standardised; the object CustomerID column
# and the boolean Region_* indicators are left out by the dtype filter.
numeric_features = customer_features.select_dtypes(include=['float64', 'int64'])
columns_to_scale = numeric_features.columns

In [23]:
# Fit the scaler on the selected numeric columns and write the standardised
# values back over those same columns.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[columns_to_scale])
customer_features[columns_to_scale] = scaled_values

# Preview the scaled feature table.
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,TotalQuantity,TransactionID,Books,Clothing,Electronics,Home Decor,Region_Asia,Region_Europe,Region_North America,Region_South America
0,C0001,-0.061701,-0.122033,-0.011458,-0.464594,-0.963893,1.255863,-0.069051,False,False,False,True
1,C0002,-0.877744,-0.448,-0.467494,-1.117981,0.336546,-1.027971,0.912454,True,False,False,False
2,C0003,-0.405857,0.203934,-0.467494,-1.117981,0.336546,0.277077,0.912454,False,False,False,True
3,C0004,1.032547,1.670787,1.35665,1.495566,-0.963893,0.929601,1.893958,False,False,False,True
4,C0005,-0.783929,-0.936951,-0.92353,-1.117981,-0.963893,0.277077,-0.069051,True,False,False,False


We use the K-Nearest Neighbors (KNN) algorithm to find similar customers.
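We set k=3 to find the 3 nearest neighbors for each customer. The model is queried with k + 1 neighbors because each customer is also returned as its own neighbor and is excluded afterwards. The 'metric' used is Manhattan (L1) distance.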

In [24]:
# Number of lookalikes wanted per customer.
k = 3

# Every column except the identifier is used as a model feature.
feature_matrix = customer_features.drop(columns='CustomerID')

# Ask for k + 1 neighbours: the model is later queried with the same rows it was
# fitted on, so each customer can come back as its own neighbour and is
# filtered out downstream. Distances are Manhattan (L1).
knn = NearestNeighbors(metric='manhattan', n_neighbors=k + 1)
knn.fit(feature_matrix)

In [25]:
# Query the fitted model with the same feature rows it was trained on.
# Row r of `indices` / `distances` holds the neighbours found for the customer
# at row position r of customer_features.
query_matrix = customer_features.drop(columns='CustomerID')
distances, indices = knn.kneighbors(query_matrix)

Lookalike Mapping for first 20 customers

In [26]:
# Map each of the first 20 customers to its k most similar customers as a list
# of (CustomerID, distance) tuples.
lookalikes = {}

# kneighbors() returns row *positions*, so resolve IDs positionally rather than
# through the Series index labels, which only coincide with positions while
# the frame keeps a default RangeIndex.
customer_ids = customer_features['CustomerID'].to_numpy()

for idx, (neighbor_rows, neighbor_dists) in enumerate(zip(indices[:20], distances[:20])):
    similar_customers = [
        # float() turns the NumPy scalar into a plain Python float so the
        # str() serialisation written to the CSV later does not depend on the
        # NumPy version (NumPy >= 2 renders scalars as 'np.float64(2.88)').
        (customer_ids[i], round(float(dist), 2))
        for i, dist in zip(neighbor_rows, neighbor_dists)
        if i != idx  # Exclude the customer itself
    ][:k]  # keep k matches (previously a hard-coded 3 duplicating k)
    lookalikes[customer_ids[idx]] = similar_customers

In [27]:
# Print the whole mapping as plain text for a quick visual check.
print(lookalikes)

{'C0001': [('C0048', 2.88), ('C0152', 2.96), ('C0181', 3.23)], 'C0002': [('C0159', 1.55), ('C0178', 1.6), ('C0133', 3.05)], 'C0003': [('C0133', 2.21), ('C0031', 2.35), ('C0158', 2.99)], 'C0004': [('C0113', 2.75), ('C0012', 3.14), ('C0148', 3.87)], 'C0005': [('C0007', 0.79), ('C0140', 1.52), ('C0146', 2.55)], 'C0006': [('C0187', 2.46), ('C0048', 2.62), ('C0158', 3.06)], 'C0007': [('C0005', 0.79), ('C0140', 2.3), ('C0146', 2.42)], 'C0008': [('C0093', 4.81), ('C0046', 5.21), ('C0067', 5.51)], 'C0009': [('C0198', 0.48), ('C0061', 2.8), ('C0014', 3.02)], 'C0010': [('C0111', 3.86), ('C0176', 4.1), ('C0061', 4.27)], 'C0011': [('C0107', 1.85), ('C0190', 2.2), ('C0048', 2.51)], 'C0012': [('C0113', 2.8), ('C0148', 2.83), ('C0004', 3.14)], 'C0013': [('C0099', 3.62), ('C0155', 4.09), ('C0188', 4.49)], 'C0014': [('C0060', 0.13), ('C0198', 2.59), ('C0097', 2.59)], 'C0015': [('C0123', 2.13), ('C0131', 2.36), ('C0144', 2.69)], 'C0016': [('C0117', 1.52), ('C0029', 2.9), ('C0183', 2.95)], 'C0017': [('C0

In [28]:
# One row per target customer. The list of (CustomerID, distance) tuples is
# stored as its string representation so it fits in a single column.
target_ids = lookalikes.keys()
serialized_matches = list(map(str, lookalikes.values()))
lookalike_df = pd.DataFrame({
    'cust_id': target_ids,
    'similar_customers': serialized_matches,
})

In [29]:
# Preview the lookalike table before it is written to disk.
lookalike_df.head()

Unnamed: 0,cust_id,similar_customers
0,C0001,"[('C0048', 2.88), ('C0152', 2.96), ('C0181', 3..."
1,C0002,"[('C0159', 1.55), ('C0178', 1.6), ('C0133', 3...."
2,C0003,"[('C0133', 2.21), ('C0031', 2.35), ('C0158', 2..."
3,C0004,"[('C0113', 2.75), ('C0012', 3.14), ('C0148', 3..."
4,C0005,"[('C0007', 0.79), ('C0140', 1.52), ('C0146', 2..."


In [30]:
# Write the lookalike table to the working directory, without the row index.
output_path = 'Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)