In [43]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [44]:
# Step 1: Load Datasets
# Read the three input CSV files (customers, products, transactions) from the
# working directory, one DataFrame per file.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [45]:
# Preview the first few records of the customers dataset.
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [46]:
# Preview the first few records of the products dataset.
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [47]:
# Preview the first few records of the transactions dataset.
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [48]:
# Inspect column dtypes and non-null counts of the raw transactions data
# (before any date conversion).
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


In [49]:
# Step 2: Preprocess Data
# Parse the date columns of both frames into pandas datetime values.
for frame, date_column in (
    (customers_df, 'SignupDate'),
    (transactions_df, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [50]:
# Re-check dtypes to confirm TransactionDate was converted by pd.to_datetime.
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   TransactionID    1000 non-null   object        
 1   CustomerID       1000 non-null   object        
 2   ProductID        1000 non-null   object        
 3   TransactionDate  1000 non-null   datetime64[ns]
 4   Quantity         1000 non-null   int64         
 5   TotalValue       1000 non-null   float64       
 6   Price            1000 non-null   float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 54.8+ KB


In [51]:
# Check dtypes and non-null counts of the customers data after the
# SignupDate conversion.
customers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   CustomerID    200 non-null    object        
 1   CustomerName  200 non-null    object        
 2   Region        200 non-null    object        
 3   SignupDate    200 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 6.4+ KB


In [52]:
# Converted customers_df['SignupDate'] and transactions_df['TransactionDate'] to a proper datetime format for consistency.

In [53]:
# Aggregate transactional data for customer profiles
# One row per CustomerID with:
#   TotalSpending    - sum of TotalValue
#   TransactionCount - number of TransactionID entries
#   UniqueProducts   - number of distinct ProductID values
customer_transactions = (
    transactions_df
    .groupby('CustomerID')
    .agg(
        TotalSpending=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        UniqueProducts=('ProductID', 'nunique'),
    )
    .reset_index()
)

In [54]:
# Show the per-customer aggregates built from the transactions.
print(customer_transactions)

    CustomerID  TotalSpending  TransactionCount  UniqueProducts
0        C0001        3354.52                 5               5
1        C0002        1862.74                 4               4
2        C0003        2725.38                 4               4
3        C0004        5354.88                 8               8
4        C0005        2034.24                 3               3
..         ...            ...               ...             ...
194      C0196        4982.88                 4               3
195      C0197        1928.65                 3               3
196      C0198         931.83                 2               2
197      C0199        1979.28                 4               4
198      C0200        4758.60                 5               5

[199 rows x 4 columns]


In [56]:
# Merge customer profiles with customer data
# Left join on CustomerID so that every customer row is kept, including
# customers that have no aggregated transactions.
customer_profiles = customers_df.merge(
    customer_transactions,
    how='left',
    on='CustomerID',
)

In [59]:
# Show the merged table: customer attributes plus aggregated transaction features.
print(customer_profiles)

    CustomerID        CustomerName         Region SignupDate  TotalSpending  \
0        C0001    Lawrence Carroll  South America 2022-07-10        3354.52   
1        C0002      Elizabeth Lutz           Asia 2022-02-13        1862.74   
2        C0003      Michael Rivera  South America 2024-03-07        2725.38   
3        C0004  Kathleen Rodriguez  South America 2022-10-09        5354.88   
4        C0005         Laura Weber           Asia 2022-08-15        2034.24   
..         ...                 ...            ...        ...            ...   
195      C0196         Laura Watts         Europe 2022-06-07        4982.88   
196      C0197    Christina Harvey         Europe 2023-03-21        1928.65   
197      C0198         Rebecca Ray         Europe 2022-02-27         931.83   
198      C0199      Andrea Jenkins         Europe 2022-12-03        1979.28   
199      C0200         Kelly Cross           Asia 2023-06-11        4758.60   

     TransactionCount  UniqueProducts  
0          

In [61]:
# Count missing values per column after the left merge.
missing_values = customer_profiles.isna().sum()
print(missing_values)

CustomerID          0
CustomerName        0
Region              0
SignupDate          0
TotalSpending       1
TransactionCount    1
UniqueProducts      1
dtype: int64


In [62]:
# Fill missing values for customers with no transactions
# Only the three aggregated feature columns are filled, each with 0.
zero_fill = {
    'TotalSpending': 0,
    'TransactionCount': 0,
    'UniqueProducts': 0,
}
customer_profiles = customer_profiles.fillna(zero_fill)

In [63]:
# Re-count missing values per column to confirm the fill left none behind.
missing_values = customer_profiles.isna().sum()
print(missing_values)

CustomerID          0
CustomerName        0
Region              0
SignupDate          0
TotalSpending       0
TransactionCount    0
UniqueProducts      0
dtype: int64


In [64]:
# Step 3: Normalize Numeric Features
# Min-max scale the three aggregated features and write the scaled values
# back into the same columns of customer_profiles.
numeric_features = ['TotalSpending', 'TransactionCount', 'UniqueProducts']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_features])
customer_profiles[numeric_features] = scaled_values

In [65]:
# Show the profiles after the numeric feature columns were min-max scaled.
print(customer_profiles)

    CustomerID        CustomerName         Region SignupDate  TotalSpending  \
0        C0001    Lawrence Carroll  South America 2022-07-10       0.314274   
1        C0002      Elizabeth Lutz           Asia 2022-02-13       0.174514   
2        C0003      Michael Rivera  South America 2024-03-07       0.255332   
3        C0004  Kathleen Rodriguez  South America 2022-10-09       0.501681   
4        C0005         Laura Weber           Asia 2022-08-15       0.190581   
..         ...                 ...            ...        ...            ...   
195      C0196         Laura Watts         Europe 2022-06-07       0.466830   
196      C0197    Christina Harvey         Europe 2023-03-21       0.180689   
197      C0198         Rebecca Ray         Europe 2022-02-27       0.087300   
198      C0199      Andrea Jenkins         Europe 2022-12-03       0.185432   
199      C0200         Kelly Cross           Asia 2023-06-11       0.445818   

     TransactionCount  UniqueProducts  
0          

In [66]:
# Step 4: Compute Similarity Matrix
# Take the first 20 customer rows and compute pairwise cosine similarity
# between their scaled numeric feature vectors.
# NOTE(review): similarities are computed only among these 20 rows, so
# lookalike candidates are limited to the subset itself - confirm this is
# intended rather than comparing against all customers.
subset_profiles = customer_profiles.head(20)
feature_matrix = subset_profiles[numeric_features].to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)

In [70]:
# Show the 20-customer subset used to build the similarity matrix.
print(subset_profiles)

   CustomerID        CustomerName         Region SignupDate  TotalSpending  \
0       C0001    Lawrence Carroll  South America 2022-07-10       0.314274   
1       C0002      Elizabeth Lutz           Asia 2022-02-13       0.174514   
2       C0003      Michael Rivera  South America 2024-03-07       0.255332   
3       C0004  Kathleen Rodriguez  South America 2022-10-09       0.501681   
4       C0005         Laura Weber           Asia 2022-08-15       0.190581   
5       C0006     Brittany Palmer  South America 2024-01-07       0.396067   
6       C0007         Paul Graves           Asia 2022-06-18       0.241695   
7       C0008            David Li  North America 2024-01-13       0.400193   
8       C0009           Joy Clark         Europe 2023-08-14       0.083990   
9       C0010           Aaron Cox         Europe 2022-12-15       0.160912   
10      C0011       Bryan Mathews  South America 2022-12-12       0.349452   
11      C0012           Kevin May  South America 2024-08-07     

In [76]:
# Step 5: Generate Lookalike Recommendations
# For every customer in the subset, keep the most similar *other* customers
# of the subset together with their cosine-similarity score (rounded to 3
# decimals). Result: {customer_id: [(other_customer_id, score), ...]}.
top_n = 3
lookalike_results = {}
subset_ids = subset_profiles['CustomerID'].tolist()
for idx, customer_id in enumerate(subset_ids):
    # Similarity of the current customer to every customer in the subset
    scores = similarity_matrix[idx]
    # Rank candidates from most to least similar. A stable sort makes the
    # order of tied scores deterministic (ties keep row order).
    ranked_indices = np.argsort(-scores, kind='stable')
    # Exclude the customer's own row explicitly. Assuming "self sorts first"
    # is unsafe: other customers can tie with the self-similarity (many
    # scores round to 1.0), and an all-zero feature row is expected to have
    # similarity 0 even with itself, so self would not be ranked first.
    similar_indices = [i for i in ranked_indices if i != idx][:top_n]
    # float(...) turns NumPy scalars into plain Python floats so the string
    # later written to the CSV reads "1.0" instead of "np.float64(1.0)"
    # (the scalar repr used by NumPy >= 2).
    similar_customers = [
        (subset_ids[i], float(round(scores[i], 3)))
        for i in similar_indices
    ]
    lookalike_results[customer_id] = similar_customers
    
    
    

In [77]:
# Step 6: Save Results to Lookalike.csv
# One output row per customer; the list of (cust_id, score) pairs is stored
# as its string representation in a single column.
lookalike_csv_path = 'Lookalike.csv'

lookalike_data = []
for cust_id, similar_customers in lookalike_results.items():
    lookalike_data.append(
        {"cust_id": cust_id, "List<cust_id,score>": str(similar_customers)}
    )

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv(lookalike_csv_path, index=False)

print(f"Lookalike results saved to {lookalike_csv_path}")


Lookalike results saved to Lookalike.csv


In [78]:
# Read the saved CSV back from disk to verify what was written.
# Note: this name differs from `lookalike_df` (the frame that was saved) only
# by its capital "L"; the two are separate variables.
Lookalike_df = pd.read_csv('Lookalike.csv')

In [79]:
# Display up to the first 20 rows of the reloaded lookalike table.
Lookalike_df.head(20)

Unnamed: 0,cust_id,"List<cust_id,score>"
0,C0001,"[('C0004', 1.0), ('C0005', 1.0), ('C0003', 1.0)]"
1,C0002,"[('C0010', 1.0), ('C0008', 1.0), ('C0019', 0.9..."
2,C0003,"[('C0005', 1.0), ('C0001', 1.0), ('C0004', 1.0)]"
3,C0004,"[('C0001', 1.0), ('C0005', 1.0), ('C0003', 1.0)]"
4,C0005,"[('C0003', 1.0), ('C0001', 1.0), ('C0004', 1.0)]"
5,C0006,"[('C0018', 0.999), ('C0020', 0.998), ('C0007',..."
6,C0007,"[('C0013', 1.0), ('C0020', 1.0), ('C0016', 0.9..."
7,C0008,"[('C0010', 1.0), ('C0002', 1.0), ('C0019', 0.9..."
8,C0009,"[('C0014', 1.0), ('C0019', 0.999), ('C0008', 0..."
9,C0010,"[('C0008', 1.0), ('C0002', 1.0), ('C0019', 0.9..."
