In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Load the three raw tables (customers, products, transactions) from the
# local datasets/ folder, in that order.
customers, products, transactions = (
    pd.read_csv(csv_file)
    for csv_file in (
        'datasets/Customers.csv',
        'datasets/Products.csv',
        'datasets/Transactions.csv',
    )
)

In [11]:
# One row per transaction, enriched with the buyer's customer attributes and
# the purchased product's attributes. Left joins keep every transaction even
# when a lookup key has no match. The two suffixed Price columns produced by
# the second merge are dropped.
merged_df = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
    .drop(columns=['Price_x', 'Price_y'])
)

In [12]:
# Preview the first rows of the merged table, which combines transaction,
# customer and product information.
merged_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics


In [None]:
# Filter for CustomerIDs between C0001 and C0020
target_customer_ids = [f'C{i:04d}' for i in range(1, 21)]

# .copy() makes filtered_df an independent frame, so the column assignments
# below never write into a slice of merged_df (no SettingWithCopyWarning, and
# merged_df stays untouched).
filtered_df = merged_df[merged_df['CustomerID'].isin(target_customer_ids)].copy()

# Preprocessing for the filtered DataFrame:
# convert both date columns to Unix timestamps in (float) seconds.
# Subtracting the epoch and dividing by one second is independent of the
# datetime storage resolution, avoids the deprecated Series.view('int64'),
# and maps missing dates to NaN instead of a huge negative integer.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)
for date_col in ('TransactionDate', 'SignupDate'):
    filtered_df[date_col] = (pd.to_datetime(filtered_df[date_col]) - unix_epoch) / one_second

In [14]:
# Preview the filtered rows; the two date columns now hold Unix timestamps
# in seconds.
filtered_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,CustomerName,Region,SignupDate,ProductName,Category
7,T00536,C0008,P067,1726986000.0,1,300.68,David Li,North America,1705104000.0,ComfortLiving Bluetooth Speaker,Electronics
32,T00605,C0017,P057,1703993000.0,4,958.8,Jennifer King,Europe,1701734000.0,ActiveWear Smartphone,Electronics
37,T00004,C0004,P049,1721387000.0,4,591.8,Kathleen Rodriguez,South America,1665274000.0,TechPro Textbook,Books
39,T00136,C0010,P049,1708627000.0,2,295.9,Aaron Cox,Europe,1671062000.0,TechPro Textbook,Books
44,T00551,C0018,P049,1708695000.0,3,443.85,Tyler Haynes,North America,1726877000.0,TechPro Textbook,Books


In [None]:
# Feature engineering and transaction-level cosine similarity.
#
# ProductID, Region, Category and ProductName are nominal. LabelEncoder codes
# (0..k-1) carry no meaningful magnitude, and left unscaled next to the
# [0, 1]-scaled numeric columns they would dominate a cosine similarity.
# The similarity features therefore use one-hot indicators for these columns.
# The label-encoded and scaled values are still written back to filtered_df so
# the frame looks the same to any later cell that reads it.
categorical_features = ['ProductID', 'Region', 'Category', 'ProductName']
numerical_features = ['TransactionDate', 'Quantity', 'TotalValue', 'SignupDate']

# Work on an explicit copy so the assignments below never target a slice of
# another DataFrame.
filtered_df = filtered_df.copy()

# One-hot indicators, taken before the columns are overwritten with label
# codes. astype(str) makes get_dummies treat every value as a category, so
# re-running this cell on already-encoded integer codes yields the same
# indicator structure.
categorical_matrix = pd.get_dummies(
    filtered_df[categorical_features].astype(str), dtype=float
)

# Encoding categorical features (kept for inspection / inverse_transform)
label_encoders = {col: LabelEncoder().fit(filtered_df[col]) for col in categorical_features}
for col, le in label_encoders.items():
    filtered_df[col] = le.transform(filtered_df[col])

# Normalizing numerical features to [0, 1]
scaler = MinMaxScaler()
filtered_df[numerical_features] = scaler.fit_transform(filtered_df[numerical_features])

# Source columns feeding the feature matrix; the categorical ones enter as
# one-hot indicator columns rather than as integer codes.
feature_columns = ['ProductID', 'TransactionDate', 'Quantity', 'TotalValue', 'Region', 'SignupDate', 'ProductName', 'Category']
feature_matrix = pd.concat([filtered_df[numerical_features], categorical_matrix], axis=1)

# Calculating cosine similarity (one row/column per transaction in filtered_df)
similarity_matrix = cosine_similarity(feature_matrix)


In [16]:
# Build the lookalike table: each key of lookalike_dict is a customer id from
# C0001-C0020 that has at least one transaction, and each value is a list of
# (other_customer_id, similarity_score) tuples for its closest customers.
#
# similarity_matrix has one row per *transaction*, so a customer appears once
# per purchase. Ranking raw rows would let a customer's own other transactions
# show up as "lookalikes", could list the same neighbour several times, and
# would keep only the ranking of whichever transaction was processed last.
# The transaction-level scores are therefore averaged into a single score per
# customer pair before ranking.
TOP_N = 3  # number of lookalikes reported per customer

row_customer_ids = filtered_df['CustomerID'].to_numpy()
pairwise_scores = pd.DataFrame(
    similarity_matrix, index=row_customer_ids, columns=row_customer_ids
)
customer_similarity = (
    pairwise_scores
    .groupby(level=0).mean()   # average over each customer's rows
    .T
    .groupby(level=0).mean()   # then over each customer's columns
)

lookalike_dict = {}
for customer_id, scores in customer_similarity.iterrows():
    # Exclude the customer itself, then keep the TOP_N highest scores
    top_matches = scores.drop(labels=customer_id).nlargest(TOP_N)
    # Plain Python floats keep the stringified list readable regardless of
    # how the installed NumPy version prints its scalar types.
    lookalike_dict[customer_id] = [
        (other_id, float(score)) for other_id, score in top_matches.items()
    ]

# Convert the lookalike dictionary to a DataFrame for saving as CSV
lookalike_df = pd.DataFrame({
    'cust_id': list(lookalike_dict.keys()),
    'lookalikes': [str(matches) for matches in lookalike_dict.values()]
})

# Save to Lookalike.csv
csv_path = 'Mahesh_Reddy_Lookalike.csv'
lookalike_df.to_csv(csv_path, index=False)

print(f'Lookalike model for C0001 - C0020 saved to {csv_path}')


Lookalike model for C0001 - C0020 saved to Mahesh_Reddy_Lookalike.csv
