Import Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


Load and Prepare Data

In [3]:
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [4]:
transactions_products = pd.merge(transactions, products, on='ProductID', how='left')
full_data = pd.merge(transactions_products, customers, on='CustomerID', how='left')

In [5]:
customer_profiles = full_data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],  # Total and average spend
    'TransactionID': 'count',       # Total transaction count
    'Category': lambda x: x.mode()[0],  # Most preferred category
    'TransactionDate': 'max'        # Last transaction date for recency
}).reset_index()

In [6]:
customer_profiles.columns = ['CustomerID', 'TotalSpend', 'AverageSpend', 'TransactionCount', 'PreferredCategory', 'LastTransactionDate']
customer_profiles['LastTransactionDate'] = pd.to_datetime(customer_profiles['LastTransactionDate'])
customer_profiles['Recency'] = (pd.to_datetime('today') - customer_profiles['LastTransactionDate']).dt.days
customer_profiles = pd.get_dummies(customer_profiles, columns=['PreferredCategory'], prefix='Category')


In [7]:
customer_profiles['Frequency'] = customer_profiles['TransactionCount'] / customer_profiles['Recency']

Compute Similarity

In [8]:
scaler = StandardScaler()
features = customer_profiles.drop(['CustomerID', 'LastTransactionDate'], axis=1)
scaled_features = scaler.fit_transform(features)


In [9]:
pca = PCA(n_components=0.95)  
reduced_features = pca.fit_transform(scaled_features)

In [10]:
similarity_matrix = cosine_similarity(reduced_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles['CustomerID'], columns=customer_profiles['CustomerID'])


In [11]:
lookalike_results = {}

for customer_id in customer_profiles['CustomerID'][:20]:  # First 20 customers
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]  # Top 3 lookalikes
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))


Save to CSV

In [12]:
lookalike_df = pd.DataFrame({
    'cust_id': lookalike_results.keys(),
    'lookalike_list': lookalike_results.values()
})
lookalike_df.to_csv('LookalikeImproved.csv', index=False)

In [13]:
print("Lookalike recommendations saved in 'LookalikeImproved.csv'")
lookalike_df.head()

Lookalike recommendations saved in 'LookalikeImproved.csv'


Unnamed: 0,cust_id,lookalike_list
0,C0001,"[(C0072, 0.9863938573024578), (C0039, 0.948284..."
1,C0002,"[(C0010, 0.9894837994036254), (C0029, 0.966585..."
2,C0003,"[(C0052, 0.9929435059730934), (C0160, 0.969243..."
3,C0004,"[(C0101, 0.9977380282922208), (C0075, 0.985137..."
4,C0005,"[(C0186, 0.985385214782424), (C0095, 0.9657818..."
