In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Load data
# Read the three input tables from CSV files in the working directory,
# in the order customers -> products -> transactions.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [19]:
# Preview the first five product rows.
products.head(n=5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [18]:
# Preview the first five transaction rows.
transactions.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [5]:
# Preprocess Data
# Collapse the transaction log to one row per customer, summing
# TotalValue and Quantity across all of that customer's transactions.
customer_transactions = (
    transactions
    .groupby('CustomerID')[['TotalValue', 'Quantity']]
    .sum()
    .reset_index()
)

In [12]:
# Attach product attributes to every transaction row. Both inputs carry a
# 'Price' column, so the merged frame holds them as 'Price_x' / 'Price_y'.
product_features = pd.merge(transactions, products, on='ProductID', how='inner')



In [22]:
# Keep a single 'Price' column after the merge: discard the transaction-side
# copy ('Price_x') and rename the product-side copy ('Price_y') to 'Price'.
# errors='ignore' makes this cell safe to re-run: on a second execution
# 'Price_x' is already gone and the drop becomes a no-op instead of raising
# KeyError (rename already ignores labels that are not present).
product_features = (
    product_features
    .drop(columns=['Price_x'], errors='ignore')
    .rename(columns={'Price_y': 'Price'})
)

In [23]:
# Aggregate product-level data per customer.
# Named aggregation produces flat, self-describing column names directly and
# uses the built-in 'nunique' reducer instead of a per-group Python lambda.
# Resulting columns, in order:
#   CustomerID, AvgProductPrice, TotalProductPrice, UniqueCategories
customer_product_data = product_features.groupby('CustomerID').agg(
    AvgProductPrice=('Price', 'mean'),       # average price of products purchased
    TotalProductPrice=('Price', 'sum'),      # total price of products purchased
    UniqueCategories=('Category', 'nunique'),  # number of unique categories purchased
).reset_index()


In [24]:
# Assign flat column names positionally to the per-customer product aggregates.
customer_product_data.columns = [
    'CustomerID',
    'AvgProductPrice',
    'TotalProductPrice',
    'UniqueCategories',
]

In [25]:
# Merge with customer data
# Left joins keep every customer row, including customers that have no
# matching aggregate rows (their aggregate columns come through as NaN).
customer_profiles = (
    customers
    .merge(customer_transactions, on='CustomerID', how='left')
    .merge(customer_product_data, on='CustomerID', how='left')
)

In [26]:
# Fill missing values
# Replace NaN in each aggregate column with 0, one column at a time.
for column in (
    'TotalValue',
    'Quantity',
    'AvgProductPrice',
    'TotalProductPrice',
    'UniqueCategories',
):
    customer_profiles[column] = customer_profiles[column].fillna(0)

In [27]:
# Encode categorical data (Region)
# One-hot encode 'Region'; drop_first=True omits the first category's indicator column.
customer_profiles = pd.get_dummies(
    data=customer_profiles,
    columns=['Region'],
    drop_first=True,
)

In [28]:
# Standardize numerical features
# Fit the scaler on the numeric profile columns and overwrite them with the
# scaled values.
numerical_features = [
    'TotalValue',
    'Quantity',
    'AvgProductPrice',
    'TotalProductPrice',
    'UniqueCategories',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profiles[numerical_features])
customer_profiles[numerical_features] = scaled_values

In [29]:
# Calculate similarity matrix
# Identifier / descriptive columns are excluded from the feature matrix
# before computing pairwise cosine similarity between customer rows.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
customer_matrix = customer_profiles.drop(columns=non_feature_columns)
similarity_matrix = cosine_similarity(customer_matrix)

In [30]:
# Generate lookalikes for the first 20 customers
# customer_ids holds the CustomerID values of the first 20 profile rows;
# lookalike_map will map each of those IDs to its list of lookalikes.
customer_ids = customer_profiles['CustomerID'].head(20)
lookalike_map = {}

In [31]:
# Top 3 Lookalikes
# For each target customer, rank every OTHER customer by cosine similarity and
# keep the three best as (CustomerID, score) pairs.
#
# `idx` is the position of the target customer in `customer_ids`, which is also
# its row in `similarity_matrix` because `customer_ids` is taken from the first
# rows of `customer_profiles`.
for idx, cust_id in enumerate(customer_ids):
    # Exclude the customer's own row by position. Skipping "the first sorted
    # entry" is not reliable: another customer with a tied score can sort
    # ahead of the self-match, which would list the customer as their own
    # lookalike and push a real match out of the top 3.
    candidate_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    candidate_scores.sort(key=lambda pair: pair[1], reverse=True)

    # .iloc keeps the ID lookup positional, matching the matrix row numbers.
    top_lookalikes = [
        (customer_profiles['CustomerID'].iloc[other_idx], score)
        for other_idx, score in candidate_scores[:3]
    ]
    lookalike_map[cust_id] = top_lookalikes

In [32]:
# Create Lookalike.csv
# Flatten the {customer: [(lookalike, score), ...]} mapping into one record
# per (customer, lookalike) pair, preserving the mapping's iteration order.
lookalike_list = [
    {'cust_id': source_id, 'lookalike_id': match_id, 'score': match_score}
    for source_id, matches in lookalike_map.items()
    for match_id, match_score in matches
]

In [33]:
# Build a DataFrame with one row per lookalike record.
lookalike_df = pd.DataFrame(data=lookalike_list)

In [34]:
# Write the lookalike table to disk without the DataFrame index column.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [35]:
# Report completion and where the results were written.
print(
    "Lookalike model completed using both customer and product information. "
    "Results saved in Lookalike.csv."
)

Lookalike model completed using both customer and product information. Results saved in Lookalike.csv.
