In [56]:
import pandas as pd 
import torch 
import torch.nn as nn 
from torch.nn import functional as F


# Load the three raw tables.
transactions_df = pd.read_csv('data/Transactions.csv')
customers_df = pd.read_csv('data/Customers.csv')
products_df = pd.read_csv('data/Products.csv')

# Merge all three datasets into one transaction-level frame. A `Price` column
# collides during the merges and is suffixed `_x` / `_y`; keep the `_x` copy
# under its plain name and drop the `_y` copy.
data = pd.merge(
    pd.merge(transactions_df, customers_df, on='CustomerID'),
    products_df,
    on='ProductID',
)
data = data.drop(columns='Price_y').rename(columns={'Price_x': "Price"})

# Every distinct CustomerID listed in Customers.csv, in file order.
# NOTE: this includes customers that have no transactions and is not sorted,
# so positions in this list do not correspond to groupby-derived rows of `data`.
customers = customers_df['CustomerID'].unique().tolist()

In [57]:
# Calculate, for each individual customer, the share (fraction of total quantity) bought in each product category.

def calculate_category_preferences(data):
    """Return each customer's share of purchased quantity per product category.

    Parameters
    ----------
    data : pandas.DataFrame
        Transaction-level rows with ``CustomerID``, ``Category`` and
        ``Quantity`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per customer (index = CustomerID) and one column per
        category. Each cell is the quantity the customer bought in that
        category divided by the customer's total quantity; categories the
        customer never bought are 0. Rows and columns follow the order of
        first appearance in ``data`` (NOT sorted order), so align this table
        with other per-customer tables by index label, never by position.
    """
    customer_order = data['CustomerID'].unique()
    category_order = data['Category'].unique()

    # A single grouped sum + unstack replaces re-filtering the frame once per
    # customer and once more per (customer, category) pair.
    quantities = (
        data.groupby(['CustomerID', 'Category'])['Quantity']
        .sum()
        .unstack('Category', fill_value=0)
        .reindex(index=customer_order, columns=category_order, fill_value=0)
    )
    totals = data.groupby('CustomerID')['Quantity'].sum().reindex(customer_order)

    # Row-wise division: each customer's category quantities over their total.
    preferences = quantities.div(totals, axis=0)
    return preferences.rename_axis(index=None, columns=None)


In [58]:
def extract_features(data):
    """Build the per-customer feature matrix used for similarity search.

    Parameters
    ----------
    data : pandas.DataFrame
        Transaction-level rows with ``CustomerID``, ``TransactionID``,
        ``TransactionDate``, ``TotalValue``, ``Quantity`` and ``Category``
        columns.

    Returns
    -------
    torch.Tensor
        ``float32`` tensor with one row per customer present in ``data``,
        ordered by sorted ``CustomerID`` (the ``groupby`` order). Columns are,
        in order: the 7 z-score-normalised numerical aggregates, the category
        shares from ``calculate_category_preferences``, and the one-hot
        encoded ``Region``.

    Notes
    -----
    ``Region`` is looked up in the module-level ``customers_df``. Every
    feature group is aligned on ``CustomerID`` before concatenation, so row
    ``i`` of each group always describes the same customer.
    """
    # Parse the dates once so max/min compare chronologically instead of as
    # raw strings; assign() leaves the caller's frame untouched.
    data = data.assign(TransactionDate=pd.to_datetime(data['TransactionDate']))

    # Measure recency against the latest transaction in the data rather than
    # the wall clock, so the output does not depend on when the notebook runs.
    # (Z-scoring below removes the constant offset between the two choices.)
    reference_date = data['TransactionDate'].max()

    # First create customer features by aggregation.
    customer_features = data.groupby('CustomerID').agg(
        avg_order_value=('TotalValue', 'mean'),      # Avg spend
        total_spend=('TotalValue', 'sum'),           # Total spend
        avg_quantity=('Quantity', 'mean'),           # Avg quantity
        total_quantity=('Quantity', 'sum'),          # Total quantity
        transaction_count=('TransactionID', 'count'),  # Total txns
        last_purchase=('TransactionDate', 'max'),
        first_purchase=('TransactionDate', 'min'),
    )
    # Time since last purchase.
    customer_features['days_since_last'] = (
        reference_date - customer_features['last_purchase']
    ).dt.days
    # Purchase range.
    customer_features['purchase_timespan'] = (
        customer_features['last_purchase'] - customer_features['first_purchase']
    ).dt.days

    numerical_cols = [
        'avg_order_value', 'total_spend', 'avg_quantity',
        'total_quantity', 'transaction_count', 'days_since_last',
        'purchase_timespan'
    ]

    # Sorted CustomerIDs: the single row order every feature group must follow.
    customer_ids = customer_features.index

    # Category shares come back in first-appearance order; realign them by
    # label so they match the sorted groupby rows.
    category_preferences = calculate_category_preferences(data).reindex(customer_ids)

    # Regional features by one-hot encoding, looked up per customer by ID
    # instead of relying on the row order of customers_df.
    region_by_customer = (
        customers_df.drop_duplicates(subset='CustomerID')
        .set_index('CustomerID')['Region']
        .reindex(customer_ids)
    )
    region_dummies = pd.get_dummies(region_by_customer)
    region_features = torch.tensor(
        region_dummies.to_numpy(dtype='float32'),
        dtype=torch.float32
    )

    # Convert all features to tensor format and normalize the numerical
    # features (z-score) so they share the same scale. The small epsilon
    # guards against division by zero for constant columns.
    numerical_features = torch.tensor(
        customer_features[numerical_cols].to_numpy(),
        dtype=torch.float32
    )
    numerical_features = (numerical_features - numerical_features.mean(dim=0)) / (
        numerical_features.std(dim=0) + 1e-7
    )

    category_features = torch.tensor(
        category_preferences.to_numpy(),
        dtype=torch.float32
    )

    # Concatenate the three feature groups column-wise into a single matrix.
    all_features = torch.cat([
        numerical_features,
        category_features,
        region_features
    ], dim=1)

    return all_features

In [60]:
# A basic neural network consisting of 2 linear layers.
class SimilarityModel(nn.Module):
    """Small feed-forward network mapping feature rows to unit-length embeddings.

    The network is Linear(input_dim -> 64), ReLU, Linear(64 -> 32); the
    32-dimensional output of each row is L2-normalised in ``forward``.

    Parameters
    ----------
    input_dim : int
        Number of columns in the feature matrix passed to ``forward``.
    """

    def __init__(self, input_dim):
        super(SimilarityModel, self).__init__()

        hidden_width, embedding_width = 64, 32
        projection = [
            nn.Linear(input_dim, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, embedding_width),
        ]
        self.layers = nn.Sequential(*projection)

    def forward(self, features):
        """Project ``features`` and scale every row to unit L2 norm."""
        raw_embeddings = self.layers(features)
        # Unit-length rows make a plain dot product equal to cosine similarity.
        return F.normalize(raw_embeddings, p=2, dim=1)

In [59]:
# Build the feature matrix from the merged transaction-level frame
# (one row per CustomerID present in `data`).
features = extract_features(data)

In [61]:
SEED = 42                 # fixes the random (untrained) projection weights
NUM_LOOKALIKES = 3        # lookalikes reported per customer
NUM_CUSTOMERS = 20        # report lookalikes for the first N customers

# The model is never trained, so its random initialisation fully determines
# the ranking; seed it so results survive a kernel restart.
torch.manual_seed(SEED)
model = SimilarityModel(input_dim=features.shape[1])

with torch.no_grad():
    embeddings = model(features)
    # Embeddings are L2-normalised, so the dot product is cosine similarity.
    similarity_matrix = torch.mm(embeddings, embeddings.t())

# Row i of `features` belongs to the i-th CustomerID in groupby (sorted) order
# over `data` -- i.e. only customers that have transactions. This is NOT the
# same as position i in Customers.csv, so build the ID lookup the same way the
# feature rows were built.
feature_customer_ids = data.groupby('CustomerID').size().index.tolist()
assert len(feature_customer_ids) == features.shape[0], (
    "feature rows and customer IDs are out of sync"
)

# Exclude each customer from their own ranking explicitly instead of assuming
# self is always the top hit (ties with identical embeddings can reorder it).
ranking_matrix = similarity_matrix.clone()
ranking_matrix.fill_diagonal_(float('-inf'))

# Clamp k so tiny datasets do not make topk fail.
k = min(NUM_LOOKALIKES, max(len(feature_customer_ids) - 1, 0))
top_scores, top_indices = torch.topk(ranking_matrix, k=k, dim=1)

# Get top similarities for the first NUM_CUSTOMERS customers.
lookalike_results = {}
for i in range(min(NUM_CUSTOMERS, len(feature_customer_ids))):
    lookalike_results[feature_customer_ids[i]] = [
        {'CustomerID': feature_customer_ids[idx], 'Score': float(score)}
        for idx, score in zip(top_indices[i].tolist(), top_scores[i].tolist())
    ]

lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_results.keys()),
    'Lookalikes': list(lookalike_results.values())
})
lookalike_df.to_csv('Lookaliketorch.csv', index=False)