## Building a Lookalike Model

In [50]:
# Import packages
import pandas as pd
import numpy as np
from collections import defaultdict

For calculating the similarity of two customers, I first pre-processed:
* Average of TotalValue spent by customer using Transactions table
* Count of purchases per product Category for each customer, using the ProductID in the Transactions table
* Classification of customers as Old or New, based on whether they signed up to the platform before '2024-01-01', using the Customers table

In [53]:
# Load the three raw tables.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Per-customer spend features: the average TotalValue per transaction and the
# number of purchases. The mean (not the sum) is used so that the 'TotalValue'
# column matches the "average TotalValue spent" feature the similarity score
# is described as using.
customer_transactions = (
    transactions.groupby('CustomerID')
    .agg({'TotalValue': 'mean', 'ProductID': 'count'})
    .reset_index()
    .rename(columns={'ProductID': 'NumPurchases'})
)

# Attach the product category to every transaction.
transactions = transactions.merge(products[['ProductID', 'Category']], on='ProductID', how='left')

# Distinct categories each customer bought from, as an (unordered) set.
category_sets_per_customer = (
    transactions.groupby('CustomerID')['Category']
    .apply(set)
    .reset_index()
    .rename(columns={'Category': 'Categories'})
)

# Left merges keep customers that have no transactions; their feature
# columns are NaN.
customers_data = customers.merge(customer_transactions, on='CustomerID', how='left')
customers_data = customers_data.merge(category_sets_per_customer, on='CustomerID', how='left')

# Classify customers as Old and New
customers_data['SignupDate'] = pd.to_datetime(customers_data['SignupDate'])
customers_data['CustomerType'] = np.where(
    customers_data['SignupDate'] < pd.Timestamp('2024-01-01'), 'Old', 'New'
)

# Calculate Category count per customer
category_counts_per_customer = (
    transactions.groupby(['CustomerID', 'Category'])['ProductID']
    .count()
    .reset_index()
    .rename(columns={'ProductID': 'CategoryCounts'})
)

# Collapse to one row per customer. 'Category' and 'CategoryCounts' are built
# from the same grouped rows, so the two lists are index-aligned: position k
# of 'CategoryCounts' is the purchase count for position k of 'Category'.
# (The 'Categories' set column above carries no such ordering.)
category_per_customer = (
    category_counts_per_customer.groupby('CustomerID')
    .agg({'Category': list, 'CategoryCounts': list})
    .reset_index()
)
customers_data = customers_data.merge(category_per_customer, on='CustomerID', how='left')


For calculating the similarity score out of 5, both personal and product information are used:
* Region:  
    * Increment total score by 1 if both customers belong to same region
* Signup Date:
    * Increment total score by 1 if both customers are classified as same (if both are Old or both are New)
* Average TotalValue spent:
    * The increment is 1 minus the absolute difference between the two customers' TotalValue figures divided by the larger of the two, so it ranges over [0, 1].
* Top 2 Product Categories for the customer:
    * Increment total score by 2, if both the top categories match.
    * Increment total score by 1, if any one of the top categories match.

In [54]:
def get_top_categories(categories, category_counts, top_n=2):
    """Return the names of the `top_n` categories with the highest counts.

    `categories` and `category_counts` are paired positionally. If a category
    name appears more than once, its last paired count is the one used.
    Categories with equal counts keep their first-seen order.
    """
    count_by_category = {}
    for name, count in zip(categories, category_counts):
        count_by_category[name] = count
    # Sorting the keys by their count (descending) is stable, so ties stay in
    # insertion order.
    ranked = sorted(count_by_category, key=count_by_category.get, reverse=True)
    return ranked[:top_n]

# Calculate similarity score out of 5
def calculate_similarity(customer1, customer2):
    """Score how alike two customers are, on a scale from 0 to 5.

    Each argument is one customer record (a row or mapping) providing the keys
    'Region', 'CustomerType', 'TotalValue', 'Category' and 'CategoryCounts'.

    The score is the sum of:
      * 1 if both customers are in the same region;
      * 1 if both customers have the same CustomerType;
      * a value in [0, 1] equal to 1 minus the absolute TotalValue difference
        divided by the larger TotalValue (0 when either value is missing or
        the larger value is not positive);
      * the number of categories (0, 1 or 2) shared by the two customers'
        top-2 categories (0 when either customer has no category data).
    """
    score = 0

    # Use customer information
    score += 1 if customer1['Region'] == customer2['Region'] else 0
    score += 1 if customer1['CustomerType'] == customer2['CustomerType'] else 0

    # Use product information: spend similarity.
    spend1, spend2 = customer1['TotalValue'], customer2['TotalValue']
    if not (pd.isnull(spend1) or pd.isnull(spend2)):
        larger_spend = max(spend1, spend2)
        # A non-positive denominator would divide by zero (or flip the sign),
        # so such pairs contribute nothing.
        if larger_spend > 0:
            score += max(0, 1 - abs(spend1 - spend2) / larger_spend)

    # Use product information: overlap of the top 2 categories.
    # 'Category' and 'CategoryCounts' are index-aligned lists, so they can be
    # paired positionally. The 'Categories' column is a set with no defined
    # order and must not be zipped against the counts.
    categories1, counts1 = customer1['Category'], customer1['CategoryCounts']
    categories2, counts2 = customer2['Category'], customer2['CategoryCounts']
    # Customers without transactions carry NaN here instead of a list;
    # pd.isnull() on a list returns an array, so test the type instead.
    if isinstance(categories1, (list, tuple)) and isinstance(categories2, (list, tuple)):
        customer1_top_categories = get_top_categories(categories1, counts1, top_n=2)
        customer2_top_categories = get_top_categories(categories2, counts2, top_n=2)
        score += len(set(customer1_top_categories) & set(customer2_top_categories))

    return score

* Using a nested loop where the outer loop runs over the first 20 customers and the inner loop runs over all customers, we store the similarity scores for each customer and then sort them to find the top 3 most similar customers.
* Store all the entries of the map in one dataframe and write it out in csv format.

In [56]:
# For each of the first 20 customers, score every other customer and keep the
# three highest-scoring ones.
lookalike_dict = defaultdict(list)
for _, target in customers_data.head(20).iterrows():
    target_id = target['CustomerID']
    scored_candidates = [
        (candidate['CustomerID'], calculate_similarity(target, candidate))
        for _, candidate in customers_data.iterrows()
        if target_id != candidate['CustomerID']
    ]
    # Stable descending sort: candidates with equal scores keep table order.
    ranked_candidates = sorted(scored_candidates, key=lambda pair: pair[1], reverse=True)
    lookalike_dict[target_id] = ranked_candidates[:3]

# Save to csv: one output row per (customer, lookalike) pair.
lookalike_list = [
    [cust_id, match_id, match_score]
    for cust_id, matches in lookalike_dict.items()
    for match_id, match_score in matches
]

output_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_list, columns=output_columns)
lookalike_df.to_csv('Pankti_Salvi_Lookalike.csv', index=False)
print("Lookalike model saved to csv")

Lookalike model saved to csv
