In [27]:
import pandas as pd

# Load the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)


In [28]:
# Attach each product's Category to its transactions.
# A left join keeps every transaction row, even if its ProductID has no match.
product_categories = products[['ProductID', 'Category']]
transactions_with_category = pd.merge(
    transactions,
    product_categories,
    how='left',
    on='ProductID',
)

In [29]:
# Total spend per (customer, category) pair, returned as a flat table with
# columns CustomerID, Category and category_spend.
customer_category_sales = (
    transactions_with_category
    .groupby(['CustomerID', 'Category'])['TotalValue']
    .sum()
    .rename('category_spend')
    .reset_index()
)

In [30]:
# Reshape to one row per customer and one column per category; categories a
# customer never bought are filled with 0.
# aggfunc is stated explicitly: pivot_table defaults to 'mean', which would
# silently average (not total) spend if a (CustomerID, Category) pair ever
# appeared more than once in the input.
# NOTE: this cell rebinds `customer_category_sales` from the long table to the
# wide one, so re-running it alone fails (the wide frame has no 'Category'
# column) -- re-run the aggregation cell first.
customer_category_sales = customer_category_sales.pivot_table(
    index='CustomerID',
    columns='Category',
    values='category_spend',
    aggfunc='sum',
    fill_value=0
)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize each category column (StandardScaler's default: zero mean, unit
# variance) so that no single category dominates the similarity computation.
scaler = StandardScaler()
scaler.fit(customer_category_sales)
category_sales_scaled = scaler.transform(customer_category_sales)

In [32]:
# Pairwise cosine similarity between every pair of customers, computed on the
# standardized category-spend vectors (rows of the scaled matrix).
cosine_sim = cosine_similarity(X=category_sales_scaled)

In [33]:
# Wrap the similarity matrix in a DataFrame labelled by CustomerID on both
# axes, so scores can be looked up by customer rather than by position.
customer_ids = customer_category_sales.index
similarity_df = pd.DataFrame(
    data=cosine_sim,
    index=customer_ids,
    columns=customer_ids,
)

In [34]:
# Get the top 3 similar customers for the first 20 customers in the matrix.
# Maps CustomerID -> list of (lookalike CustomerID, similarity score) tuples,
# ordered from most to least similar.
# NOTE(review): "first 20" means the first 20 rows of the pivoted spend table;
# customers with no categorized transactions are not in that table -- confirm
# this matches the intended customer list.
lookalike_dict = {}
for customer_id in customer_category_sales.index[:20]:
    # Remove the customer's own entry by label instead of skipping position 0:
    # another customer can tie at similarity 1.0 and sort ahead of the
    # self-match, which would list the customer as their own lookalike.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .nlargest(3)
    )
    lookalike_dict[customer_id] = list(
        zip(similar_customers.index, similar_customers.values)
    )

In [35]:
# Flatten the lookalike dictionary into one row per (customer, lookalike) pair
# and build the result table from those rows in a single step.
lookalike_data = [
    [source_id, lookalike_id, score]
    for source_id, matches in lookalike_dict.items()
    for lookalike_id, score in matches
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'LookalikeID', 'SimilarityScore'],
)

In [36]:
# Persist the lookalike table to disk; the positional row index is omitted so
# the file contains only the three data columns.
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [37]:
# Preview the first five rows of the lookalike table as plain text.
print(lookalike_df.head(n=5))

  CustomerID LookalikeID  SimilarityScore
0      C0001       C0091         0.988848
1      C0001       C0069         0.984344
2      C0001       C0184         0.978562
3      C0002       C0159         0.979511
4      C0002       C0036         0.956762
