# In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw transaction and customer tables from the data directory.
# NOTE: `customers` is loaded but not referenced in the visible part of this
# script; it is kept in case later cells rely on it.
transactions = pd.read_csv('../data/Transactions.csv')
customers = pd.read_csv('../data/Customers.csv')

# Build one row per customer holding their total spend and total quantity
# (this is a per-customer aggregation of the transactions, not a merge).
spend_by_customer = transactions.groupby('CustomerID')[['TotalValue', 'Quantity']].sum()
customer_profile = spend_by_customer.reset_index()

# Standardise the two profile features in place (zero mean, unit variance)
# so that neither dominates the similarity computation.
feature_cols = ['TotalValue', 'Quantity']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profile[feature_cols])
customer_profile[feature_cols] = scaled_features

# Pairwise cosine similarity between every pair of customer profiles;
# entry [i, j] compares the i-th and j-th rows of `customer_profile`.
similarity_matrix = cosine_similarity(customer_profile[feature_cols])

# Find the top-N most similar customers for every customer.
#
# `lookalike_results` maps each CustomerID to a list of
# (other CustomerID, similarity rounded to 2 decimals) tuples, ordered from
# most to least similar. The list is shorter than TOP_N when there are fewer
# than TOP_N other customers.
TOP_N = 3

# Pull the ID column out once: row i of `similarity_matrix` corresponds to
# row i of `customer_profile`. Reading the column directly (instead of
# `iloc[idx]['CustomerID']`) avoids building a mixed-dtype row per iteration,
# which would also coerce numeric IDs to float.
customer_ids = customer_profile['CustomerID'].tolist()

lookalike_results = {}
for idx, row in enumerate(similarity_matrix):
    # Rank every customer by descending similarity, then drop the customer
    # itself *by position*. Simply skipping the first ranked entry is not
    # safe: other customers can tie with the self-similarity score (e.g.
    # profiles pointing in the same direction have cosine similarity 1.0),
    # so the customer is not guaranteed to sort first.
    ranked = row.argsort()[::-1]
    similar_indices = ranked[ranked != idx][:TOP_N]

    # Cast scores to built-in floats so the tuples serialise as plain
    # numbers; NumPy >= 2 scalars would otherwise print as
    # `np.float64(...)` when the list is written out as text.
    lookalike_results[customer_ids[idx]] = [
        (customer_ids[j], round(float(row[j]), 2)) for j in similar_indices
    ]

# Flatten the lookalike mapping into one record per customer and write it
# to CSV (no index column).
records = []
for cust_id, matches in lookalike_results.items():
    records.append({"cust_id": cust_id, "similar_custs": matches})

lookalike_df = pd.DataFrame(records)
lookalike_df.to_csv('Nikhil_Baghel_Lookalike.csv', index=False)
