# In [None]:
# Task 2: Lookalike Model

# Required Libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

from google.colab import files

# Ask for the three input files one after another. For each upload, the first
# key of the mapping returned by files.upload() is used as the CSV path to read.
loaded_frames = []
for upload_prompt in (
    "upload 'Customers.csv'",
    "upload 'Products.csv'",
    " upload 'Transactions.csv'",
):
    print(upload_prompt)
    uploaded = files.upload()
    first_uploaded_name = next(iter(uploaded))
    loaded_frames.append(pd.read_csv(first_uploaded_name))

# Order matches the prompts above.
customers, products, transactions = loaded_frames

# Enrich every transaction with its customer and product attributes.
# Left joins keep each transaction row even when a lookup has no match.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

## Calculate total spend, average spend, and purchase frequency per customer
customer_features = merged_data.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_spend=('TotalValue', 'mean'),
    purchase_frequency=('TransactionID', 'count')
).reset_index()

# Preferred category: for each customer, the category with the largest summed
# TotalValue (idxmax keeps the first row on ties).
preferred_category = merged_data.groupby(['CustomerID', 'Category'])['TotalValue'].sum().reset_index()
preferred_category = preferred_category.loc[preferred_category.groupby('CustomerID')['TotalValue'].idxmax()]
preferred_category = preferred_category[['CustomerID', 'Category']]

# One row per customer: numeric spend features plus the preferred category.
data = pd.merge(customer_features, preferred_category, on='CustomerID', how='left')

# One-hot encode the preferred category and keep EVERY category column.
# Dropping the first level (as regression encodings do) would represent that
# category as an all-zero vector, making it closer to every other category
# than they are to each other and skewing the similarity scores.
# dtype=float keeps the feature matrix purely numeric for the scaler.
data = pd.get_dummies(data, columns=['Category'], dtype=float)

# Standardize the features: every column except the CustomerID identifier
feature_columns = data.columns.drop('CustomerID')
feature_frame = data[feature_columns]

scaler = StandardScaler()
scaler.fit(feature_frame)
scaled_features = scaler.transform(feature_frame)

# Pairwise cosine similarity between all customers (rows follow the order of `data`)
similarity_matrix = cosine_similarity(scaled_features)

# Find top 3 similar customers for CustomerIDs C0001 - C0020
# NOTE(review): this takes the first 20 rows of `data`; that equals
# C0001 - C0020 only if each of those customers appears in `data` and the
# rows are ordered by CustomerID -- confirm against the input files.
customer_ids = data['CustomerID'][:20]
all_customer_ids = data['CustomerID'].to_numpy()
lookalikes = {}

for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer's own row explicitly instead of assuming it sorts
    # first: ties (identical feature vectors) or float rounding can rank
    # another customer at or above the self-similarity score.
    ranking_scores = similarity_matrix[idx].copy()
    ranking_scores[idx] = -np.inf
    # Stable sort keeps the original row order among equal scores, so the
    # output is deterministic.
    similar_indices = np.argsort(-ranking_scores, kind='stable')[:3]
    # Only relevant when there are fewer than four customers in total.
    similar_indices = similar_indices[similar_indices != idx]
    similar_customers = all_customer_ids[similar_indices]
    scores = similarity_matrix[idx, similar_indices]
    lookalikes[customer_id] = list(zip(similar_customers, scores))

# Flatten {customer: [(similar_id, score), ...]} into one row per pair.
lookalike_df = pd.DataFrame(
    [
        (cust, similar_id, score)
        for cust, sims in lookalikes.items()
        for similar_id, score in sims
    ],
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore']
)

# Write the results and report the file name that was actually written
# (the previous message named a different file than the one created).
lookalike_path = 'Monika_R_Lookalike.csv'
lookalike_df.to_csv(lookalike_path, index=False)

print(f"{lookalike_path} has been created successfully.")


# Echo the computed matches, one line per customer, in insertion order.
print("Top 3 lookalikes for the first 20 customers:")
for customer in lookalikes:
    matches = lookalikes[customer]
    print(f"Customer {customer}: {matches}")
