# Task 2: Lookalike Model

# In [1]:

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Lookalike model: describe each customer by spending behaviour and region,
# compare customers with cosine similarity, and export the closest matches
# for a fixed set of target customers.

# Input and output locations
customers_path = "Customers.csv"
transactions_path = "Transactions.csv"
products_path = "Products.csv"
lookalike_csv_path = "Luv_Pahwa_Lookalike.csv"

# Customers to report lookalikes for (C0001 to C0020) and matches per customer.
# Selecting by ID rather than by position keeps the report on the intended
# customers even when one of them is missing from the feature table.
TARGET_CUSTOMER_IDS = [f"C{number:04d}" for number in range(1, 21)]
TOP_N = 3


def find_top_lookalikes(similarity_df, target_ids, top_n=3):
    """Return the most similar customers for each requested customer.

    Args:
        similarity_df: Square DataFrame of pairwise similarity scores whose
            index and columns are both customer IDs.
        target_ids: Iterable of customer IDs to find lookalikes for. IDs that
            are not columns of ``similarity_df`` are skipped.
        top_n: Number of lookalikes to return per customer.

    Returns:
        Dict mapping each found target ID to a list of
        ``(similar_customer_id, score)`` tuples ordered from highest to
        lowest score. A customer never appears in their own list.
    """
    lookalikes = {}
    for customer_id in target_ids:
        if customer_id not in similarity_df.columns:
            continue
        # Remove the customer's own score by label. Skipping "the first row
        # after sorting" is unreliable: another customer can tie with the
        # self-similarity score and take the first position.
        scores = similarity_df[customer_id].drop(labels=customer_id, errors="ignore")
        best = scores.nlargest(top_n)
        lookalikes[customer_id] = list(zip(best.index, best.tolist()))
    return lookalikes


# The pipeline touches the filesystem, so it only runs when this code is
# executed directly (a script or a notebook cell), not when it is imported.
if __name__ == "__main__":
    # Load the datasets
    customers_df = pd.read_csv(customers_path)
    transactions_df = pd.read_csv(transactions_path)
    products_df = pd.read_csv(products_path)

    # Convert date columns to datetime format (unparseable dates become NaT)
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'], errors='coerce')
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'], errors='coerce')

    # Merge datasets; the inner joins keep only transactions whose customer
    # and product are both known
    customer_transactions = pd.merge(transactions_df, customers_df, on="CustomerID", how="inner")
    full_data = pd.merge(customer_transactions, products_df, on="ProductID", how="inner")

    # Feature engineering: one row per customer that has transactions
    customer_features = full_data.groupby('CustomerID').agg(
        TotalSpending=('TotalValue', 'sum'),  # Total spending
        TransactionCount=('TransactionID', 'count'),  # Transaction count
        UniqueCategories=('Category', 'nunique'),  # Unique categories purchased
    )

    # Add region as a categorical feature (one-hot encoding). Floats are
    # requested explicitly so the feature matrix has no boolean columns.
    region_dummies = pd.get_dummies(
        customers_df.set_index('CustomerID')['Region'], prefix='Region', dtype=float
    )

    # Combine numerical and categorical features
    customer_features = customer_features.join(region_dummies)

    # Normalize numerical features for similarity calculations
    scaler = StandardScaler()
    numerical_features = ['TotalSpending', 'TransactionCount', 'UniqueCategories']
    customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

    # Calculate cosine similarity between every pair of customers
    similarity_matrix = cosine_similarity(customer_features)

    # Label the similarity matrix with CustomerIDs on both axes
    customer_ids = customer_features.index.tolist()
    similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

    # Extract the top similar customers for each customer in C0001 to C0020
    top_customers = find_top_lookalikes(similarity_df, TARGET_CUSTOMER_IDS, top_n=TOP_N)

    # Flatten the mapping into one row per (customer, lookalike) pair
    lookalike_data = []
    for cust_id, similar_list in top_customers.items():
        lookalike_data.extend([cust_id, similar_cust, score] for similar_cust, score in similar_list)

    lookalike_df = pd.DataFrame(lookalike_data, columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"])

    # Save the Lookalike data
    lookalike_df.to_csv(lookalike_csv_path, index=False)
