### Import required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import json

### Step 1: Load and Preprocess Data


In [2]:
# Read the three raw input tables from the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

### Merge datasets


In [3]:
# Attach product and customer attributes to every transaction.
# Both joins are inner joins (the pandas default), so only transactions with a
# matching ProductID and CustomerID are kept.
transactions_with_products = transactions.merge(products, on='ProductID', how='inner')
merged_data = transactions_with_products.merge(customers, on='CustomerID', how='inner')


### Step 2: Feature Engineering


In [4]:
# Build one row of behavioural features per customer.
def _active_span_days(dates):
    """Return the number of days between the first and last date, inclusive."""
    parsed = pd.to_datetime(dates)
    return (parsed.max() - parsed.min()).days + 1


customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),             # total spending
        NumTransactions=('TransactionID', 'count'),   # number of transactions
        ProductDiversity=('Category', 'nunique'),     # distinct categories purchased
        # Column keeps the name 'TransactionDate' but holds the activity span in days.
        TransactionDate=('TransactionDate', _active_span_days),
    )
    .reset_index()
)


In [5]:
# Rescale each model feature to the [0, 1] range.
# Only these three columns are used; the activity-duration column computed in
# customer_features is not passed to the scaler.
feature_columns = ['TotalValue', 'NumTransactions', 'ProductDiversity']
feature_frame = customer_features[feature_columns]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(feature_frame)


### Step 3: Compute Similarity Scores


In [6]:
# Pairwise cosine similarity between customers, labelled by CustomerID on both axes.
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


### Step 4: Generate Lookalike Recommendations


In [10]:
# Build, for every customer, the list of its most similar *other* customers.
TOP_N_LOOKALIKES = 3  # number of lookalikes kept per customer

lookalike_map = {}
all_similarity_scores = []

for idx, customer_id in enumerate(similarity_df.index):
    # Remove the customer's own entry by label before ranking. Skipping the
    # first sorted element instead is unsafe: on exact ties, floating-point
    # round-off, or an all-zero feature row (self-similarity 0) the customer
    # is not guaranteed to sort first, so it could end up in its own list
    # while a genuine lookalike is dropped.
    candidate_scores = similarity_df.iloc[idx].drop(labels=customer_id)
    similar_customers = candidate_scores.nlargest(TOP_N_LOOKALIKES)

    # Store similarity scores for evaluation
    all_similarity_scores.extend(similar_customers.values)

    # Cast to a plain float so JSON output and printed reprs do not depend on
    # the numpy scalar type.
    lookalike_map[customer_id] = [
        {"cust_id": sim_id, "score": float(score)}
        for sim_id, score in similar_customers.items()
    ]


In [14]:
# Write one row per customer to Paras_Wane_Lookalike.csv; the lookalike list is
# stored as a JSON string in the "lookalikes" column.
lookalike_rows = [
    {"cust_id": source_id, "lookalikes": json.dumps(matches)}
    for source_id, matches in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=["cust_id", "lookalikes"])
lookalike_df.to_csv('Paras_Wane_Lookalike.csv', index=False)


### Model Evaluation (similarity diagnostics)


In [15]:
from sklearn.metrics import silhouette_score

# NOTE(review): these "labels" are the column position of each row's largest
# similarity value, not cluster assignments. No clustering is performed in this
# notebook, so the silhouette value is not a meaningful quality measure and
# should be treated as a placeholder until real cluster labels exist.
pseudo_labels = np.argmax(similarity_matrix, axis=1)

# silhouette_score raises ValueError unless the number of distinct labels lies
# in [2, n_samples - 1]. That bound is violated when every row's argmax is its
# own position, so guard the call and report NaN instead of crashing.
n_distinct_labels = np.unique(pseudo_labels).size
if 2 <= n_distinct_labels <= len(pseudo_labels) - 1:
    silhouette_avg = silhouette_score(scaled_features, pseudo_labels)
else:
    silhouette_avg = float("nan")

# Summary statistics over the similarity scores of all selected lookalikes.
mean_similarity = np.mean(all_similarity_scores)
median_similarity = np.median(all_similarity_scores)

# Print Model Accuracy and Similarity Scores
print("Model Evaluation:")
print(f"Silhouette Score: {silhouette_avg:.4f}")
print(f"Mean Similarity Score: {mean_similarity:.4f}")
print(f"Median Similarity Score: {median_similarity:.4f}")

Model Evaluation:
Silhouette Score: 0.0252
Mean Similarity Score: 0.9938
Median Similarity Score: 0.9998


In [16]:
# Preview the lookalike lists of the first 20 customers in insertion order.
print("\nTop 3 Lookalike Customers for the first 20 customers:")
preview_items = list(lookalike_map.items())[:20]
for cust_id, matches in preview_items:
    print(f"{cust_id}: {matches}")


Top 3 Lookalike Customers for the first 20 customers:
C0001: [{'cust_id': 'C0152', 'score': 0.9999946121880003}, {'cust_id': 'C0122', 'score': 0.9999836589104507}, {'cust_id': 'C0186', 'score': 0.9999592403338075}]
C0002: [{'cust_id': 'C0199', 'score': 0.9997723571957567}, {'cust_id': 'C0010', 'score': 0.9996337031586415}, {'cust_id': 'C0175', 'score': 0.9988707072564544}]
C0003: [{'cust_id': 'C0178', 'score': 0.9999981652158934}, {'cust_id': 'C0035', 'score': 0.9998716529680988}, {'cust_id': 'C0146', 'score': 0.9998382139102657}]
C0004: [{'cust_id': 'C0101', 'score': 0.9998871853612922}, {'cust_id': 'C0160', 'score': 0.9982651867263247}, {'cust_id': 'C0069', 'score': 0.9982395471473554}]
C0005: [{'cust_id': 'C0073', 'score': 0.9999864661084878}, {'cust_id': 'C0064', 'score': 0.9999859217868636}, {'cust_id': 'C0045', 'score': 0.9999847376096453}]
C0006: [{'cust_id': 'C0079', 'score': 0.9999971079567408}, {'cust_id': 'C0114', 'score': 0.9988932573659969}, {'cust_id': 'C0158', 'score': 