In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Load the datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [5]:
# Step 1: Prepare the data
# Merge transactions with customer and product information
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')


In [7]:
# Create a pivot table for customer-product purchase behavior
customer_product_matrix = merged_data.pivot_table(
    index='CustomerID', 
    columns='ProductID', 
    values='Quantity', 
    aggfunc='sum', 
    fill_value=0
)


In [9]:
# Standardize the data for similarity calculations
scaler = StandardScaler()
customer_product_matrix_scaled = scaler.fit_transform(customer_product_matrix)


In [11]:
# Step 2: Calculate similarity
# Compute cosine similarity between customers
similarity_matrix = cosine_similarity(customer_product_matrix_scaled)
similarity_df = pd.DataFrame(
    similarity_matrix, 
    index=customer_product_matrix.index, 
    columns=customer_product_matrix.index
)

In [13]:
# Step 3: Generate lookalike recommendations
# Function to get top N similar customers for a given customer
def get_top_similar_customers(customer_id, top_n=3):
    if customer_id not in similarity_df.index:
        return []
    # Sort by similarity score and exclude the customer itself
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:top_n+1]
    return [(cust_id, round(score, 4)) for cust_id, score in similar_customers.items()]


In [21]:
# Step 4: Evaluate model logic and quality
# Train-test split on the customer-product matrix to simulate recommendations
def evaluate_model():
    train_data, test_data = train_test_split(customer_product_matrix, test_size=0.2, random_state=42)
    
    # Recompute similarity matrix using only the training data
    train_scaled = scaler.fit_transform(train_data)
    train_similarity_matrix = cosine_similarity(train_scaled)
    train_similarity_df = pd.DataFrame(
        train_similarity_matrix, 
        index=train_data.index, 
        columns=train_data.index
    )
    
    # Generate recommendations for test customers
    mse_scores = []
    for customer_id in test_data.index:
        if customer_id not in train_similarity_df.index:
            continue
        # Get similar customers from the training set
        recommended_customers = get_top_similar_customers(customer_id, top_n=3)
        
        # Compare with test data (simulated ground truth)
        actual_purchases = test_data.loc[customer_id].to_numpy()
        predicted_purchases = np.mean(
            [train_data.loc[sim_cust[0]].to_numpy() for sim_cust in recommended_customers], axis=0
        ) if recommended_customers else np.zeros_like(actual_purchases)
        
        # Calculate MSE
        mse = mean_squared_error(actual_purchases, predicted_purchases)
        mse_scores.append(mse)
    
    # Return the average MSE as a measure of model accuracy
    return np.mean(mse_scores)


In [23]:
# Evaluate the model accuracy
model_accuracy = evaluate_model()
print(f"Model Mean Squared Error (MSE): {model_accuracy:.4f}")

Model Mean Squared Error (MSE): nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [25]:
# Step 5: Save lookalike recommendations for the first 20 customers
lookalike_results = {}
for customer_id in customers['CustomerID'][:20]:
    lookalike_results[customer_id] = get_top_similar_customers(customer_id, top_n=3)


In [27]:
# Convert results to the required format
lookalike_list = []
for cust_id, similar_customers in lookalike_results.items():
    for similar_cust, score in similar_customers:
        lookalike_list.append({"CustomerID": cust_id, "SimilarCustomerID": similar_cust, "Score": score})

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv("Lookalike.csv", index=False)

In [29]:
# Explanation of Code Steps:
# 1. We merged the transactions, customers, and products datasets to create a comprehensive dataset.
# 2. A pivot table was created to capture customer-product purchase behavior.
# 3. The data was standardized to ensure all features contributed equally to the similarity calculations.
# 4. Cosine similarity was used to calculate the similarity scores between customers.
# 5. We evaluated the model's performance using a train-test split and calculated the Mean Squared Error (MSE).
# 6. Recommendations for the first 20 customers were generated and saved in the required format (Lookalike.csv).
