In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Directory holding the three input CSVs. Override it by setting the
# LOOKALIKE_DATA_DIR environment variable before starting the kernel; the
# default keeps the location this notebook was originally run against.
DATA_DIR = Path(os.environ.get("LOOKALIKE_DATA_DIR", r"C:\Users\abuob\Downloads"))

products = pd.read_csv(DATA_DIR / "Products.csv")
customers = pd.read_csv(DATA_DIR / "Customers.csv")
transactions = pd.read_csv(DATA_DIR / "Transactions.csv")

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [5]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use pandas' default inner join and default "_x"/"_y" suffixes for
# overlapping column names.
merged_data = (
    transactions
    .merge(customers, on="CustomerID")
    .merge(products, on="ProductID")
)

In [6]:
# Collapse the merged transactions into one feature row per customer.
customer_features = merged_data.groupby("CustomerID", as_index=False).agg(
    dict(
        Quantity="sum",        # total quantity purchased
        Price_x="mean",        # mean of the left-hand ("_x") Price column over the customer's rows
        Region="first",        # first Region value seen for the customer
        SignupDate="first",    # first SignupDate value seen for the customer
    )
)

In [7]:
# Replace the raw SignupDate column with its calendar year (SignUpYear).
signup_timestamps = pd.to_datetime(customer_features["SignupDate"])
customer_features["SignUpYear"] = signup_timestamps.dt.year
customer_features = customer_features.drop(columns=["SignupDate"])

In [8]:
# Expand Region into indicator columns; drop_first=True omits the first
# category so the indicators are not redundant with each other.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=["Region"],
    drop_first=True,
)

In [9]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column (everything except the CustomerID key)
# before computing similarities.
feature_columns = customer_features.drop(columns=["CustomerID"])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_columns)

In [10]:
# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i, j] compares customer row i with customer row j.
similarity_matrix = cosine_similarity(X=scaled_features)

In [11]:
# Function to get the top lookalikes (3 by default) for a given customer
def get_top_3_lookalikes(customer_index, similarity_matrix, customer_ids=None, top_n=3):
    """Return the most similar *other* customers for one row of a similarity matrix.

    The customer's own row is excluded explicitly by position rather than by
    assuming it sorts first. That assumption breaks when another customer ties
    with the self-similarity (e.g. identical feature vectors), in which case
    the customer would be reported as its own lookalike.

    Parameters
    ----------
    customer_index : int
        Row position of the customer in ``similarity_matrix``. Negative
        positions count from the end.
    similarity_matrix : 2-D array-like
        Square matrix where ``similarity_matrix[i][j]`` is the similarity
        between rows ``i`` and ``j``.
    customer_ids : sequence, optional
        Customer identifiers in the same row order as ``similarity_matrix``.
        When omitted, the ``CustomerID`` column of the notebook-level
        ``customer_features`` frame is used.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, score) tuples
        Ordered from most to least similar, scores rounded to 3 decimals.
        Ties keep row order. Fewer than ``top_n`` entries are returned when
        there are not enough other customers.
    """
    scores = np.asarray(similarity_matrix[customer_index], dtype=float)
    ids = list(customer_features["CustomerID"] if customer_ids is None else customer_ids)

    # Normalise a negative position so it can be compared with argsort output.
    own_position = customer_index if customer_index >= 0 else customer_index + len(scores)

    # Stable sort on the negated scores gives descending order while keeping
    # the original row order among equal scores.
    ranked_positions = np.argsort(-scores, kind="stable")
    chosen = [int(pos) for pos in ranked_positions if pos != own_position][:max(top_n, 0)]

    return [(ids[pos], round(float(scores[pos]), 3)) for pos in chosen]


In [12]:
# Generate lookalikes for the first 20 rows of customer_features.
# NOTE(review): these rows correspond to C0001-C0020 only if every one of those
# IDs is present in customer_features and the rows are ordered by CustomerID -
# confirm against the data.
N_LOOKALIKE_TARGETS = 20

lookalikes = {}
# Bound the range by the table length so a smaller dataset does not raise
# IndexError from .iloc.
for customer_index in range(min(N_LOOKALIKE_TARGETS, len(customer_features))):
    customer_id = customer_features.iloc[customer_index]["CustomerID"]
    lookalikes[customer_id] = get_top_3_lookalikes(customer_index, similarity_matrix)

In [14]:
# Example: inspect the stored recommendations for one customer
customer_index = 0  # row position in customer_features; edit to evaluate another customer
customer_id = customer_features["CustomerID"].iloc[customer_index]
top_lookalikes = lookalikes[customer_id]

print(f"\nTop 3 Lookalikes for Customer {customer_id}:")
for lookalike_id, lookalike_score in top_lookalikes:
    print(f"  Similar Customer: {lookalike_id}, Similarity Score: {lookalike_score}")


Top 3 Lookalikes for Customer C0001:
  Similar Customer: C0011, Similarity Score: 0.993
  Similar Customer: C0152, Similarity Score: 0.986
  Similar Customer: C0118, Similarity Score: 0.965


In [15]:
# 1. Verify logical similarity
# Pull the scaled feature rows for the evaluated customer and for the
# highest-ranked lookalike.
evaluated_customer_features = scaled_features[customer_index]

best_match_id = top_lookalikes[0][0]
is_best_match = customer_features["CustomerID"] == best_match_id
# The index label is used as a row position into scaled_features, so this
# relies on customer_features still carrying its default 0..n-1 index.
top_lookalike_index = customer_features.loc[is_best_match].index[0]
top_lookalike_features = scaled_features[top_lookalike_index]

In [16]:
# Cross-check with Euclidean distance in the scaled feature space
# (a smaller distance means the two customers are closer).
distance = pairwise_distances(
    X=[evaluated_customer_features],
    Y=[top_lookalike_features],
    metric="euclidean",
)
top_lookalike_distance = distance[0][0]  # the 1x1 result holds the single pair's distance
print(f"Euclidean Distance to Top Lookalike: {top_lookalike_distance}")

Euclidean Distance to Top Lookalike: 0.2624060794952542


In [17]:
# 2. Summary of Scores Distribution
# The row includes the customer's similarity to itself.
similarity_scores = similarity_matrix[customer_index]
mean_score = similarity_scores.mean()
max_score = similarity_scores.max()
min_score = similarity_scores.min()

print(f"\nSimilarity Scores Distribution for Customer {customer_id}:")
print(f"  Mean Similarity: {mean_score:.3f}")
print(f"  Max Similarity: {max_score:.3f}")
print(f"  Min Similarity: {min_score:.3f}")


Similarity Scores Distribution for Customer C0001:
  Mean Similarity: 0.006
  Max Similarity: 1.000
  Min Similarity: -0.645
