In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets.
# DATA_DIR is the single place to edit when the CSV files live somewhere else;
# its default is the directory the three files were originally read from.
DATA_DIR = Path(r"D:\z\Zeotap")

customers_df = pd.read_csv(DATA_DIR / "Customers - Customers.csv")
products_df = pd.read_csv(DATA_DIR / "Products - Products.csv")
transactions_df = pd.read_csv(DATA_DIR / "Transactions - Transactions.csv")


In [None]:
# Attach product columns to every transaction, then customer columns.
# Both joins are left joins, so each transaction row is kept even when the
# lookup table has no matching ProductID / CustomerID.
transactions_products = transactions_df.merge(products_df, how="left", on="ProductID")
full_data = transactions_products.merge(customers_df, how="left", on="CustomerID")

# Parse the two date columns into pandas datetimes.
for date_column in ("TransactionDate", "SignupDate"):
    full_data[date_column] = pd.to_datetime(full_data[date_column])

In [None]:
# Create customer profiles: one row per CustomerID summarising its transactions.
def _most_common(values):
    """Return the most frequent non-null value in ``values``, or NaN if none exists.

    ``Series.mode()`` drops nulls by default, so a customer whose rows are all
    null (e.g. a ProductID with no match in the products table after the left
    merge) yields an empty result; indexing it with ``[0]`` would raise.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


customer_profiles = full_data.groupby("CustomerID").agg(
    total_spend=("TotalValue", "sum"),
    transaction_count=("TransactionID", "count"),
    avg_spend_per_transaction=("TotalValue", "mean"),
    unique_products=("ProductID", "nunique"),
    signup_date=("SignupDate", "first"),
    most_common_category=("Category", _most_common),
    region=("Region", "first")
).reset_index()

In [None]:
# One-hot encode categorical features.
# This cell overwrites customer_profiles, so only encode the columns that are
# still present: re-running the cell after it has already executed is then a
# no-op instead of a KeyError on the already-consumed columns.
categorical_columns = [
    column
    for column in ("most_common_category", "region")
    if column in customer_profiles.columns
]
if categorical_columns:
    customer_profiles = pd.get_dummies(customer_profiles, columns=categorical_columns)

In [None]:
# Prepare data for similarity computation
customer_ids = customer_profiles["CustomerID"]
profile_features = customer_profiles.drop(columns=["CustomerID", "signup_date"])

# Put the continuous columns on a common scale before measuring angles.
# Cosine similarity on raw values is driven almost entirely by the columns with
# the largest magnitudes (the spend totals), which leaves the 0/1 category and
# region indicators with practically no influence on the result.
dummy_prefixes = ("most_common_category_", "region_")  # names produced by get_dummies
continuous_columns = [
    column
    for column in profile_features.columns
    if not column.startswith(dummy_prefixes)
]
scaled_features = profile_features.astype(float)  # indicator columns become 0.0 / 1.0
column_means = scaled_features[continuous_columns].mean()
# Population standard deviation; a constant column gets divisor 1 to avoid 0/0.
column_stds = scaled_features[continuous_columns].std(ddof=0).replace(0, 1.0)
scaled_features[continuous_columns] = (
    scaled_features[continuous_columns] - column_means
) / column_stds

# Compute cosine similarity (row i, column j = similarity of customers i and j)
similarity_matrix = cosine_similarity(scaled_features)

In [None]:
# Create lookalike map for the first 20 customers
NUM_TARGET_CUSTOMERS = 20  # first rows of customer_ids (C0001 to C0020 per the original note)
TOP_N = 3  # lookalikes kept per customer

lookalike_map = {}
# Bound the loop so a dataset with fewer customers does not raise IndexError.
for idx in range(min(NUM_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids.iloc[idx]
    similarity_scores = similarity_matrix[idx]
    # Rank all customers from most to least similar, then remove the customer
    # itself by index. Skipping "position 0" is not safe: with ties or
    # floating-point rounding another customer can sort ahead of the self-match.
    ranked_indices = np.argsort(similarity_scores)[::-1]
    similar_indices = ranked_indices[ranked_indices != idx][:TOP_N]
    similar_customers = [
        # Store built-in floats so later string/CSV output is not NumPy-version dependent.
        (customer_ids.iloc[sim_idx], float(similarity_scores[sim_idx]))
        for sim_idx in similar_indices
    ]
    lookalike_map[customer_id] = similar_customers

In [2]:
# Convert to DataFrame and save as CSV
lookalike_df = pd.DataFrame(
    [
        {
            "cust_id": cust_id,
            # str() of a list uses each element's repr; NumPy >= 2 renders scalars
            # as "np.float64(...)", so cast scores to built-in floats first.
            "lookalikes": str(
                [(other_id, float(score)) for other_id, score in similar_customers]
            ),
        }
        for cust_id, similar_customers in lookalike_map.items()
    ],
    # Fix the columns explicitly so an empty map still produces a header row.
    columns=["cust_id", "lookalikes"],
)
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)

print("FirstName_LastName_Lookalike.csv has been generated with the top 3 lookalikes for the first 20 customers.")


FirstName_LastName_Lookalike.csv has been generated with the top 3 lookalikes for the first 20 customers.
