<a href="https://colab.research.google.com/github/natalrhyme/eCommerce-Transactions-dataset/blob/main/Rishabh_sangwan_lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the three input tables; the files are expected in the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Left joins keep every transaction row even when no matching customer or
# product row exists, so columns contributed by the right-hand tables
# (presumably Region and ProductName -- verify against the CSV schemas) can be
# NaN in the merged frame.
merged_data = transactions.merge(customers, on="CustomerID", how="left").merge(products, on="ProductID", how="left")


def _join_product_names(names):
    """Return one customer's non-missing product names, space-separated."""
    # str.join raises TypeError on NaN, so missing names are dropped first.
    return " ".join(names.dropna().astype(str))


# Create a customer-level profile: one row per CustomerID holding the region of
# its first merged row, the text of everything it bought, and its total spend.
customer_profiles = merged_data.groupby("CustomerID").agg({
    "Region": lambda x: x.iloc[0],
    "ProductName": _join_product_names,  # Concatenate all product names purchased
    "TotalValue": "sum"
}).reset_index()

# A missing Region would turn the whole concatenation into NaN, which
# TfidfVectorizer rejects as a document; fall back to an empty string so the
# profile still carries the product names.
customer_profiles["Profile"] = customer_profiles["Region"].fillna("") + " " + customer_profiles["ProductName"]

# Encode every customer's profile text as a TF-IDF weighted row vector.
profile_texts = customer_profiles["Profile"]
vectorizer = TfidfVectorizer()
profile_matrix = vectorizer.fit_transform(profile_texts)

# Pairwise cosine similarity of the profile rows against themselves:
# entry [i, j] scores customer i against customer j.
similarity_matrix = cosine_similarity(profile_matrix, profile_matrix)

# Extract the top 3 similar customers for each of the first 20 customers
# (first 20 rows of customer_profiles, i.e. in its CustomerID row order).
# Result shape: {customer_id: [(similar_customer_id, score), ...]}.
lookalikes = {}
customer_ids = customer_profiles["CustomerID"].values
for i, cust_id in enumerate(customer_ids[:20]):
    # Drop the self-comparison by index rather than by assuming it sorts
    # first: a customer with an identical profile (tied score), float rounding,
    # or an all-zero profile vector can push the self entry out of position 0,
    # which would list the customer as its own lookalike.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # sorted() is stable, so equally scored candidates keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_3 = [(customer_ids[j], score) for j, score in candidates[:3]]
    lookalikes[cust_id] = top_3

# Save lookalikes to Lookalike.csv
# Flatten the {customer: [(match, score), ...]} mapping into one record per
# (customer, match) pair so that each pair becomes one CSV row.
lookalike_data = [
    {"cust_id": source_id, "similar_cust": match_id, "score": match_score}
    for source_id, matches in lookalikes.items()
    for match_id, match_score in matches
]

# index=False keeps the DataFrame's positional index out of the file.
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("Lookalike.csv", index=False)
