In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets (read from the current working directory)
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")  # loaded, but not used by the features below
transactions = pd.read_csv("Transactions.csv")

# Output-size knobs: how many customers receive recommendations, and how many each
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

# Feature Engineering
# Aggregate transaction features per customer. Named aggregation yields flat,
# explicitly named columns, so no manual renaming of MultiIndex columns is needed.
# Only customers that appear in the transactions table get a row here.
customer_features = (
    transactions.groupby("CustomerID")
    .agg(
        AvgTransactionValue=("TotalValue", "mean"),  # Avg transaction value
        TotalSpent=("TotalValue", "sum"),            # Total transaction value
        TotalQuantity=("Quantity", "sum"),           # Total quantity purchased
    )
    .reset_index()
)

# Add region from customers data
# NOTE(review): assumes CustomerID is unique in Customers.csv; duplicates would
# duplicate feature rows after this merge - confirm against the data.
customer_features = customer_features.merge(
    customers[["CustomerID", "Region"]], on="CustomerID", how="left"
)

# Encode region (convert categorical to numerical); the first level is dropped
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)

# Normalize features for similarity calculation
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=["CustomerID"]))

# Calculate cosine similarity (row i / column j <-> customer_ids[i] / customer_ids[j])
similarity_matrix = cosine_similarity(scaled_features)

# Create the Lookalike Model
# For each target customer, keep the N_LOOKALIKES most similar *other* customers
lookalike_map = {}
customer_ids = customer_features["CustomerID"].tolist()

for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Exclude the customer's own row by index. Relying on "self sorts first" breaks
    # when another customer has an identical feature vector or when floating-point
    # rounding puts a different row at the top of the ranking.
    candidates = [(j, score) for j, score in enumerate(similarity_matrix[i]) if j != i]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # float(...) converts the NumPy scalar to a built-in float so that str() below
    # writes "0.9957" and not "np.float64(0.9957)" under NumPy >= 2.
    lookalike_map[customer_id] = [
        (customer_ids[j], round(float(score), 4))
        for j, score in candidates[:N_LOOKALIKES]
    ]

# Convert lookalike_map to a DataFrame (one row per target customer; the
# recommendations are stored as the string form of a list of (id, score) tuples)
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": str(matches)}
    for cust_id, matches in lookalike_map.items()
])

# Save to Lookalike.csv
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Model results saved to Lookalike.csv.")


Lookalike Model results saved to Lookalike.csv.


In [12]:
# Upload the input CSV files into the Colab working directory.
# NOTE: this cell has to be executed before the cell that calls pd.read_csv on
# Customers.csv / Products.csv / Transactions.csv, since those reads expect the
# files to be present in the working directory.
from google.colab import files
uploaded = files.upload()  # keeps the value returned by the upload call


Saving Transactions.csv to Transactions.csv
Saving Products.csv to Products.csv
Saving Customers.csv to Customers.csv


In [16]:
import os

# Sanity check: list the entries of the current working directory
# (e.g. to confirm the input CSVs and the generated Lookalike.csv are there).
print(os.listdir(os.curdir))


['.config', 'Transactions.csv', 'Business_Insights.pdf', 'Customers.csv', 'drive', 'Products.csv', 'Lookalike.csv', 'sample_data']


In [17]:

# Preview the first five rows of the lookalike table as plain text
print(lookalike_df.head(5))


  CustomerID                                         Lookalikes
0      C0001  [('C0137', 0.9957), ('C0107', 0.988), ('C0152'...
1      C0002  [('C0088', 0.9919), ('C0142', 0.9753), ('C0043...
2      C0003  [('C0190', 0.9773), ('C0001', 0.9641), ('C0133...
3      C0004  [('C0113', 0.9872), ('C0102', 0.9674), ('C0165...
4      C0005  [('C0159', 0.9959), ('C0123', 0.9814), ('C0146...
