In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the three source tables (customers, products, transactions), in that
# order, from the "DATA SET" folder.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "DATA SET/Customers.csv",
        "DATA SET/Products.csv",
        "DATA SET/Transactions.csv",
    )
)

In [4]:
# Merge product and customer details onto each transaction row.
#
# The merged frame ends up with "Price_x" and "Price_y" columns: pandas'
# default suffixes for a non-key column name shared by both sides of a merge
# (_x = left frame, _y = right frame).
#
# validate="many_to_one" makes each join fail loudly (pandas MergeError) if a
# ProductID / CustomerID is duplicated in its lookup table, instead of silently
# duplicating transaction rows and inflating the per-customer sums computed
# from this frame.
#
# NOTE: this cell overwrites `transactions`, so it is not idempotent -- re-run
# the data-loading cell before re-running this one.
transactions = (
    transactions
    .merge(products, on="ProductID", how="inner", validate="many_to_one")
    .merge(customers, on="CustomerID", how="inner", validate="many_to_one")
)

In [5]:
# Preview the first rows of the merged frame to sanity-check the join result
# (including the suffixed Price_x / Price_y columns).
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [8]:
# Build one profile row per customer from the merged transaction frame.
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most common value)."""
    return values.mode()[0]


profile_aggregations = {
    "TotalValue": "sum",         # total spending
    "Quantity": "sum",           # total quantity purchased
    "Price_y": "mean",           # average price of purchased products
    "Category": _most_frequent,  # most common product category
    "Region": "first",           # region taken from the customer's first row
}

customer_profiles = (
    transactions
    .groupby("CustomerID")
    .agg(profile_aggregations)
    .reset_index()  # bring CustomerID back as the first column
)

In [10]:
# Expand the categorical columns into indicator columns. drop_first=True omits
# the first level of each column, so that level is represented by all of the
# remaining indicators being zero/False.
categorical_columns = ["Category", "Region"]
customer_profiles_encoded = pd.get_dummies(
    customer_profiles,
    columns=categorical_columns,
    drop_first=True,
)

In [11]:
# Standardize every feature column (numeric aggregates and the one-hot
# indicators alike) with StandardScaler.
scaler = StandardScaler()
feature_frame = customer_profiles_encoded.iloc[:, 1:]  # skip column 0 (CustomerID)
scaled_features = scaler.fit(feature_frame).transform(feature_frame)

In [13]:
# Pairwise cosine similarity between customer feature vectors, wrapped in a
# DataFrame labelled by CustomerID on both axes for lookup by ID.
customer_ids = customer_profiles["CustomerID"]
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [14]:
# Generate lookalike recommendations for the first 20 customers.
N_TARGET_CUSTOMERS = 20  # how many customers (in profile order) to score
TOP_N_LOOKALIKES = 3     # how many similar customers to keep for each one

# Maps each target CustomerID to a list of (lookalike CustomerID, similarity)
# tuples, ordered from most to least similar.
lookalikes = {}
for customer in customer_profiles["CustomerID"].head(N_TARGET_CUSTOMERS):
    # Remove the customer's own entry by label instead of assuming it sorts
    # into position 0: a tie with an identical profile, or floating-point
    # rounding of the self-similarity, can place another customer first, which
    # would return the customer as their own lookalike and drop a real match.
    similar_customers = (
        similarity_df[customer]
        .drop(labels=customer)
        .nlargest(TOP_N_LOOKALIKES)
    )
    # Store scores as plain Python floats so that str() of the list renders
    # bare numbers regardless of how NumPy scalar reprs are formatted.
    lookalikes[customer] = [
        (sim_customer, float(score))
        for sim_customer, score in similar_customers.items()
    ]

In [16]:
# Write one row per target customer to the lookalike CSV; each customer's
# match list is stored as its string representation.
lookalike_rows = [
    (customer_id, str(matches))
    for customer_id, matches in lookalikes.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=["CustomerID", "Lookalikes"])
lookalike_df.to_csv("Mohammed_Junaid_Lookalike.csv", index=False)