In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

In [4]:
# Read the three source tables from the working directory: customer profiles,
# transaction records and the product catalogue.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)


In [5]:
# Attach product attributes to every transaction (joined on ProductID), then
# attach the purchasing customer's attributes (joined on CustomerID).
# Both joins use pandas' default inner join.
merged = pd.merge(
    pd.merge(transactions, products, on="ProductID"),
    customers,
    on="CustomerID",
)


In [6]:
# One-hot encode the Region column of the customer table.
# The resulting dense array has one row per row of `customers`, in the same order.
encoder = OneHotEncoder()
region_sparse = encoder.fit_transform(customers.loc[:, ["Region"]])
encoded_region = region_sparse.toarray()

In [7]:
customer_features = merged.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Category": lambda x: list(x) if isinstance(x, list) else []
}).reset_index()

In [8]:
region_df = pd.DataFrame(encoded_region, columns=encoder.get_feature_names_out(["Region"]))
customer_features = pd.concat([customer_features, region_df], axis=1)

In [9]:
category_encoder = OneHotEncoder()
category_encoded = category_encoder.fit_transform(
    customer_features["Category"].apply(lambda x: x if isinstance(x, list) else []).explode().to_frame()
).toarray()

In [14]:
# Build the per-customer feature matrix used for the similarity computation.
# Spend and quantity totals are min-max scaled to [0, 1]; missing totals count as 0.
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(customer_features[["TotalValue", "Quantity"]].fillna(0))

# The one-hot Region columns are attached to customer_features but were never
# placed in the matrix, so region had no influence on similarity. Include them.
region_features = (
    customer_features[list(encoder.get_feature_names_out(["Region"]))]
    .fillna(0)
    .to_numpy()
)

customer_matrix = np.hstack((
    numeric_features,
    region_features,
    category_encoded
))


In [15]:
similarity_matrix = cosine_similarity(customer_matrix)

top_lookalikes = {}
customer_ids = customers["CustomerID"][:20]

In [16]:
for idx, customer_id in enumerate(customer_ids):
    similar_indices = np.argsort(-similarity_matrix[idx])[1:4]
    top_lookalikes[customer_id] = [
        (customers.iloc[sim_idx]["CustomerID"], similarity_matrix[idx, sim_idx])
        for sim_idx in similar_indices
    ]


In [19]:
# Reshape the {customer_id: [(lookalike_id, score), ...]} mapping into two
# parallel columns and write it out as a CSV without the index column.
lookalike_data = {
    "CustomerID": list(top_lookalikes.keys()),
    "Lookalikes": list(top_lookalikes.values()),
}

lookalike_df = pd.DataFrame(lookalike_data)
output_file = "Yash_Sapkal_Lookalike.csv"
lookalike_df.to_csv(output_file, index=False)

# Display the name of the file that was written.
output_file

'Yash_Sapkal_Lookalike.csv'