In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the three raw tables (customers, products, transactions) from the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [47]:
# Parse signup dates and derive a year-month period column on the customer table.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
customers['SignupYearMonth'] = customers['SignupDate'].dt.to_period('M')

# Attach the buyer's region and the product's category to every transaction.
transaction_details = (
    transactions
    .merge(customers[['CustomerID', 'Region']], on='CustomerID', how='left')
    .merge(products[['ProductID', 'Category']], on='ProductID', how='left')
)

# One row per customer: total quantity, total spend, number of distinct
# categories bought, and the (first seen) region.
customer_profile = (
    transaction_details
    .groupby('CustomerID')
    .agg(
        Quantity=('Quantity', 'sum'),
        TotalValue=('TotalValue', 'sum'),
        Category=('Category', pd.Series.nunique),
        Region=('Region', 'first'),
    )
    .reset_index()
)

# One-hot encode the region; the first level is dropped and acts as the baseline.
customer_profile = pd.get_dummies(customer_profile, columns=['Region'], drop_first=True)

# Standardise the numeric features so no single one dominates the similarity.
numerical_features = ['Quantity', 'TotalValue', 'Category']
scaler = StandardScaler()
customer_profile[numerical_features] = scaler.fit_transform(customer_profile[numerical_features])

def find_lookalikes(customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Similarity is the cosine similarity between rows of the module-level
    ``customer_profile`` frame, computed over every column except
    ``CustomerID``.

    Parameters
    ----------
    customer_id : str
        Value to look up in ``customer_profile['CustomerID']``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``SimilarityScore`` and ``CustomerID``, sorted by descending
        score. The index holds each match's row position in
        ``customer_profile``. The queried customer is never included.

    Raises
    ------
    ValueError
        If ``customer_id`` has no row in ``customer_profile``.
    """
    feature_matrix = customer_profile.drop('CustomerID', axis=1)
    is_target = (customer_profile['CustomerID'] == customer_id).to_numpy()
    if not is_target.any():
        raise ValueError(f"Customer {customer_id!r} not found in customer_profile")

    similarity_scores = cosine_similarity(feature_matrix.loc[is_target], feature_matrix)
    # Attach IDs positionally (plain arrays) so the result does not depend on
    # how customer_profile happens to be indexed.
    similarity_df = pd.DataFrame({
        'SimilarityScore': similarity_scores[0],
        'CustomerID': customer_profile['CustomerID'].to_numpy(),
    })

    # Drop the target by ID rather than assuming it sorts first: another
    # customer can tie with (or marginally exceed, through rounding) the
    # self-similarity of 1.0, in which case slicing off row 0 would discard a
    # real lookalike and return the target itself.
    others = similarity_df[similarity_df['CustomerID'] != customer_id]
    similar_customers = others.sort_values(by='SimilarityScore', ascending=False).head(top_n)

    return similar_customers


In [42]:
# Interactive lookup: ask for one customer ID and print its closest lookalikes.
print("Customer id should be in range C0001 to C0020 only")
customer_id = input("Enter customer Id: ")
# Check length and digit-ness before indexing/converting so malformed input
# (empty string, letters) reaches the error message instead of raising, and
# validate all four digits so IDs such as C0112 or C0000 are rejected.
is_valid_id = (
    len(customer_id) == 5
    and customer_id[0] == 'C'
    and customer_id[1:].isascii()
    and customer_id[1:].isdigit()
    and 1 <= int(customer_id[1:]) <= 20
)
if not is_valid_id:
  print("Please enter Right customer id")
elif customer_id not in customer_profile['CustomerID'].values:
  # A well-formed ID may still have no profile row (customer_profile is built
  # from transactions only).
  print(f"Customer {customer_id} not found in the dataset.")
else:
  lookalikes = find_lookalikes(customer_id)
  print(f"Top 3 lookalikes for customer {customer_id} are:")
  print("----------------------------------------------")
  print(lookalikes)

Customer id should be in range C0001 to C0020 only
Enter customer Id: C0012
Top 3 lookalikes for customer C0012 are:
----------------------------------------------
     SimilarityScore CustomerID
147         0.992978      C0148
154         0.982328      C0155
86          0.975366      C0087


In [52]:
# Build the lookalike table for customers C0001..C0020 and write it to Lookalike.csv.
lookalike_map = {}
for customer_number in range(1, 21):
    customer_id = f"C{customer_number:04d}"
    # Customers without a profile row are reported and skipped.
    if customer_id not in customer_profile['CustomerID'].values:
        print(f"Customer {customer_id} not found in the dataset.")
        continue
    similar_customers = find_lookalikes(customer_id)
    lookalike_map[customer_id] = [
        {"CustomerID": match_id, "Score": match_score}
        for match_id, match_score in zip(
            similar_customers["CustomerID"], similar_customers["SimilarityScore"]
        )
    ]

# Flatten each customer's matches into a single "ID, score; ID, score; ..." string.
output_data = []
for customer_id, lookalikes in lookalike_map.items():
    formatted_matches = [
        f"{entry['CustomerID']}, {entry['Score']:.2f}" for entry in lookalikes
    ]
    output_data.append(
        {"CustomerID": customer_id, "Lookalikes": "; ".join(formatted_matches)}
    )

lookalike_df = pd.DataFrame(output_data)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv has been created successfully.")


Lookalike.csv has been created successfully.
