In [4]:
import pandas as pd

# Read the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Attach customer and product attributes to every transaction row.
# Both joins are inner joins (the pandas default), so a transaction whose
# CustomerID or ProductID has no match in the lookup table is dropped.
merged_data = (
    transactions
    .merge(customers, how='inner', on='CustomerID')
    .merge(products, how='inner', on='ProductID')
)

In [6]:
# Sum of TotalValue over each customer's transactions, one row per customer.
total_spend = (
    merged_data
    .groupby('CustomerID')
    .agg(TotalSpend=('TotalValue', 'sum'))
    .reset_index()
)

In [7]:
# Count of (non-null) TransactionID values per customer, one row per customer.
num_transactions = (
    merged_data
    .groupby('CustomerID')
    .agg(NumTransactions=('TransactionID', 'count'))
    .reset_index()
)

In [8]:
# Mean TotalValue across each customer's transactions, one row per customer.
avg_transaction_value = (
    merged_data
    .groupby('CustomerID')
    .agg(AvgTransactionValue=('TotalValue', 'mean'))
    .reset_index()
)

In [9]:
# Customer x Category table of transaction counts. Rows are indexed by
# CustomerID, there is one column per Category value, and customer/category
# pairs with no transactions are filled with 0.
category_pivot = (
    merged_data
    .groupby(['CustomerID', 'Category'])['TransactionID']
    .count()
    .unstack(level='Category', fill_value=0)
)

In [10]:
# One-hot encode Region into Region_<value> indicator columns and keep
# CustomerID alongside them so the result can be merged by customer.
region_indicators = pd.get_dummies(customers['Region'], prefix='Region')
region_pivot = customers[['CustomerID']].join(region_indicators)

In [11]:
# Parse SignupDate into datetimes and store it back on the customers table.
signup_dates = pd.to_datetime(customers['SignupDate'])
customers['SignupDate'] = signup_dates

# Account age in whole days, measured against the clock at the moment this
# cell runs (pd.Timestamp.now()), so the raw values depend on execution time.
account_age = pd.Timestamp.now() - signup_dates
customers['DaysSinceSignup'] = account_age.dt.days

In [12]:
# Assemble the per-customer feature table by joining every feature set on
# CustomerID. Each step is an inner join (the pandas default), so only
# customers present in all of the inputs remain in the result.
customer_features = (
    total_spend
    .merge(num_transactions, on='CustomerID')
    .merge(avg_transaction_value, on='CustomerID')
    .merge(category_pivot, on='CustomerID')  # CustomerID is this table's index
    .merge(region_pivot, on='CustomerID')
    .merge(customers[['CustomerID', 'DaysSinceSignup']], on='CustomerID')
)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every feature column; CustomerID is an identifier, not a
# feature, so it is held out of the scaler and re-attached afterwards.
scaler = MinMaxScaler()
feature_matrix = customer_features.drop(columns=['CustomerID'])
scaled_features = pd.DataFrame(
    scaler.fit_transform(feature_matrix),
    columns=feature_matrix.columns,
).assign(CustomerID=customer_features['CustomerID'])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between customers' scaled feature vectors
# (the CustomerID column is excluded from the vectors).
feature_vectors = scaled_features.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_vectors)

# Label both axes with customer IDs so a score can be looked up by ID pair.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [15]:
# Maps each CustomerID to a list of (LookalikeCustomerID, SimilarityScore)
# tuples for its three most similar other customers, best match first.
lookalike_map = {}

for customer in customer_features['CustomerID']:
    # Rank every customer by similarity to this one, highest score first.
    ranked = similarity_df[customer].sort_values(ascending=False)
    # Drop the customer's own entry so it is never its own lookalike.
    others = ranked.loc[ranked.index != customer]
    # Keep the three best matches as (id, score) pairs.
    lookalike_map[customer] = list(others.iloc[:3].items())

In [16]:
import csv

# Write the lookalike map to Lookalike.csv in long format: a header row, then
# one row per (customer, lookalike) pair with its similarity score.
with open('Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
    writer.writerows(
        [customer_id, lookalike_id, similarity]
        for customer_id, matches in lookalike_map.items()
        for lookalike_id, similarity in matches
    )

In [17]:
# Print a short report for the first 20 customers in the feature table:
# the customer ID followed by its lookalikes and their scores (4 decimals).
for customer in customer_features['CustomerID'].head(20):
    report_lines = [f"Customer: {customer}", "Top 3 Lookalikes:"]
    report_lines.extend(
        f"  - {lookalike}: {score:.4f}"
        for lookalike, score in lookalike_map[customer]
    )
    print("\n".join(report_lines))
    # Blank line separating consecutive customers.
    print()

Customer: C0001
Top 3 Lookalikes:
  - C0112: 0.9777
  - C0192: 0.9738
  - C0118: 0.9720

Customer: C0002
Top 3 Lookalikes:
  - C0134: 0.9906
  - C0159: 0.9783
  - C0106: 0.9767

Customer: C0003
Top 3 Lookalikes:
  - C0031: 0.9929
  - C0129: 0.9859
  - C0158: 0.9728

Customer: C0004
Top 3 Lookalikes:
  - C0113: 0.9852
  - C0104: 0.9714
  - C0174: 0.9604

Customer: C0005
Top 3 Lookalikes:
  - C0007: 0.9959
  - C0140: 0.9562
  - C0186: 0.9489

Customer: C0006
Top 3 Lookalikes:
  - C0187: 0.9797
  - C0085: 0.9494
  - C0169: 0.9491

Customer: C0007
Top 3 Lookalikes:
  - C0005: 0.9959
  - C0140: 0.9657
  - C0159: 0.9468

Customer: C0008
Top 3 Lookalikes:
  - C0098: 0.9543
  - C0194: 0.9524
  - C0059: 0.9469

Customer: C0009
Top 3 Lookalikes:
  - C0062: 0.9573
  - C0010: 0.9572
  - C0061: 0.9406

Customer: C0010
Top 3 Lookalikes:
  - C0061: 0.9733
  - C0062: 0.9692
  - C0009: 0.9572

Customer: C0011
Top 3 Lookalikes:
  - C0174: 0.9811
  - C0153: 0.9792
  - C0169: 0.9724

Customer: C0012
Top 3