In [1]:
# Lookalike Model: Recommending Similar Customers

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
customers = pd.read_csv("Customers.csv")

In [3]:
customers.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [4]:
transaction_data = pd.read_csv("Transactions.csv")
transaction_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [6]:
merged_customer_data = pd.merge(transaction_data, customers, on = 'CustomerID', how = "inner")

In [7]:
merged_customer_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04


In [13]:
features = merged_customer_data[[ 'Price', 'Quantity', 'TotalValue']]
features.head()

Unnamed: 0,Price,Quantity,TotalValue
0,300.68,1,300.68
1,137.54,4,550.16
2,417.37,2,834.74
3,146.85,2,293.7
4,300.68,1,300.68


In [14]:
# Normalize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

In [15]:
# Step 3: Compute Similarity
# Compute pairwise cosine similarity between customers
similarity_matrix = cosine_similarity(features_scaled)

In [29]:
# Step 4: Find Top 3 Lookalikes
lookalike_map = {}

for i, customer_id in enumerate(customers['CustomerID']):
    print(f"Processing customer {customer_id} ({i}/{len(customers)})")
    # Get similarity scores for customer i
    similarities = similarity_matrix[i]

    similar_customers = []
    for j, score in enumerate(similarities):
        if j != i:
            try:
                similar_customer_id = customers['CustomerID'].iloc[j]  # Check if this goes out of bounds
                similar_customers.append((similar_customer_id, score))
            except IndexError:
                print(f"IndexError at j={j} for customer {customer_id}")
                continue

    # Sort and pick top 3 lookalikes
    similar_customers = sorted(similar_customers, key=lambda x: x[1], reverse=True)[:3]
    
    lookalike_map[customer_id] = similar_customers
    if j < len(customers):  # Only proceed if j is within the bounds of the customers DataFrame
        similar_customer_id = customers['CustomerID'].iloc[j]


Processing customer C0001 (0/200)
Processing customer C0002 (1/200)
Processing customer C0003 (2/200)
Processing customer C0004 (3/200)
Processing customer C0005 (4/200)
Processing customer C0006 (5/200)
Processing customer C0007 (6/200)
Processing customer C0008 (7/200)
Processing customer C0009 (8/200)
Processing customer C0010 (9/200)
Processing customer C0011 (10/200)
Processing customer C0012 (11/200)
Processing customer C0013 (12/200)
Processing customer C0014 (13/200)
Processing customer C0015 (14/200)
Processing customer C0016 (15/200)
Processing customer C0017 (16/200)
Processing customer C0018 (17/200)
Processing customer C0019 (18/200)
Processing customer C0020 (19/200)
Processing customer C0021 (20/200)
Processing customer C0022 (21/200)
Processing customer C0023 (22/200)
Processing customer C0024 (23/200)
Processing customer C0025 (24/200)
Processing customer C0026 (25/200)
Processing customer C0027 (26/200)
Processing customer C0028 (27/200)
Processing customer C0029 (28/

In [30]:
lookalike_df = pd.DataFrame([
    {'cust_id': cust_id, 'lookalikes': lookalikes}
    for cust_id, lookalikes in lookalike_map.items()
])
lookalike_df.to_csv('Lookalike.csv', index=False)