Task 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist
from datetime import datetime

In [2]:
customers = pd.read_csv('/kaggle/input/zeotap-assignment-dataset/Customers.csv')
products = pd.read_csv('/kaggle/input/zeotap-assignment-dataset/Products.csv')
transactions = pd.read_csv('/kaggle/input/zeotap-assignment-dataset/Transactions.csv')

In [3]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'], errors='coerce')
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'], errors='coerce')
products['Price'].fillna(products['Price'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  products['Price'].fillna(products['Price'].mean(), inplace=True)


In [4]:
trans_agg = transactions.groupby('CustomerID').agg({'TotalValue':['sum','mean','count']})
trans_agg.columns = ['total_spend','avg_spend','purchase_count']
trans_agg.reset_index(inplace=True)
merged_df = pd.merge(transactions, products, on='ProductID', how='left')
cat_agg = merged_df.groupby(['CustomerID','Category'])['Quantity'].sum().unstack().fillna(0)


In [5]:
cust_features = pd.merge(customers, trans_agg, on='CustomerID', how='left')
cust_features = pd.merge(cust_features, cat_agg, on='CustomerID', how='left')

In [6]:
cust_features['SignupYear'] = pd.to_datetime(cust_features['SignupDate'], errors='coerce').dt.year
dummies_region = pd.get_dummies(cust_features['Region'], prefix='Region')
cust_features = pd.concat([cust_features, dummies_region], axis=1)
cust_features.fillna(0, inplace=True)

In [7]:
excluded = ['CustomerID','CustomerName','SignupDate','Region']
final_cols = [c for c in cust_features.columns if c not in excluded]

if len(cust_features) < 4:
    print("Not enough customers to generate top-3 lookalikes.")
    exit()

X = cust_features[final_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


also can use the cosinesimilarity from sklearn.metrics.pairwise, but i used to define by myself although the results are same

In [12]:

def cosine_dist(mat):
    dot = np.dot(mat, mat.T)
    norms = np.linalg.norm(mat, axis=1)
    denom = np.outer(norms, norms)
    return 1 - (dot / (denom + 1e-10))

def get_neighbors(dist_mat, idx, top_n=3):
    arr = dist_mat[idx].copy()
    arr[idx] = np.inf
    return np.argsort(arr)[:top_n]

metrics = ['euclidean','cityblock']  
results_all = {}
avg_dists = {}

for m in metrics:
    dist_m = cdist(X_scaled, X_scaled, metric=m)
    total, count = 0, 0
    map_m = {}
    for i, cid in enumerate(cust_features['CustomerID']):
        neighbors = get_neighbors(dist_m, i, 3)
        neighbor_list = []
        for n in neighbors:
            neighbor_list.append((cust_features.loc[n,'CustomerID'], round(dist_m[i][n],4)))
            total += dist_m[i][n]
            count += 1
        map_m[cid] = neighbor_list
    avg_dists[m] = total/count if count else np.inf
    results_all[m] = map_m

cosine_matrix = cosine_dist(X_scaled)
cosine_map = {}
cosine_total, cosine_count = 0, 0
for i, cid in enumerate(cust_features['CustomerID']):
    nbrs = get_neighbors(cosine_matrix, i, 3)
    nb_list = []
    for n in nbrs:
        # For cosine, a smaller distance => higher similarity
        # We can convert distance to similarity = 1 - distance if we want a "similarity score"
        sim_score = 1 - cosine_matrix[i][n]
        nb_list.append((cust_features.loc[n,'CustomerID'], round(sim_score,4)))
        cosine_total += cosine_matrix[i][n]
        cosine_count += 1
    cosine_map[cid] = nb_list

avg_dists['cosine'] = (cosine_total / cosine_count) if cosine_count else np.inf
results_all['cosine'] = cosine_map

best_metric = min(avg_dists, key=avg_dists.get)
best_results = results_all[best_metric]



In [15]:

final_map = {}
for cid in cust_features['CustomerID']:
    if cid in [f"C{str(i).zfill(4)}" for i in range(1,21)]:
        final_map[cid] = best_results[cid]

rows = []
for cid in final_map:
    rows.append({'cust_id': cid, 'lookalikes': str(final_map[cid])})

lookalike_df = pd.DataFrame(rows)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Metrics tried:", metrics + ['cosine'])
print("Average distances (lower is better):", avg_dists)
print("Best metric chosen:", best_metric)
print("First 5 rows of final lookalikes:")
print(lookalike_df.head(20))

Metrics tried: ['euclidean', 'cityblock', 'cosine']
Average distances (lower is better): {'euclidean': 1.9396338833518807, 'cityblock': 4.08392282038032, 'cosine': 0.16540163280989106}
Best metric chosen: cosine
First 5 rows of final lookalikes:
   cust_id                                         lookalikes
0    C0001  [('C0184', 0.8153), ('C0152', 0.8153), ('C0192...
1    C0002  [('C0159', 0.94), ('C0134', 0.8803), ('C0106',...
2    C0003  [('C0031', 0.8934), ('C0195', 0.8681), ('C0129...
3    C0004  [('C0113', 0.9274), ('C0104', 0.7778), ('C0012...
4    C0005  [('C0007', 0.9616), ('C0159', 0.8451), ('C0140...
5    C0006  [('C0187', 0.889), ('C0048', 0.8019), ('C0158'...
6    C0007  [('C0005', 0.9616), ('C0140', 0.8655), ('C0159...
7    C0008  [('C0098', 0.8293), ('C0194', 0.8275), ('C0046...
8    C0009  [('C0198', 0.9178), ('C0060', 0.9142), ('C0119...
9    C0010  [('C0062', 0.843), ('C0135', 0.722), ('C0198',...
10   C0011  [('C0153', 0.8362), ('C0118', 0.8249), ('C0107...
11   C0012