# Implementation of Lookalike Model

In [21]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.preprocessing import StandardScaler

# loading the datasets

In [23]:
customers_df = pd.read_csv('Customers.csv')
products_df = pd.read_csv('Products.csv')

# Data from customers and products file

In [28]:
print(customers_df.head())
print(products_df.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


# Data preprocessing and further for Model Development

In [31]:
#       Preprocessing 
label_encoder = LabelEncoder()
customers_df['Region_encoded'] = label_encoder.fit_transform(customers_df['Region'])

customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate']) #feature engeenering
customers_df['DaysSinceSignup'] = (pd.to_datetime('today') - customers_df['SignupDate']).dt.days

products_df['Category_encoded'] = label_encoder.fit_transform(products_df['Category'])

import random
transaction_data = []

for customer_id in customers_df['CustomerID']:
    # Randomly select 3 products
    selected_products = random.sample(list(products_df['ProductID']), 3)
    for product_id in selected_products:
        # Randomly assigning price
        product_price = products_df[products_df['ProductID'] == product_id]['Price'].values[0]
        transaction_data.append([customer_id, product_id, product_price])

#   DataFrame
transaction_df = pd.DataFrame(transaction_data, columns=['CustomerID', 'ProductID', 'Price'])


customer_product_matrix = transaction_df.pivot_table(index='CustomerID', columns='ProductID', values='Price', aggfunc='sum', fill_value=0)

#  Normalize bu using Standardscaler
scaler = StandardScaler()
normalized_matrix = scaler.fit_transform(customer_product_matrix)

similarity_matrix = cosine_similarity(normalized_matrix)

In [35]:
#       Get top 3 lookalikes 
top_lookalikes = {}

for customer_idx in range(20):  # First 20 customers 
    customer_id = customer_product_matrix.index[customer_idx]
    similarity_scores = similarity_matrix[customer_idx]
    
    
    similar_customer_indices = np.argsort(similarity_scores)[-4:-1]  
    top_3_similar_customers = [(customer_product_matrix.index[i], similarity_scores[i]) for i in similar_customer_indices]
    
    # Store
    top_lookalikes[customer_id] = top_3_similar_customers


lookalike_map = []
for customer_id, lookalikes in top_lookalikes.items():
    lookalike_list = [(similar_customer_id, round(score, 6)) for similar_customer_id, score in lookalikes]
    lookalike_map.append([customer_id, lookalike_list])

lookalike_df = pd.DataFrame(lookalike_map, columns=['CustomerID', 'Lookalikes'])


lookalike_df.to_csv('Lookalike.csv', index=False)


print(lookalike_df.head(20))


   CustomerID                                         Lookalikes
0       C0001  [(C0033, 0.318957), (C0182, 0.377296), (C0008,...
1       C0002  [(C0100, 0.327064), (C0006, 0.333188), (C0157,...
2       C0003  [(C0076, 0.404669), (C0014, 0.407113), (C0157,...
3       C0004  [(C0029, 0.324124), (C0047, 0.324124), (C0145,...
4       C0005  [(C0154, 0.315909), (C0091, 0.490029), (C0043,...
5       C0006  [(C0176, 0.379557), (C0038, 0.433594), (C0031,...
6       C0007  [(C0167, 0.310021), (C0127, 0.316288), (C0134,...
7       C0008  [(C0033, 0.318957), (C0182, 0.377296), (C0001,...
8       C0009  [(C0012, 0.376185), (C0092, 0.397617), (C0199,...
9       C0010  [(C0048, 0.408533), (C0176, 0.409332), (C0057,...
10      C0011  [(C0110, 0.343648), (C0075, 0.390167), (C0171,...
11      C0012  [(C0009, 0.376185), (C0097, 0.4215), (C0173, 0...
12      C0013  [(C0164, 0.378703), (C0163, 0.394501), (C0086,...
13      C0014  [(C0076, 0.394501), (C0157, 0.401071), (C0003,...
14      C0015  [(C0070, 0