In [6]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): with no category/module argument this filter is global, so it
# also hides pandas and scikit-learn diagnostics raised by later cells —
# consider narrowing it to the specific warnings that are known to be benign.
warnings.filterwarnings('ignore')

In [11]:
# Load the customer and product tables from their CSV files
# (both paths are relative to the current working directory).
customers_df, products_df = (
    pd.read_csv(csv_name) for csv_name in ('Customers.csv', 'Products.csv')
)


In [12]:
# Show a heading followed by the first rows of the customer table.
print("Customers Data:", customers_df.head(), sep="\n")

Customers Data:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


In [13]:
# Show a heading followed by the first rows of the product table.
print("Products Data:", products_df.head(), sep="\n")

Products Data:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


In [14]:
# Preprocessing the data
from sklearn.preprocessing import LabelEncoder

In [15]:
# Map each Region label to an integer code; the fitted encoder stays bound to
# le_region so the mapping can be inspected or reversed later.
# NOTE(review): the integer codes put the regions on an arbitrary numeric
# scale — confirm this is acceptable for a distance-based comparison.
le_region = LabelEncoder()
le_region.fit(customers_df['Region'])
customers_df['Region_Encoded'] = le_region.transform(customers_df['Region'])

In [16]:
# Derive a numeric tenure feature: whole days elapsed between signup and now.
# The reference point is the wall clock at execution time, so the raw values
# in DaysSinceSignup differ from one run to the next.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
account_age = datetime.now() - customers_df['SignupDate']
customers_df['DaysSinceSignup'] = account_age.dt.days

In [17]:
# Keep only the identifier and the two engineered columns used for modeling.
# .copy() makes this an independent frame: later cells overwrite and add
# columns on customers_features, and doing that on a bare column selection of
# customers_df makes pandas emit SettingWithCopyWarning because it cannot tell
# whether the parent frame was meant to be modified.
customers_features = customers_df[['CustomerID', 'Region_Encoded', 'DaysSinceSignup']].copy()


In [18]:
# Standardize the two numeric columns (StandardScaler: zero mean, unit
# variance) and write the scaled values back into customers_features.
numeric_cols = ['Region_Encoded', 'DaysSinceSignup']
scaler = StandardScaler()
customers_features[numeric_cols] = scaler.fit_transform(customers_features[numeric_cols])

In [19]:
# Show a heading followed by the first rows of the scaled customer features.
print("Processed Customer Features:", customers_features.head(), sep="\n")

Processed Customer Features:
  CustomerID  Region_Encoded  DaysSinceSignup
0      C0001        1.241384         1.152884
1      C0002       -1.409258         1.605593
2      C0003        1.241384        -0.713387
3      C0004        1.241384         0.872636
4      C0005       -1.409258         1.042017


In [20]:
# Bring product data into the feature set.
# This is mock data standing in for a real transaction history: each customer
# is given one randomly drawn product category in the next step.
# Collect the distinct categories to draw from, and seed NumPy's global RNG so
# the draw is repeatable when the cells are run in order.
categories = products_df['Category'].unique()
np.random.seed(42)

In [21]:
# Draw one category per customer from `categories` using NumPy's global RNG
# (np.random.choice defaults: uniform probabilities, with replacement).
n_customers = len(customers_features)
customers_features['ProductCategory'] = np.random.choice(categories, size=n_customers)


In [22]:
# Turn the assigned category labels into integer codes; the fitted encoder
# stays bound to le_category.
le_category = LabelEncoder()
le_category.fit(customers_features['ProductCategory'])
customers_features['ProductCategory_Encoded'] = le_category.transform(customers_features['ProductCategory'])

In [23]:
# Select the numeric columns that make up each customer's similarity vector.
# NOTE(review): ProductCategory_Encoded is a raw integer label while the other
# two columns were standardized — confirm this relative weighting is intended.
similarity_columns = ['Region_Encoded', 'DaysSinceSignup', 'ProductCategory_Encoded']
final_features = customers_features[similarity_columns]

In [24]:
# Show a heading followed by the first rows of the similarity feature matrix.
print("Final Features for Similarity Calculation:", final_features.head(), sep="\n")

Final Features for Similarity Calculation:
   Region_Encoded  DaysSinceSignup  ProductCategory_Encoded
0        1.241384         1.152884                        3
1       -1.409258         1.605593                        1
2        1.241384        -0.713387                        0
3        1.241384         0.872636                        3
4       -1.409258         1.042017                        3


In [25]:
# Compute similarity scores using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# Pairwise cosine similarity between all customer feature vectors.
# Passing only X compares the rows of final_features against each other, so
# entry [i, j] is the similarity between customer row i and customer row j.
similarity_matrix = cosine_similarity(X=final_features)

In [27]:
# Generate the top 3 lookalike recommendations for the first 20 customers.
# Result: lookalike_map[customer_id] -> list of (lookalike_id, score) tuples,
# ordered from most to least similar.
customer_ids = customers_features['CustomerID'].tolist()  # read once instead of one .iloc lookup per candidate
n_targets = min(20, len(customer_ids))  # avoid IndexError when there are fewer than 20 customers
lookalike_map = {}
for i in range(n_targets):
    customer_id = customer_ids[i]
    # Drop the target by its row position. Relying on "self sorts first" is
    # unsafe: any customer whose feature vector is proportional to the target's
    # ties with the self-similarity, and because sorted() is stable an
    # earlier-indexed tie would be ranked ahead of the target, leaving the
    # target itself inside the top 3.
    similarities = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    similarities.sort(key=lambda pair: pair[1], reverse=True)  # highest similarity first
    # float() stores plain Python floats so the tuples serialize as bare
    # numbers rather than NumPy scalar reprs when written to CSV.
    top_3 = [(customer_ids[j], float(score)) for j, score in similarities[:3]]
    lookalike_map[customer_id] = top_3

In [28]:
# Reshape the mapping into an export table: one row per customer, with the
# customer ID as a regular column followed by the three lookalike columns.
lookalike_df = (
    pd.DataFrame.from_dict(
        lookalike_map,
        orient='index',
        columns=['Lookalike1', 'Lookalike2', 'Lookalike3'],
    )
    .reset_index()
    .rename(columns={'index': 'CustomerID'})
)

In [29]:
# Save to CSV without the index column. The file name carries a single .csv
# extension so it matches the name reported by the confirmation message.
lookalike_df.to_csv('Paras_Agrawal_Lookalike.csv', index=False)

In [30]:
# Confirm the export and show the first rows of the recommendation table.
print(
    "Lookalike recommendations generated and saved to Paras_Agrawal_Lookalike.csv",
    lookalike_df.head(),
    sep="\n",
)

Lookalike recommendations generated and saved to Paras_Agrawal_Lookalike.csv
  CustomerID                   Lookalike1                   Lookalike2  \
0      C0001  (C0112, 0.9999968128364202)  (C0188, 0.9989996288742979)   
1      C0002   (C0045, 0.999932712895011)  (C0175, 0.9980364994060891)   
2      C0003  (C0137, 0.9982380713526587)  (C0031, 0.9980268400125786)   
3      C0004  (C0192, 0.9997772380452074)  (C0011, 0.9983514186268675)   
4      C0005  (C0173, 0.9990075248284113)  (C0141, 0.9962975016258887)   

                    Lookalike3  
0  (C0147, 0.9989629029048788)  
1  (C0198, 0.9343632723650651)  
2  (C0032, 0.9973284031694074)  
3  (C0168, 0.9982988110301299)  
4   (C0161, 0.992266070017548)  
