In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [3]:
# Read the three input tables (customers, products, transactions) from the
# current working directory, in that order.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

## Step 2: Lookalike Model

In [5]:
# Collapse the transaction log to one row per customer holding the summed
# TotalValue and Quantity. Only customers that appear in `transactions`
# receive a row.
customer_features = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
    )
    .reset_index()
)

In [6]:
# Standardize the two aggregate columns so that neither dominates the
# similarity computation purely because of its scale.
feature_matrix = customer_features[['TotalValue', 'Quantity']]
scaler = StandardScaler()
normalized_features = scaler.fit_transform(feature_matrix)

In [7]:
# Build the lookalike records: for every customer, the three most similar
# *other* customers by cosine similarity of the normalized features.
#
# Inputs (defined in earlier cells):
# - normalized_features: scaled feature matrix, one row per customer
# - customer_features: DataFrame with a 'CustomerID' column whose row order
#   matches the rows of normalized_features
#
# Output:
# - lookalike_data: list with one dict per customer, holding 'CustomerID' plus
#   'SimilarCustomer{k}' / 'Score{k}' for k = 1..3 (fewer if there are fewer
#   than four customers)

# 1. Calculate Similarity Matrix
similarity_matrix = cosine_similarity(normalized_features)

# Rank on a copy whose diagonal is masked out, so a customer can never be
# reported as their own lookalike. Skipping position 0 of the sorted row is
# not safe: another customer can tie with the self-similarity (or exceed it
# through floating-point rounding), and a customer whose normalized feature
# vector is all zeros has a self-similarity of 0. In those cases the customer
# is not guaranteed to sort first, and slicing [1:4] would drop a genuine
# match and could list the customer against themselves.
ranking_scores = similarity_matrix.copy()
np.fill_diagonal(ranking_scores, -np.inf)

# Pull the IDs out once instead of materializing a row Series per lookup.
customer_ids = customer_features['CustomerID'].to_numpy()

# With fewer than four customers there are fewer than three candidates.
n_lookalikes = min(3, max(ranking_scores.shape[0] - 1, 0))

# 2. Find Top 3 Similar Customers for Each Customer
lookalike_data = []

for idx, row in enumerate(ranking_scores):
    # Descending order of similarity; the stable sort breaks ties
    # deterministically (earlier row position wins). The masked diagonal
    # (-inf) always sorts last.
    similar_customer_indices = np.argsort(-row, kind='stable')[:n_lookalikes]

    customer_info = {'CustomerID': customer_ids[idx]}
    for rank, similar_customer_idx in enumerate(similar_customer_indices, start=1):
        customer_info[f'SimilarCustomer{rank}'] = customer_ids[similar_customer_idx]
        customer_info[f'Score{rank}'] = row[similar_customer_idx]

    lookalike_data.append(customer_info)

# Example usage:
# print(lookalike_data[0])  # Print information about the first customer and their lookalikes

In [8]:
# Turn the per-customer records into a table and persist it as CSV,
# leaving the positional index out of the file.
lookalike_df = pd.DataFrame(data=lookalike_data)
lookalike_df.to_csv(path_or_buf='Lookalike.csv', index=False)

In [9]:
# Preview the first rows of the lookalike table to sanity-check its layout.
lookalike_df.head()

Unnamed: 0,CustomerID,SimilarCustomer1,Score1,SimilarCustomer2,Score2,SimilarCustomer3,Score3
0,C0001,C0085,0.999999,C0042,0.999822,C0089,0.999785
1,C0002,C0157,0.999994,C0166,0.999875,C0029,0.999825
2,C0003,C0111,0.994008,C0160,0.990455,C0147,0.987638
3,C0004,C0162,1.0,C0165,0.999959,C0090,0.998641
4,C0005,C0080,0.999982,C0167,0.999975,C0177,0.999928


In [10]:
# Print the row count, column dtypes and non-null counts of the lookalike table.
lookalike_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        199 non-null    object 
 1   SimilarCustomer1  199 non-null    object 
 2   Score1            199 non-null    float64
 3   SimilarCustomer2  199 non-null    object 
 4   Score2            199 non-null    float64
 5   SimilarCustomer3  199 non-null    object 
 6   Score3            199 non-null    float64
dtypes: float64(3), object(4)
memory usage: 11.0+ KB
