In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three input tables from the working directory, one DataFrame per
# CSV. Read order matches the unpacking order on the left-hand side.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [3]:
# Attach customer attributes to every transaction by joining on CustomerID.
# No `how` is given, so pandas performs its default inner join: customers
# that never appear in `transactions` contribute no rows to `merged_data`.
merged_data = pd.merge(transactions, customers, on='CustomerID')


In [4]:
# Collapse the merged rows to one row per CustomerID:
#   - TotalValue / Quantity are summed over the customer's transactions,
#   - Region / SignupDate take the first non-null value seen for the customer.
# reset_index() turns the CustomerID group key back into an ordinary column.
customer_profile = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        Region=('Region', 'first'),
        SignupDate=('SignupDate', 'first'),
    )
    .reset_index()
)

In [5]:
# Parse SignupDate into datetimes, then derive calendar year and month as
# numeric features. `assign` evaluates its keyword arguments in order, so the
# year/month lambdas see the already-parsed SignupDate column.
customer_profile = customer_profile.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    SignupYear=lambda frame: frame['SignupDate'].dt.year,
    SignupMonth=lambda frame: frame['SignupDate'].dt.month,
)

In [6]:
# Replace the categorical Region column with indicator columns.
# drop_first=True omits the indicator for the first Region level, so that
# level is represented by all remaining Region indicators being off.
customer_profile = pd.get_dummies(
    data=customer_profile,
    columns=['Region'],
    drop_first=True,
)

In [7]:
# Standardize the four numeric features with StandardScaler so that no single
# column dominates the similarity computation purely because of its units.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customer_profile.loc[:, ['TotalValue', 'Quantity', 'SignupYear', 'SignupMonth']]
)


In [8]:
# Build the feature matrix used for similarity: the scaled numeric columns
# placed side by side with whatever remains of customer_profile once the
# identifier, the raw numeric columns and the raw date are dropped (i.e. the
# Region indicator columns). The column-wise concat aligns on the row index;
# both pieces carry a default 0..n-1 index at this point.
final_profile = pd.concat(
    [
        pd.DataFrame(
            scaled_features,
            columns=['TotalValue', 'Quantity', 'SignupYear', 'SignupMonth'],
        ),
        customer_profile.drop(
            columns=['CustomerID', 'TotalValue', 'Quantity', 'SignupDate', 'SignupYear', 'SignupMonth']
        ),
    ],
    axis=1,
)

In [9]:
# Function to find lookalikes
def find_lookalikes(customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Similarity is the cosine similarity between rows of the module-level
    ``final_profile`` feature matrix. Row ``i`` of ``final_profile`` belongs
    to ``customer_profile['CustomerID'].iloc[i]`` (both come from the same
    grouped frame), so every translation between a CustomerID and a row
    position goes through ``customer_profile``. The raw ``customers`` table
    is only used to look up names: its row order and row count are not
    guaranteed to match ``final_profile`` (customers without transactions are
    absent from the profile), so indexing one with positions of the other can
    silently return the wrong customer.

    Args:
        customer_id: CustomerID to find lookalikes for.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A tuple ``(similar_customers, scores)`` where ``similar_customers`` is
        a DataFrame with columns ``CustomerID`` and ``CustomerName`` ordered
        from most to least similar (fresh 0..k-1 index), and ``scores`` is a
        NumPy array with the matching cosine similarities.

    Raises:
        ValueError: if ``customer_id`` has no row in ``customer_profile``
            (unknown ID, or a customer with no transactions).
    """
    profile_ids = customer_profile['CustomerID'].to_numpy()

    # Locate the query customer's row position inside the feature matrix.
    hits = (customer_profile['CustomerID'] == customer_id).to_numpy().nonzero()[0]
    if hits.size == 0:
        raise ValueError(
            f"Customer {customer_id!r} has no profile row "
            "(unknown CustomerID or no transactions)."
        )
    target_pos = hits[0]

    # Calculate cosine similarity of the query row against every profile row.
    input_customer = final_profile.iloc[[target_pos]].values
    similarities = cosine_similarity(input_customer, final_profile)[0]

    # Rank from most to least similar and drop the query customer by position.
    # Relying on "self sorts last" breaks when another customer ties at the
    # maximum similarity, so the exclusion is explicit.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != target_pos][:max(top_n, 0)]

    # Map row positions back to CustomerIDs, then attach names from `customers`.
    similar_ids = profile_ids[similar_indices]
    name_by_id = (
        customers
        .drop_duplicates('CustomerID')
        .set_index('CustomerID')['CustomerName']
    )
    similar_customers = pd.DataFrame({
        'CustomerID': similar_ids,
        'CustomerName': name_by_id.reindex(similar_ids).to_numpy(),
    })
    scores = similarities[similar_indices]

    return similar_customers[['CustomerID', 'CustomerName']], scores

In [10]:
# Collect (customer, lookalike, score) triples for the first 20 rows of the
# customers table. Each call yields the lookalike frame and a parallel array
# of similarity scores; the two are zipped into one tuple per lookalike.
lookalike_results = []
for customer_id in customers['CustomerID'].head(20):
    matches, match_scores = find_lookalikes(customer_id)
    lookalike_results.extend(
        (customer_id, match_id, match_score)
        for match_id, match_score in zip(matches['CustomerID'], match_scores)
    )

In [11]:
# Turn the collected triples into a table with one row per
# (customer, lookalike) pair, ready to be written out.
lookalike_df = pd.DataFrame(
    data=lookalike_results,
    columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'],
)


In [12]:
# Write the lookalike table to disk without the DataFrame index, then print a
# confirmation message naming the output file.
lookalike_df.to_csv(
    path_or_buf='Pakhi_Sharma_Lookalike.csv',
    index=False,
)

print("Lookalike model generated and saved to 'Pakhi_Sharma_Lookalike.csv'.")

Lookalike model generated and saved to 'Pakhi_Sharma_Lookalike.csv'.
