In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

In [9]:
#Load the datasets from CSV files
def load_data(customers_path, products_path, transactions_path):
    """Read the customer, product and transaction CSV files.

    Parameters
    ----------
    customers_path, products_path, transactions_path
        Locations handed straight to ``pandas.read_csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers_df, products_df, transactions_df)``. The ``SignupDate``
        column of the customers table and the ``TransactionDate`` column of
        the transactions table are converted with ``pandas.to_datetime``.
    """
    customers_df, products_df, transactions_df = (
        pd.read_csv(source)
        for source in (customers_path, products_path, transactions_path)
    )

    # Parse the date columns so later date arithmetic works on timestamps
    for frame, date_column in (
        (customers_df, 'SignupDate'),
        (transactions_df, 'TransactionDate'),
    ):
        frame[date_column] = pd.to_datetime(frame[date_column])

    return customers_df, products_df, transactions_df


In [10]:
#Calculate various customer-level metrics
def calculate_customer_metrics(transactions_df):
    """Summarise each customer's purchasing behaviour from the transaction log.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Needs the columns ``CustomerID``, ``TransactionID``, ``TotalValue``,
        ``Quantity`` and a datetime ``TransactionDate``.

    Returns
    -------
    pandas.DataFrame
        One row per ``CustomerID`` with ``num_transactions``, ``total_spend``,
        ``avg_transaction_value``, ``total_items``, ``activity_period`` (whole
        days between the first and last transaction) and
        ``purchase_frequency`` (transactions per 30-day month, 0 when the
        activity period is 0 days).
    """
    def _days_between_first_and_last(dates):
        return (dates.max() - dates.min()).days

    per_customer = transactions_df.groupby('CustomerID')
    customer_metrics = per_customer.agg(
        num_transactions=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        total_items=('Quantity', 'sum'),
        activity_period=('TransactionDate', _days_between_first_and_last),
    ).reset_index()

    # Transactions per month; a zero-day activity period has no meaningful
    # rate, so it is reported as 0 instead of the inf the division yields
    has_activity = customer_metrics['activity_period'] > 0
    months_active = customer_metrics['activity_period'] / 30
    customer_metrics['purchase_frequency'] = (
        customer_metrics['num_transactions'] / months_active
    ).where(has_activity, 0)

    return customer_metrics

In [11]:
#Calculate customer preferences for different product categories
def calculate_category_preferences(transactions_df, products_df):
    """Compute each customer's share of spend per product category.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Needs the columns ``CustomerID``, ``ProductID`` and ``TotalValue``.
    products_df : pandas.DataFrame
        Needs the columns ``ProductID`` and ``Category``.

    Returns
    -------
    pandas.DataFrame
        ``CustomerID`` followed by one ``<Category>_preference`` column per
        category (in sorted category order). Each value is that category's
        fraction of the customer's total spend. A customer whose total spend
        is 0 gets 0 for every category rather than NaN/inf.
    """
    # Attach the category to every transaction (transactions whose ProductID
    # is absent from products_df are dropped by the default inner join)
    trans_with_categories = pd.merge(
        transactions_df,
        products_df[['ProductID', 'Category']],
        on='ProductID'
    )

    # Total spend per customer (rows) and category (columns)
    category_spend = pd.pivot_table(
        trans_with_categories,
        values='TotalValue',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0
    )

    # Normalise each row by the customer's total spend. Zero totals are masked
    # to NaN first so the division cannot yield NaN/inf, then reported as 0.
    customer_total_spend = category_spend.sum(axis=1)
    nonzero_total_spend = customer_total_spend.where(customer_total_spend != 0)
    preferences = (
        category_spend
        .div(nonzero_total_spend, axis=0)
        .fillna(0.0)
        .add_suffix('_preference')
    )

    return preferences.reset_index()

In [12]:
#Create feature matrix for similarity calculation
def prepare_features(customers_df, products_df, transactions_df):
    """Build the standardised per-customer feature matrix for similarity search.

    The behavioural metrics from ``calculate_customer_metrics`` are joined with
    the category shares from ``calculate_category_preferences`` and every
    feature column is z-scored with ``StandardScaler``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Needs a ``CustomerID`` column. Every ID listed here gets a feature
        row, even when the customer has no transactions.
    products_df, transactions_df : pandas.DataFrame
        Passed through to the two feature helpers.

    Returns
    -------
    pandas.DataFrame
        ``CustomerID`` as the first column followed by the scaled feature
        columns, one row per customer, ordered by ``CustomerID``. Customers
        found only in ``transactions_df`` are kept as well.
    """
    customer_metrics = calculate_customer_metrics(transactions_df)
    category_preferences = calculate_category_preferences(transactions_df, products_df)

    # Customers known from the customer table; the outer join below adds the
    # ones without any transaction so they are not silently left out.
    known_customers = customers_df[['CustomerID']].drop_duplicates()

    features_df = (
        customer_metrics
        .merge(category_preferences, on='CustomerID', how='left')
        .merge(known_customers, on='CustomerID', how='outer')
        # Missing features (no transactions, or no categorised purchases)
        # mean "no activity", i.e. 0
        .fillna(0)
        .sort_values('CustomerID', kind='stable', ignore_index=True)
    )

    # Scale features so no single metric dominates the cosine similarity
    scaler = StandardScaler()
    feature_columns = features_df.columns.difference(['CustomerID'])
    features_df[feature_columns] = scaler.fit_transform(features_df[feature_columns])

    return features_df

In [13]:
#Find similar customers based on feature similarity
def find_similar_customers(customer_id, features_df, customers_df, n_recommendations=3):
    """Return the customers most similar to ``customer_id`` by cosine similarity.

    Parameters
    ----------
    customer_id
        Value looked up in ``features_df['CustomerID']``.
    features_df : pandas.DataFrame
        A ``CustomerID`` column plus numeric feature columns; every column
        other than ``CustomerID`` is treated as a feature.
    customers_df : pandas.DataFrame
        Needs the columns ``CustomerID``, ``CustomerName`` and ``Region``.
    n_recommendations : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``SimilarityScore``, ``CustomerName`` and
        ``Region``, ordered from most to least similar. The target customer
        is never included. A lookalike missing from ``customers_df`` is kept
        with empty name/region instead of being dropped.

    Raises
    ------
    ValueError
        If ``customer_id`` does not occur in ``features_df``.
    """
    is_target = (features_df['CustomerID'] == customer_id).to_numpy()
    if not is_target.any():
        raise ValueError(f"Customer ID {customer_id} not found in the dataset")

    # Pick the features by name (not by column position) and force a float
    # matrix, so the target row is never an object-dtype array.
    feature_matrix = features_df.drop(columns='CustomerID').to_numpy(dtype=float)
    target_vector = feature_matrix[is_target][:1]  # first matching row, kept 2-D

    # Similarity of the target against every customer (including itself)
    similarity_scores = cosine_similarity(target_vector, feature_matrix)[0]

    recommendations = pd.DataFrame({
        'CustomerID': features_df['CustomerID'].to_numpy(),
        'SimilarityScore': similarity_scores
    })

    # Drop the target itself, rank by similarity (stable sort keeps tied
    # customers in their original order) and keep the top N
    top_recommendations = (
        recommendations[recommendations['CustomerID'] != customer_id]
        .sort_values('SimilarityScore', ascending=False, kind='stable')
        .head(n_recommendations)
    )

    # A left join keeps all N rows (and their ranking) even if a customer has
    # no entry in customers_df
    final_recommendations = pd.merge(
        top_recommendations,
        customers_df[['CustomerID', 'CustomerName', 'Region']],
        on='CustomerID',
        how='left'
    )

    return final_recommendations

In [15]:
# Read the three source tables from CSV files given as relative paths
customers_df, products_df, transactions_df = load_data(
    customers_path='customers.csv',
    products_path='products.csv',
    transactions_path='transactions.csv',
)

In [16]:
# Build the scaled feature matrix a single time; later lookups reuse it
features_df = prepare_features(
    customers_df=customers_df,
    products_df=products_df,
    transactions_df=transactions_df,
)

In [17]:
# Target customer whose lookalikes are searched for in the next cell
customer_id = "C0035"


In [18]:
# Closest matches for the chosen customer (n_recommendations left at its default of 3)
similar_customers = find_similar_customers(
    customer_id=customer_id,
    features_df=features_df,
    customers_df=customers_df,
)

In [20]:
# Display the lookalikes found for `customer_id`, with name and region
similar_customers

Unnamed: 0,CustomerID,SimilarityScore,CustomerName,Region
0,C0070,0.863099,Timothy Perez,Europe
1,C0191,0.848529,Samantha Gibson DVM,South America
2,C0131,0.848376,Scott Wilson,North America


In [49]:
# Lookalike map, filled by the loop in the next cell:
#   CustomerID -> [(similar CustomerID, similarity score), ...]
lookalike = {}

In [50]:
# Fill the lookalike map for the first 20 customers listed in customers_df.
# Loop-local names are distinct from `similar_customers`, so the DataFrame
# displayed earlier in the notebook is not overwritten with a list.
for target_id in customers_df['CustomerID'].iloc[:20]:
    # Top matches for this customer, as a DataFrame
    matches_df = find_similar_customers(target_id, features_df, customers_df)

    # Keep only (CustomerID, SimilarityScore) pairs, best match first
    lookalike[target_id] = list(
        zip(matches_df['CustomerID'], matches_df['SimilarityScore'])
    )

In [51]:
# Display the full lookalike map (CustomerID -> list of (CustomerID, score))
lookalike

{'C0001': [('C0069', 0.975864820143065),
  ('C0072', 0.8875788407373617),
  ('C0181', 0.8619847782567192)],
 'C0002': [('C0055', 0.901350192439436),
  ('C0159', 0.894502127045733),
  ('C0036', 0.8734462292819587)],
 'C0003': [('C0007', 0.8483578090264577),
  ('C0005', 0.8289633120919991),
  ('C0026', 0.798685471917687)],
 'C0004': [('C0075', 0.9841434626369872),
  ('C0065', 0.944263669636104),
  ('C0017', 0.9021175180162644)],
 'C0005': [('C0007', 0.9182400254533905),
  ('C0085', 0.9176849160219607),
  ('C0197', 0.8604755338332879)],
 'C0006': [('C0185', 0.9264735280927765),
  ('C0200', 0.8329943902716074),
  ('C0138', 0.7836069037423007)],
 'C0007': [('C0085', 0.9340301695135796),
  ('C0005', 0.9182400254533903),
  ('C0026', 0.8519587130103928)],
 'C0008': [('C0109', 0.8874905803940831),
  ('C0162', 0.8386488377481048),
  ('C0113', 0.8121856871792097)],
 'C0009': [('C0032', 0.9167532404174195),
  ('C0083', 0.8012344531257924),
  ('C0033', 0.7797570266960784)],
 'C0010': [('C0029', 0.9

In [39]:
# Flatten the lookalike map into [customer, similar customer, score] rows.
# NOTE(review): this cell carries execution count 39 while `lookalike` is
# built in cells 49-50 - re-run it after the map is rebuilt so the rows are
# not stale.
flattened_data = [
    [source_id, similar_id, similarity]
    for source_id, matches in lookalike.items()
    for similar_id, similarity in matches
]

In [40]:
# Display the flattened [customer, similar customer, score] rows
flattened_data

[['C0001', 'C0069', 0.975864820143065],
 ['C0001', 'C0072', 0.8875788407373617],
 ['C0001', 'C0181', 0.8619847782567192],
 ['C0002', 'C0055', 0.901350192439436],
 ['C0002', 'C0159', 0.894502127045733],
 ['C0002', 'C0036', 0.8734462292819587],
 ['C0003', 'C0007', 0.8483578090264577],
 ['C0003', 'C0005', 0.8289633120919991],
 ['C0003', 'C0026', 0.798685471917687],
 ['C0004', 'C0075', 0.9841434626369872],
 ['C0004', 'C0065', 0.944263669636104],
 ['C0004', 'C0017', 0.9021175180162644],
 ['C0005', 'C0007', 0.9182400254533905],
 ['C0005', 'C0085', 0.9176849160219607],
 ['C0005', 'C0197', 0.8604755338332879],
 ['C0006', 'C0185', 0.9264735280927765],
 ['C0006', 'C0200', 0.8329943902716074],
 ['C0006', 'C0138', 0.7836069037423007],
 ['C0007', 'C0085', 0.9340301695135796],
 ['C0007', 'C0005', 0.9182400254533903],
 ['C0007', 'C0026', 0.8519587130103928],
 ['C0008', 'C0109', 0.8874905803940831],
 ['C0008', 'C0162', 0.8386488377481048],
 ['C0008', 'C0113', 0.8121856871792097],
 ['C0009', 'C0032', 0

In [52]:
# Turn the flattened rows into a three-column table
df = pd.DataFrame(
    data=flattened_data,
    columns=['Customer_ID', 'Similar_Customer_ID', 'Similarity'],
)

In [53]:
# Display the lookalike table that is written to CSV below
df

Unnamed: 0,Customer_ID,Similar_Customer_ID,Similarity
0,C0001,C0069,0.975865
1,C0001,C0072,0.887579
2,C0001,C0181,0.861985
3,C0002,C0055,0.90135
4,C0002,C0159,0.894502
5,C0002,C0036,0.873446
6,C0003,C0007,0.848358
7,C0003,C0005,0.828963
8,C0003,C0026,0.798685
9,C0004,C0075,0.984143


In [54]:
# Write the lookalike table to disk, leaving out the row index
df.to_csv(path_or_buf='Vaisakh_krishna_lookalike.csv', index=False)