In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the datasets
# Paths are relative, so the CSV files are read from the kernel's current
# working directory.
# NOTE(review): customers_df and customer_metrics are not referenced by the
# cells visible here — confirm they are still needed.
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')
products_df = pd.read_csv('Products.csv')  # merged with transactions on ProductID when building the category features
customer_metrics = pd.read_csv('customer_metrics.csv')

In [3]:
# Create customer features
def create_customer_features(transactions=None, products=None):
    """Build one feature row per customer: RFM metrics plus category quantities.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Must provide the columns CustomerID, TransactionID, TransactionDate,
        ProductID, Quantity and TotalValue. Defaults to the notebook-level
        ``transactions_df``.
    products : pandas.DataFrame, optional
        Must provide the columns ProductID and Category. Defaults to the
        notebook-level ``products_df``.

    Returns
    -------
    pandas.DataFrame
        Indexed by CustomerID. Columns are ``Recency`` (days between the
        customer's last transaction and the latest transaction in the data),
        ``Frequency`` (number of transactions), ``Monetary`` (sum of
        TotalValue), followed by one column per product category holding the
        total Quantity bought (0 when the customer bought none).
    """
    if transactions is None:
        transactions = transactions_df
    if products is None:
        products = products_df

    # Parse the dates once, up front. Taking max() of the raw strings would be
    # a lexicographic comparison, which is only chronological for ISO dates.
    purchase_dates = pd.to_datetime(transactions['TransactionDate'])
    reference_date = purchase_dates.max()

    # RFM analysis. assign() returns a new frame, so the input is not mutated.
    rfm = (
        transactions.assign(TransactionDate=purchase_dates)
        .groupby('CustomerID')
        .agg(
            LastPurchase=('TransactionDate', 'max'),
            Frequency=('TransactionID', 'count'),
            Monetary=('TotalValue', 'sum'),
        )
    )
    last_purchase = rfm.pop('LastPurchase')
    rfm.insert(0, 'Recency', (reference_date - last_purchase).dt.days)

    # Category preferences: total quantity bought per product category.
    category_preferences = pd.pivot_table(
        transactions.merge(products, on='ProductID'),
        index='CustomerID',
        columns='Category',
        values='Quantity',
        aggfunc='sum',
        fill_value=0,
    )

    # The merge above drops transactions whose ProductID is not in `products`,
    # so some customers can be missing from the pivot. Give them explicit
    # zeros instead of letting the join introduce NaN into the features.
    category_preferences = category_preferences.reindex(rfm.index, fill_value=0)

    return rfm.join(category_preferences)

In [4]:
# Build the feature table, standardise every column, then compare customers
# pairwise with cosine similarity on the standardised values.
customer_features = create_customer_features()
scaler = StandardScaler().fit(customer_features)
normalized_features = scaler.transform(customer_features)
similarity_matrix = cosine_similarity(normalized_features)

In [5]:
# Map each customer ID to its row position in customer_features
customer_indices = dict(zip(customer_features.index, range(len(customer_features.index))))

In [6]:
def get_top_lookalikes(customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Reads the notebook-level ``customer_indices``, ``similarity_matrix`` and
    ``customer_features``.

    Parameters
    ----------
    customer_id : hashable
        Key looked up in ``customer_indices``.
    n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (customer_id, score) tuples
        Sorted by descending similarity; ties keep the row order of
        ``customer_features``. Empty when ``customer_id`` is unknown or
        ``n`` is not positive. The customer itself is never included.
    """
    if customer_id not in customer_indices or n <= 0:
        return []

    idx = customer_indices[customer_id]
    similar_scores = similarity_matrix[idx]

    # Rank by descending score. The stable sort keeps ties in row order so the
    # result is deterministic.
    ranked = (-similar_scores).argsort(kind='stable')

    # Drop the customer's own row explicitly. Skipping "position 0" is not
    # safe: another customer with an equal top score can be ranked ahead of
    # self, which would return the customer as their own lookalike.
    ranked = ranked[ranked != idx][:n]

    return [(customer_features.index[i], similar_scores[i]) for i in ranked]


In [7]:
# Look up the default number of lookalikes for customers C0001 through C0020
lookalike_results = {
    customer_id: get_top_lookalikes(customer_id)
    for customer_id in (f'C{i:04d}' for i in range(1, 21))
}


In [8]:
# Create and save the Lookalike.csv
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0, and growing a frame row by row is quadratic.
lookalike_rows = []
for cust_id, lookalikes in lookalike_results.items():
    row = {'CustomerID': cust_id}
    for rank in range(3):
        # Customers with fewer than three matches get empty cells.
        if rank < len(lookalikes):
            match_id, match_score = lookalikes[rank]
        else:
            match_id, match_score = None, None
        row[f'Lookalike{rank + 1}'] = match_id
        row[f'Score{rank + 1}'] = match_score
    lookalike_rows.append(row)

# Passing `columns` fixes the column order and keeps the header even when
# there are no rows.
lookalike_df = pd.DataFrame(
    lookalike_rows,
    columns=['CustomerID', 'Lookalike1', 'Score1',
             'Lookalike2', 'Score2',
             'Lookalike3', 'Score3'],
)

lookalike_df.to_csv('Pratap_Pawar_Lookalike.csv', index=False)