In [1]:

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# 1. Data Preparation

In [2]:
# Load preprocessed data from Task 1
customers = pd.read_csv('Customers.csv', parse_dates=['SignupDate'])
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Merge datasets
merged_data = (
    transactions.merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# 2. Feature Engineering

In [3]:
# ----------------------
# ----------------------

def create_customer_features(df):
    """Create comprehensive customer features"""
    
    # Demographic Features
    demo_features = df[['CustomerID', 'Region', 'SignupDate']].drop_duplicates()
    
    # Transaction Behavioral Features
    txn_features = df.groupby('CustomerID').agg(
        total_spend=('TotalValue', 'sum'),
        purchase_freq=('TransactionID', 'count'),
        avg_basket_size=('Quantity', 'mean'),
        last_purchase_date=('TransactionDate', 'max')
    ).reset_index()
    
    # Product Preference Features
    product_prefs = (
        df.groupby(['CustomerID', 'Category'])
        .size().unstack(fill_value=0)
        .add_prefix('category_')
    )
    
    # Combine all features
    features = (
        demo_features.merge(txn_features, on='CustomerID')
        .merge(product_prefs, on='CustomerID')
    )
    
    # Add tenure feature
    features['tenure'] = (pd.to_datetime('today') - features['SignupDate']).dt.days
    
    return features

customer_features = create_customer_features(merged_data)

In [4]:
# ----------------------
# 3. Feature Preprocessing
# ----------------------

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['tenure', 'total_spend', 'purchase_freq', 'avg_basket_size']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Region'])
    ],
    remainder='drop'
)

# Fit and transform data
processed_features = preprocessor.fit_transform(customer_features)

# 4. Similarity Model


In [5]:

class LookalikeModel:
    def __init__(self, n_neighbors=5, metric='cosine'):
        self.model = NearestNeighbors(n_neighbors=n_neighbors+1, metric=metric)
        self.customer_ids = None
        
    def fit(self, features, customer_ids):
        self.model.fit(features)
        self.customer_ids = customer_ids
        
    def find_similar(self, query_index, n_recommend=3):
        distances, indices = self.model.kneighbors(
            processed_features[query_index:query_index+1]
        )
        
        # Exclude self
        similar_indices = indices[0][1:n_recommend+1]
        similar_scores = 1 / (1 + distances[0][1:n_recommend+1])  # Convert distance to similarity
        
        return list(zip(self.customer_ids[similar_indices], similar_scores))
    
    def batch_predict(self, target_ids, n_recommend=3):
        results = {}
        for cust_id in target_ids:
            idx = np.where(self.customer_ids == cust_id)[0][0]
            results[cust_id] = self.find_similar(idx, n_recommend)
        return results

# 5. Model Training

In [6]:
# Initialize and train model
lookalike_model = LookalikeModel(n_neighbors=100)
lookalike_model.fit(processed_features, customer_features['CustomerID'].values)

# 6. Generate Recommendations


In [7]:
# Get target customers
target_customers = [f'C{str(i+1).zfill(4)}' for i in range(20)]

# Generate recommendations
recommendations = lookalike_model.batch_predict(target_customers)


# 7. Output Formatting


In [13]:
def format_recommendations(recommendations):
    formatted = []
    for cust_id, matches in recommendations.items():
        record = {
            'CustomerID': cust_id,
            'Lookalike1': matches[0][0],
            'Score1': round(matches[0][1], 4),
            'Lookalike2': matches[1][0],
            'Score2': round(matches[1][1], 4),
            'Lookalike3': matches[2][0],
            'Score3': round(matches[2][1], 4)
        }
        formatted.append(record)
    return pd.DataFrame(formatted)

output_df = format_recommendations(recommendations)

# Save results
output_df.to_csv('Lookalike.csv', index=False)

In [11]:
# 8. Model Validation

def validate_recommendations(cust_id, recommendations, top_n=3):
    """
    Validate recommendations for sample customer
    
    """
    # Get target customer features
    target = customer_features[customer_features.CustomerID == cust_id]
    
    # Get similar customers
    similar_ids = [rec[0] for rec in recommendations[cust_id][:top_n]]
    similars = customer_features[customer_features.CustomerID.isin(similar_ids)]
    
    # Compare features
    comparison = pd.concat([target, similars]).reset_index(drop=True)
    return comparison.set_index('CustomerID').T

print(validate_recommendations('C0001', recommendations))

CustomerID                          C0001                C0152  \
Region                      South America        South America   
SignupDate            2022-07-10 00:00:00  2022-04-19 00:00:00   
total_spend                       3354.52              3385.86   
purchase_freq                           5                    5   
avg_basket_size                       2.4                  2.0   
last_purchase_date    2024-11-02 17:04:16  2024-10-21 06:20:03   
category_Books                          1                    1   
category_Clothing                       0                    0   
category_Electronics                    3                    1   
category_Home Decor                     1                    3   
tenure                                933                 1015   

CustomerID                          C0011                C0174  
Region                      South America        South America  
SignupDate            2022-12-12 00:00:00  2022-11-03 00:00:00  
total_spend 