# Task 2: Build a lookalike model that finds the customers most similar to a given customer, based on profile and transaction history.

### Import Libraries

In [8]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

### Data Handling and Reading

In [9]:
# Read the customer table and parse its signup timestamps
customers = pd.read_csv('Customers.csv')
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])

# Product catalogue (no date columns to convert)
products = pd.read_csv('Products.csv')

# Read the transaction log and parse its transaction timestamps
transactions = pd.read_csv('Transactions.csv')
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

### Declaring and defining all the functions, such as extracting customer features, calculating similarity scores, etc.

In [10]:
# Function to create customer features
def extract_customer_features(customers, transactions, products):
    """
    Build one row of numeric features per customer from profile and purchase data.

    Parameters
    ----------
    customers : pandas.DataFrame
        Needs the columns 'CustomerID', 'SignupDate' (datetime-like) and
        'Region'. The frame is only read; it is never modified.
    transactions : pandas.DataFrame
        Needs the columns 'CustomerID', 'ProductID', 'TotalValue' and
        'TransactionDate' (datetime-like).
    products : pandas.DataFrame
        Needs the columns 'ProductID' and 'Category'.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``customers`` (same order, fresh 0..n-1 index) with
        the columns 'CustomerID', 'account_age_days', 'TotalValue' (summed
        spend), 'total_transactions', 'avg_order_value', one purchase-count
        column per product category and one 'region_<name>' indicator column
        per region. Missing values (e.g. customers with no transactions) are
        replaced by 0.
    """
    # Work on a positional copy of the needed columns: this keeps the caller's
    # frame untouched and guarantees that every piece combined below shares
    # the same 0..n-1 index, whatever index ``customers`` arrived with.
    profile = customers[['CustomerID', 'SignupDate', 'Region']].reset_index(drop=True)

    # Customer lifetime value and average order value
    spend_by_customer = transactions.groupby('CustomerID')['TotalValue']
    customer_ltv = spend_by_customer.sum().reset_index()
    avg_order = spend_by_customer.mean().reset_index(name='avg_order_value')

    # Number of transactions per customer
    purchase_frequency = transactions.groupby('CustomerID').size().reset_index(name='total_transactions')

    # Purchase counts per product category (rows: customers, columns: categories)
    category_pivot = transactions.merge(products[['ProductID', 'Category']], on='ProductID')
    category_preferences = pd.crosstab(category_pivot['CustomerID'], category_pivot['Category'])

    # Account age in days, measured up to the most recent transaction date
    latest_date = transactions['TransactionDate'].max()
    features = profile[['CustomerID']].copy()
    features['account_age_days'] = (latest_date - profile['SignupDate']).dt.days

    # One-hot region indicators, indexed identically to ``profile``
    region_encoding = pd.get_dummies(profile['Region'], prefix='region')

    # Left joins keep every customer, including those with no transactions
    features = features.merge(customer_ltv, on='CustomerID', how='left')
    features = features.merge(purchase_frequency, on='CustomerID', how='left')
    features = features.merge(avg_order, on='CustomerID', how='left')
    features = features.merge(category_preferences, left_on='CustomerID', right_index=True, how='left')

    # concat(axis=1) aligns on the index, so make the positional index explicit
    # before attaching the region columns.
    features = pd.concat([features.reset_index(drop=True), region_encoding], axis=1)

    # Fill missing values with 0 (for customers with no transactions)
    features = features.fillna(0)

    return features


# Function to calculate similarity scores
def calculate_similarity_scores(features, customer_ids, top_n=3):
    """
    Find the most similar customers for each requested customer.

    Every column of ``features`` except 'CustomerID' is standardised and the
    pairwise cosine similarity between all customers is computed on the
    standardised values.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per customer with a 'CustomerID' column; all other columns
        are treated as numeric features.
    customer_ids : iterable
        Customer IDs to look up. Each must occur in ``features['CustomerID']``.
    top_n : int, optional
        Number of lookalikes to return per customer (default 3).

    Returns
    -------
    dict
        Maps each requested customer ID to a list of up to ``top_n``
        ``(other_customer_id, score)`` tuples in descending score order, with
        scores rounded to 4 decimals. The customer itself is never included.

    Raises
    ------
    KeyError
        If any requested ID is not present in ``features``.
    """
    # Materialise once so generators can be validated and then iterated
    customer_ids = list(customer_ids)

    # Fail early, with a readable message, instead of deep inside the lookup
    known_ids = set(features['CustomerID'])
    missing = [cust_id for cust_id in customer_ids if cust_id not in known_ids]
    if missing:
        raise KeyError(f"Customer IDs not present in features: {missing}")

    # Separate identifier from features
    feature_cols = [col for col in features.columns if col != 'CustomerID']

    # Scale features so columns with large magnitudes do not dominate
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(features[feature_cols])

    # Pairwise similarity between every pair of customers
    similarity_matrix = cosine_similarity(scaled_features)

    # Label rows and columns by customer ID for easier lookup
    similarity_df = pd.DataFrame(similarity_matrix,
                                 index=features['CustomerID'],
                                 columns=features['CustomerID'])

    lookalikes = {}
    for cust_id in customer_ids:
        # Rank all customers by similarity to this one
        similarities = similarity_df[cust_id].sort_values(ascending=False)
        # Remove self-similarity
        similarities = similarities[similarities.index != cust_id]
        # Keep the best matches as plain Python floats so they format cleanly
        lookalikes[cust_id] = [
            (idx, round(float(score), 4))
            for idx, score in similarities.head(top_n).items()
        ]

    return lookalikes


# Function to save results as a CSV
def create_lookalike_csv(lookalikes, output_path='Lookalike.csv'):
    """
    Save the lookalike results to a CSV file.

    Parameters
    ----------
    lookalikes : dict
        Maps a customer ID to a sequence of matches, where each match is
        indexable with the matched ID at position 0 and its score at
        position 1.
    output_path : str, optional
        Destination file (default 'Lookalike.csv').

    The file has the columns 'CustomerID' and 'Lookalikes'; each 'Lookalikes'
    cell is formatted as ``id:score;id:score;...``. The header row is written
    even when ``lookalikes`` is empty. A confirmation line is printed.
    """
    # One CSV row per customer, with matches flattened into a single string
    rows = [
        {
            'CustomerID': cust_id,
            'Lookalikes': ';'.join(f"{match[0]}:{match[1]}" for match in matches),
        }
        for cust_id, matches in lookalikes.items()
    ]

    # Naming the columns explicitly keeps the header when there are no rows
    result_df = pd.DataFrame(rows, columns=['CustomerID', 'Lookalikes'])
    result_df.to_csv(output_path, index=False)
    print(f"Lookalike results saved to {output_path}")


### Executing the functions and getting the results.

In [12]:
# Build the per-customer feature table from the loaded data
features = extract_customer_features(customers, transactions, products)

# Lookup targets are the first twenty customers: C0001 through C0020
target_customers = [f'C{number:04d}' for number in range(1, 21)]

# Rank the closest matches for every target customer
lookalikes = calculate_similarity_scores(features, target_customers)

# Persist the matches to disk
create_lookalike_csv(lookalikes)

# Echo the matches to the console for a quick sanity check
for cust_id, matches in lookalikes.items():
    report = [f"\nCustomer {cust_id} is most similar to:"]
    report.extend(f"  {match_id} (similarity: {score})" for match_id, score in matches)
    print('\n'.join(report))


Lookalike results saved to Lookalike.csv

Customer C0001 is most similar to:
  C0120 (similarity: 0.8367)
  C0091 (similarity: 0.8309)
  C0112 (similarity: 0.8266)

Customer C0002 is most similar to:
  C0134 (similarity: 0.9509)
  C0106 (similarity: 0.9224)
  C0159 (similarity: 0.9093)

Customer C0003 is most similar to:
  C0031 (similarity: 0.9507)
  C0129 (similarity: 0.9075)
  C0195 (similarity: 0.8178)

Customer C0004 is most similar to:
  C0113 (similarity: 0.9246)
  C0104 (similarity: 0.8574)
  C0012 (similarity: 0.7455)

Customer C0005 is most similar to:
  C0007 (similarity: 0.967)
  C0140 (similarity: 0.8473)
  C0186 (similarity: 0.8205)

Customer C0006 is most similar to:
  C0187 (similarity: 0.8909)
  C0085 (similarity: 0.707)
  C0171 (similarity: 0.7026)

Customer C0007 is most similar to:
  C0005 (similarity: 0.967)
  C0140 (similarity: 0.8774)
  C0159 (similarity: 0.7385)

Customer C0008 is most similar to:
  C0098 (similarity: 0.816)
  C0194 (similarity: 0.8053)
  C0059 