# In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Read the three raw tables from their hard-coded CSV locations.
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

# Convert the date columns with pd.to_datetime so they are no longer raw CSV text.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Enrich every transaction row with its product attributes, then with its
# customer attributes. how='left' keeps each transaction even when the
# product or customer lookup finds no match.
transactions_products = pd.merge(transactions, products, how='left', on='ProductID')
data_merged = pd.merge(transactions_products, customers, how='left', on='CustomerID')

# Generate features for Lookalike Model: one row per CustomerID found in the
# merged transaction data.
def _most_frequent_category(categories):
    """Return the most frequent non-null category, or 'Unknown' if there is none."""
    modes = categories.mode()
    # mode() ignores NaN, so it is empty when every value in the group is
    # missing (possible after the left merge on products). Indexing [0] on
    # that empty result raises and aborts the whole aggregation.
    return modes.iloc[0] if not modes.empty else 'Unknown'


# Named aggregation produces the final column names directly, in this order.
customer_features = data_merged.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),  # Total spending
    TransactionCount=('TransactionID', 'count'),  # Number of transactions
    TotalItemsPurchased=('Quantity', 'sum'),  # Total items purchased
    PreferredCategory=('Category', _most_frequent_category),  # Most purchased category
).reset_index()

# Normalize numerical features (z-score: subtract the mean, divide by the sample std)
numerical_features = ['TotalSpending', 'TransactionCount', 'TotalItemsPurchased']
customer_features_normalized = customer_features.copy()
feature_means = customer_features[numerical_features].mean()
# A constant column has std 0 and a single-row table has std NaN. Dividing by
# either would fill the column with NaN/inf, which the downstream similarity
# computation cannot use. Dividing by 1 instead leaves such a column at 0
# after centring, i.e. it simply carries no signal.
feature_stds = customer_features[numerical_features].std().replace(0, 1).fillna(1)
customer_features_normalized[numerical_features] = (
    customer_features[numerical_features] - feature_means
) / feature_stds

# Encode categorical features (PreferredCategory) as one indicator column per category
customer_features_encoded = pd.get_dummies(customer_features_normalized, columns=['PreferredCategory'])

# Compute cosine similarity between every pair of customers.
# Cast to float so the indicator columns and the z-scored columns form one
# homogeneous numeric matrix instead of a mixed-dtype frame.
feature_matrix = customer_features_encoded.drop(columns=['CustomerID']).astype(float)
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

# Generate Lookalike results for the first 20 customers
lookalike_results = {}
for customer_id in customer_features['CustomerID'][:20]:
    # Get the top 3 most similar customers (excluding self).
    # The customer's own entry is removed by label rather than by assuming it
    # sorts first: another customer with an identical feature vector ties at
    # 1.0, and an all-zero vector has self-similarity 0, so skipping position 0
    # could keep the customer itself and discard a genuine match.
    similar_customers = similarity_df[customer_id].drop(labels=customer_id).nlargest(3)
    lookalike_results[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Convert results to DataFrame: one row per (customer, similar customer) pair
lookalike_output = [
    {
        'CustomerID': cust_id,
        'SimilarCustomerID': similar_cust_id,
        'SimilarityScore': score,
    }
    for cust_id, similarities in lookalike_results.items()
    for similar_cust_id, score in similarities
]

# Naming the columns explicitly keeps the header row in the CSV even when
# there are no result rows; an empty list alone yields a column-less frame.
lookalike_df = pd.DataFrame(
    lookalike_output,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)

# Save results to CSV (written relative to the current working directory)
# NOTE(review): the file name looks like an unfilled template placeholder — confirm.
lookalike_csv_path = 'FirstName_LastName_Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index=False)

print(f"Lookalike results saved to {lookalike_csv_path}")


# Output: Lookalike results saved to FirstName_LastName_Lookalike.csv
