In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the three input tables. The paths are relative, so the CSV files are
# looked up in the current working directory. Files are read in the order
# listed: customers, products, transactions.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

def create_customer_features(customers_df, transactions_df, products_df):
    """Build one numeric feature row per customer that has transactions.

    The result combines two groups of columns, indexed by ``CustomerID``:

    * per-customer aggregates of the transaction table, named
      ``<column>_<statistic>`` (``TransactionID_count``, ``TotalValue_sum``,
      ``TotalValue_mean``, ``TotalValue_std``, ``Quantity_sum``,
      ``Quantity_mean``, ``Price_mean``, ``Price_max``);
    * one column per product ``Category`` holding the fraction of the
      customer's transaction rows that fall in that category.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Accepted for interface compatibility; it is not read by this
        function. Customers that never appear in ``transactions_df``
        therefore get no row in the result.
    transactions_df : pandas.DataFrame
        Must provide ``CustomerID``, ``TransactionID``, ``ProductID``,
        ``TotalValue``, ``Quantity`` and ``Price`` columns.
    products_df : pandas.DataFrame
        Must provide ``ProductID`` and ``Category`` columns.

    Returns
    -------
    pandas.DataFrame
        Feature table indexed by ``CustomerID`` with missing values
        replaced by 0.
    """
    aggregation_spec = {
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean', 'std'],
        'Quantity': ['sum', 'mean'],
        'Price': ['mean', 'max'],
    }
    spend_stats = transactions_df.groupby('CustomerID').agg(aggregation_spec)
    # The standard deviation is NaN for a customer with a single transaction;
    # treat that as "no spread".
    spend_stats = spend_stats.fillna(0)
    # Flatten the (column, statistic) MultiIndex into "column_statistic" names.
    spend_stats.columns = ['_'.join(pair).strip() for pair in spend_stats.columns.values]

    # Attach the product category to each transaction (inner join, so
    # transactions whose ProductID is unknown to products_df are dropped here).
    purchases = pd.merge(transactions_df, products_df, on='ProductID')
    category_counts = pd.crosstab(purchases['CustomerID'], purchases['Category'])
    row_totals = category_counts.sum(axis=1)
    # Convert raw counts into per-customer category shares.
    category_share = category_counts.div(row_totals, axis=0)

    combined = spend_stats.merge(
        category_share,
        left_index=True,
        right_index=True,
        how='left',
    )
    # Customers without any categorised purchase get a share of 0 everywhere.
    return combined.fillna(0)

# How many customers (taken in file order from Customers.csv) to report on,
# and how many lookalikes to list for each of them.
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

customer_features = create_customer_features(customers_df, transactions_df, products_df)

# Put every feature column on a zero-mean / unit-variance scale before
# computing similarities, and keep the CustomerID index and column labels.
scaler = StandardScaler()
feature_matrix = pd.DataFrame(
    scaler.fit_transform(customer_features),
    index=customer_features.index,
    columns=customer_features.columns,
)

# Pairwise cosine similarity between customers; rows and columns are both
# labelled by CustomerID.
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features.index,
    columns=customer_features.index,
)

output_data = []
first_20_customers = customers_df['CustomerID'].head(N_TARGET_CUSTOMERS)

for customer_id in first_20_customers:
    if customer_id in similarity_df.columns:
        # Drop the customer's own entry so it cannot be its own lookalike.
        candidate_scores = similarity_df[customer_id].drop(customer_id)
        top_matches = candidate_scores.nlargest(N_LOOKALIKES)
        lookalike_str = ", ".join(
            f"{similar_id}:{score:.4f}" for similar_id, score in top_matches.items()
        )
    else:
        # The feature table only contains customers with at least one
        # transaction. A customer without transactions has no similarity
        # column; emit an empty lookalike list instead of raising KeyError.
        lookalike_str = ""

    output_data.append({
        'CustomerID': customer_id,
        'Lookalikes': lookalike_str
    })

# Explicit columns keep the CSV header stable even if no customers were listed.
output_df = pd.DataFrame(output_data, columns=['CustomerID', 'Lookalikes'])
output_df.to_csv('Abhishek_Lookalike.csv', index=False)

print("First few rows of the output CSV:")
print(output_df.head())

First few rows of the output CSV:
  CustomerID                                Lookalikes
0      C0001  C0069:0.9105, C0035:0.8294, C0005:0.7618
1      C0002  C0062:0.8487, C0031:0.8239, C0025:0.7405
2      C0003  C0144:0.8728, C0166:0.7875, C0134:0.7380
3      C0004  C0017:0.9363, C0075:0.9310, C0122:0.8083
4      C0005  C0007:0.8989, C0069:0.8679, C0199:0.8618
