In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the three raw input tables from CSV files in the working directory,
# in the order customers -> transactions -> products.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Transactions.csv", "Products.csv")
)


In [3]:
# Attach each transaction's product category. Only ProductID and Category are
# taken from the product table; the inner join (pandas' default) keeps only
# transactions whose ProductID appears in the product table.
merged = transactions.merge(
    right=products[['ProductID', 'Category']],
    on='ProductID',
    how='inner',
)


In [4]:
# Parse SignupDate into datetimes, then derive Tenure as the number of
# calendar years between the signup year and 2024. The reference year is a
# fixed literal, not the current date.
customers = customers.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    Tenure=lambda frame: 2024 - frame['SignupDate'].dt.year,
)


In [5]:
# One-hot encode the customer's region into Region_<name> indicator columns.
#
# get_dummies consumes the 'Region' column, so running this cell a second time
# on the already-encoded frame would raise a KeyError. The guard makes the
# cell safe to re-run, while still failing loudly if the frame carries no
# region information at all.
if 'Region' in customers.columns:
    customers = pd.get_dummies(customers, columns=['Region'], prefix='Region')
elif not any(str(col).startswith('Region_') for col in customers.columns):
    raise KeyError(
        "customers has neither a 'Region' column nor Region_* indicator columns"
    )


In [6]:
# Per-customer spend broken down by product category: one row per CustomerID,
# one SpentOn_<Category> column per category, holding the summed TotalValue
# (0 where the customer has no transactions in that category).
category_spending = merged.pivot_table(
    values='TotalValue',
    index='CustomerID',
    columns='Category',
    aggfunc='sum',
    fill_value=0,
).add_prefix('SpentOn_')


# Per-customer transaction summary: total spend, mean transaction value, and
# the number of non-null TransactionID entries.
transaction_metrics = merged.groupby('CustomerID').agg(
    TotalSpent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    AvgTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    TransactionCount=pd.NamedAgg(column='TransactionID', aggfunc='count'),
)


In [7]:
# Assemble one row per customer, indexed by CustomerID. Both joins are left
# joins (DataFrame.join's default), so customers without any transactions are
# kept; their missing metrics and category spends are then filled with 0.
final_data = customers.set_index('CustomerID')
final_data = final_data.join(transaction_metrics, how='left')
final_data = final_data.join(category_spending, how='left')
final_data = final_data.fillna(0)


# Columns fed to the similarity model, grouped by where they come from.
# The resulting list order is: tenure, region indicators, transaction
# metrics, per-category spend.
feature_columns = (
    # Customer profile
    ['Tenure']
    # One-hot region indicators
    + ['Region_Asia', 'Region_Europe', 'Region_North America', 'Region_South America']
    # Overall transaction behaviour
    + ['TotalSpent', 'AvgTransactionValue', 'TransactionCount']
    # Spend per product category
    + ['SpentOn_Books', 'SpentOn_Clothing', 'SpentOn_Electronics', 'SpentOn_Home Decor']
)


In [8]:
# Standardise the selected feature columns so that no single feature
# dominates the similarity computation purely because of its scale.
# The scaler is fitted first and kept in `scaler`, then applied to the same
# rows it was fitted on.
scaler = StandardScaler().fit(final_data[feature_columns])
scaled_features = scaler.transform(final_data[feature_columns])


In [9]:
# Pairwise cosine similarity between every pair of rows in scaled_features;
# entry [i, j] compares row i with row j. Y=None means "compare X with itself".
similarity_matrix = cosine_similarity(X=scaled_features, Y=None)


In [10]:

# Find the three most similar customers for each of C0001..C0020.
# Result: {CustomerID: [(other CustomerID, similarity rounded to 2 dp), ...]}.
lookalikes = {}
target_customers = [f"C00{i:02d}" for i in range(1, 21)]

# Map each CustomerID to its row position in final_data. similarity_matrix was
# computed from final_data's rows in order, so this is also the matrix row.
# Looking the row up by ID (instead of using the loop counter over
# target_customers) keeps the result correct when Customers.csv is not sorted
# by CustomerID or when one of the target IDs is absent.
# setdefault keeps the first row if an ID happens to appear more than once.
row_of_customer = {}
for position, customer_id in enumerate(final_data.index):
    row_of_customer.setdefault(customer_id, position)

for cust_id in target_customers:
    if cust_id not in row_of_customer:
        continue

    scores = similarity_matrix[row_of_customer[cust_id]]

    # Drop the customer's own entry, then rank the rest by similarity.
    # list.sort is stable, so ties keep final_data's row order.
    candidates = [
        (other_id, float(score))
        for other_id, score in zip(final_data.index, scores)
        if other_id != cust_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Scores are built-in floats rather than NumPy scalars, so the pairs
    # stringify as ('C0002', 0.97) regardless of the installed NumPy version.
    lookalikes[cust_id] = [
        (other_id, round(score, 2)) for other_id, score in candidates[:3]
    ]


In [11]:
# One row per target customer. The matches are stored as the string form of a
# list of (CustomerID, score) tuples.
#
# Scores are converted to built-in floats before stringifying: NumPy 2 changed
# the repr of its scalars, so str() of a list holding np.float64 values would
# write "np.float64(0.97)" into the output instead of "0.97".
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes),
    'Lookalikes': [
        str([(match_id, float(score)) for match_id, score in matches])
        for matches in lookalikes.values()
    ],
})

In [13]:
# Write the lookalike table to disk; index=False leaves the DataFrame's row
# index out of the file.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)