In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load datasets
customers = pd.read_csv('/content/drive/MyDrive/ecommerce_eda/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/ecommerce_eda/Products.csv')
transactions = pd.read_csv('/content/drive/MyDrive/ecommerce_eda/Transactions.csv')

# Convert date columns to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Merge data for feature engineering
merged_data = transactions.merge(customers, on='CustomerID', how='left').merge(products, on='ProductID', how='left')
print(merged_data)

    TransactionID CustomerID ProductID     TransactionDate  Quantity  \
0          T00001      C0199      P067 2024-08-25 12:38:23         1   
1          T00112      C0146      P067 2024-05-27 22:23:54         1   
2          T00166      C0127      P067 2024-04-25 07:38:55         1   
3          T00272      C0087      P067 2024-03-26 22:55:37         2   
4          T00363      C0070      P067 2024-03-21 15:10:10         3   
..            ...        ...       ...                 ...       ...   
995        T00496      C0118      P037 2024-10-24 08:30:27         1   
996        T00759      C0059      P037 2024-06-04 02:15:24         3   
997        T00922      C0018      P037 2024-04-05 13:05:32         4   
998        T00959      C0115      P037 2024-09-29 10:16:02         2   
999        T00992      C0024      P037 2024-04-21 10:52:24         1   

     TotalValue  Price_x          CustomerName         Region SignupDate  \
0        300.68   300.68        Andrea Jenkins         Euro

In [3]:
# Create customer-level features
customer_features = merged_data.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'TotalValue': 'sum',
    'ProductID': 'nunique'
}).rename(columns={'Quantity': 'TotalQuantity', 'TotalValue': 'TotalSpent', 'ProductID': 'UniqueProducts'})

# Normalize features
scaler = StandardScaler()
customer_features_scaled = pd.DataFrame(scaler.fit_transform(customer_features), columns=customer_features.columns, index=customer_features.index)

# Create product preference matrix
product_preference = pd.pivot_table(merged_data, index='CustomerID', columns='Category', values='Quantity', aggfunc='sum', fill_value=0)

In [4]:
# Compute similarity
profile_similarity = cosine_similarity(customer_features_scaled)
transaction_similarity = cosine_similarity(product_preference)

# Combine similarities (assigning equal weights)
final_similarity = 0.5 * profile_similarity + 0.5 * transaction_similarity
final_similarity_df = pd.DataFrame(final_similarity, index=customer_features.index, columns=customer_features.index)

In [5]:
# Generate top 3 recommendations for each customer
top_lookalikes = {}

for customer in final_similarity_df.index[:20]:  # First 20 customers
    scores = final_similarity_df.loc[customer]
    similar_customers = scores.nlargest(4).iloc[1:4]  # Exclude self (highest similarity)
    top_lookalikes[customer] = [(cust, round(score, 4)) for cust, score in similar_customers.items()]

# Convert to Lookalike.csv format
lookalike_data = []

for cust_id, lookalikes in top_lookalikes.items():
    lookalike_list = ', '.join([f'({l[0]}, {l[1]})' for l in lookalikes])
    lookalike_data.append({'Map<cust_id, List<cust_id, score>>': f'<{cust_id}, [{lookalike_list}]>'})

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)