In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [5]:
# Loading 7 Reading datasets
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')
customers = pd.read_csv('Customers.csv')
products

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.30
1,P002,ActiveWear Smartwatch,Electronics,346.30
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31
...,...,...,...,...
95,P096,SoundWave Headphones,Electronics,307.47
96,P097,BookWorld Cookbook,Books,319.34
97,P098,SoundWave Laptop,Electronics,299.93
98,P099,SoundWave Mystery Book,Books,354.29


# Data Preprocessing
### Merging datasets for enriched analysis

In [23]:


merged_data = pd.merge(transactions, products, on='ProductID', how='left')
merged_data = pd.merge(merged_data, customers, on='CustomerID', how='left')

# Feature Engineering part

###  Creating customer profiles by aggregating transaction and product data

In [16]:

customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'ProductID': 'count',  # Count of unique products bought
    'Category': lambda x: ','.join(x.unique())  # Concatenating unique categories
}).reset_index()

# Calculating average price per transaction 
customer_profiles['AveragePrice'] = customer_profiles['TotalValue'] / customer_profiles['Quantity']

# Encoding Categorical data
customer_profiles = pd.get_dummies(customer_profiles, columns=['Category'], prefix='', prefix_sep='')

# Standardizing numerical features 

In [17]:

scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_profiles.drop(['CustomerID'], axis=1))

# Computing Similarity Matrix

In [18]:

similarity_matrix = cosine_similarity(scaled_features)

In [19]:
# Maping Customer IDs to indices for lookup
customer_id_to_index = {customer_id: idx for idx, customer_id in enumerate(customer_profiles['CustomerID'])}
index_to_customer_id = {idx: customer_id for customer_id, idx in customer_id_to_index.items()}

# Generating Lookalike Recommendations

In [20]:
lookalike_map = {}
for customer_id in customers['CustomerID'][:20]:  # First 20 customers(C0001-C0020)
    if customer_id in customer_id_to_index:
        customer_idx = customer_id_to_index[customer_id]
        similarity_scores = list(enumerate(similarity_matrix[customer_idx]))
        similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

        # Get top 3 similar customers excluding the customer itself
        top_similars = [
            (index_to_customer_id[idx], score)
            for idx, score in similarity_scores[1:4]  # Skip the first (self-similarity)
        ]

        lookalike_map[customer_id] = top_similars

# Saving the Lookalike Map to CSV file

In [21]:

lookalike_list = []
for cust_id, similars in lookalike_map.items():
    for similar_cust_id, score in similars:
        lookalike_list.append({
            'CustomerID': cust_id,
            'SimilarCustomerID': similar_cust_id,
            'SimilarityScore': score
        })

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Summary of the model logic and implementation
# 1. Customer profiles are created by aggregating transaction data and encoding product categories.
# 2. Numerical features are standardized for unbiased similarity calculations.
# 3. Cosine similarity is used to identify similar customers based on their profiles.
# 4. Top 3 similar customers are recommended for the first 20 customers (C0001-C0020).

In [22]:
# Making Lookalike Map
def display_lookalike_map():
    print("Lookalike Map: CustomerID -> List of (SimilarCustomerID, SimilarityScore)")
    for cust_id, similars in lookalike_map.items():
        print(f"{cust_id}: {similars}")

display_lookalike_map()

Lookalike Map: CustomerID -> List of (SimilarCustomerID, SimilarityScore)
C0001: [('C0064', np.float64(0.9940102407550767)), ('C0018', np.float64(0.972669967845418)), ('C0119', np.float64(0.9432510245409608))]
C0002: [('C0062', np.float64(0.995441876038633)), ('C0144', np.float64(0.9879866705716347)), ('C0159', np.float64(0.9847040148612596))]
C0003: [('C0031', np.float64(0.9929580952007752)), ('C0026', np.float64(0.9735282434433666)), ('C0149', np.float64(0.965058188707725))]
C0004: [('C0065', np.float64(0.9653495192841488)), ('C0018', np.float64(0.9584286072475859)), ('C0064', np.float64(0.9062555006090786))]
C0005: [('C0197', np.float64(0.9915238390463919)), ('C0163', np.float64(0.9282015921704501)), ('C0080', np.float64(0.07153122007589155))]
C0006: [('C0079', np.float64(0.9999876034472264)), ('C0135', np.float64(0.9893587558577273)), ('C0187', np.float64(0.9856334087795102))]
C0007: [('C0140', np.float64(0.9881078037379301)), ('C0069', np.float64(0.9858371250899015)), ('C0095', np