In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds

In [2]:
directory = 'data/cleaned_dataset'

csv_files = [file for file in os.listdir(directory) if file.endswith('.csv')]
dfs = []
for file in csv_files:
    file_path = os.path.join(directory, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all dataframes into one
df = pd.concat(dfs, ignore_index=True)
print(df.shape)
df.head()

(320579, 8)


Unnamed: 0,UserId,TransactionId,TransactionTime,ItemCode,ItemDescription,NumberOfItemsPurchased,CostPerItem,Country
0,261072,6132192,2018-09-01T14:27:00.000+07:00,445074,PINK HONEYCOMB PAPER FAN,36,0.9,Norway
1,379575,6022456,2018-06-04T13:13:00.000+07:00,478191,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN,36,1.73,United Kingdom
2,274869,6216287,2018-11-13T12:33:00.000+07:00,435225,LUNCH BAG RED RETROSPOT,36,2.28,United Kingdom
3,325500,6335791,2019-01-25T14:41:00.000+07:00,468489,TEA COSY RED STRIPE,18,3.52,United Kingdom
4,347550,6205485,2018-11-04T12:12:00.000+07:00,471051,WATERING CAN BLUE ELEPHANT,6,2.7,United Kingdom


## Data Preprocessing

In [3]:
# Extract data for items
items = df[['ItemCode', 'ItemDescription']].drop_duplicates().reset_index()
print(items.shape)
items.head()

(3625, 3)


Unnamed: 0,index,ItemCode,ItemDescription
0,0,445074,PINK HONEYCOMB PAPER FAN
1,1,478191,CLEAR DRAWER KNOB ACRYLIC EDWARDIAN
2,2,435225,LUNCH BAG RED RETROSPOT
3,3,468489,TEA COSY RED STRIPE
4,4,471051,WATERING CAN BLUE ELEPHANT


In [4]:
# Extract items for user
df_grouped = df.groupby(['UserId', 'ItemCode'], as_index=False)['NumberOfItemsPurchased'].sum()
user_purchases = df_grouped.groupby('UserId').apply(
    lambda x: dict(zip(x['ItemCode'], x['NumberOfItemsPurchased']))
).to_dict()

## Content-based filtering

In [5]:
# TF-IDF Vectorization for Content-Based Filtering
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(items['ItemDescription'])

In [6]:
# Calculate cosine similarity between items
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [7]:
def content_based_recommendations(user_id, user_purchases, cosine_sim, top_n=5):
    """
    Generate content-based product recommendations for a given user.

    This function takes a user's past purchases and finds similar products based on 
    a precomputed cosine similarity matrix. It returns a list of recommended item codes.

    Parameters:
    - user_id (int or str): The ID of the user for whom recommendations are generated.
    - user_purchases (dict): A dictionary mapping user IDs to a list of purchased product IDs.
    - cosine_sim (numpy.ndarray): A 2D array representing cosine similarity scores between items.
    - top_n (int, optional): The number of top similar items to consider for each purchased item (default is 3).

    Returns:
    - list: A list of unique ItemCodes recommended for the user.

    Steps:
    1. Retrieve the list of ItemCodes purchased by the user.
    2. For each purchased item:
       - Find its index in the items dataframe.
       - Get its cosine similarity scores with all other items.
       - Sort the scores in descending order to find the most similar items.
       - Select the top `top_n` similar items (excluding itself).
       - Add the recommended ItemCode to the list.
    3. Return a unique set of recommended ItemCode.

    """
    recommended_items = []
    
    for item_code, _ in user_purchases[user_id].items():
        idx = items[items['ItemCode'] == item_code].index[0]  # Find index of the item
        sim_scores = list(enumerate(cosine_sim[idx]))  # Get cosine similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort by similarity
        top_similar = sim_scores[1:top_n+1]  # Skip the first product (itself)

        for i in top_similar:
            item_idx = i[0]
            recommended_items.append(items.iloc[item_idx]['ItemCode'])
    
    return list(set(recommended_items))  # Return unique recommendations

In [8]:
# Test
print(content_based_recommendations(274869, user_purchases, cosine_sim))

[468993, 464898, 444423, 487431, 454671, 442386, 481299, 434196, 485394, 469014, 456729, 448539, 1781787, 440349, 444444, 471072, 462882, 493605, 1779750, 481320, 434217, 469035, 456750, 448560, 440370, 436275, 475188, 1783866, 493626, 442428, 481341, 464961, 452676, 483399, 475209, 1783887, 493647, 442449, 434259, 491610, 475230, 471135, 454755, 489573, 442470, 485478, 469098, 460908, 452718, 448623, 491631, 483441, 475251, 467061, 1783929, 493689, 442491, 434301, 469119, 460929, 456834, 452739, 448644, 491652, 483462, 471177, 454797, 1783950, 1779855, 493710, 469140, 456855, 467103, 1783971, 481446, 438438, 456876, 452781, 1011885, 444591, 483504, 475314, 471219, 450744, 446649, 1783992, 481467, 469182, 456897, 491715, 444612, 483525, 475335, 450765, 1775823, 481488, 434385, 1892562, 465108, 1011927, 1777881, 1773786, 479451, 475356, 454881, 450786, 477414, 1892583, 434406, 1011948, 491757, 315630, 471282, 463092, 450807, 1779960, 786681, 481530, 473340, 1786113, 444675, 483588, 4753

## SVD-Based Collaborative Filtering

In [9]:
# Prepare user-item interaction matrix
user_item_matrix = np.zeros((len(user_purchases), len(items)))

# Map UserID and ItemCode to matrix indices
user_index = {user_id: idx for idx, user_id in enumerate(user_purchases.keys())}
item_index = {item_code: idx for idx, item_code in enumerate(items['ItemCode'])}

# Fill the matrix with purchase counts
for user_id, purchases in user_purchases.items():
    for item_code, count in purchases.items():
        user_item_matrix[user_index[user_id], item_index[item_code]] = count

# Convert matrix to DataFrame for better visualization
user_item_df = pd.DataFrame(user_item_matrix, index=user_index.keys(), columns=items['ItemCode'])

Dimensionality Reduction: The svds() function from SciPy's sparse SVD (scipy.sparse.linalg.svds) computes a low-rank approximation of the original user-item matrix. Instead of using all singular values, we keep only the top k=5 singular values. This reduces the complexity of the model while still capturing the most significant features in the data.

Latent Factors: In a recommendation system, SVD is used to extract latent factors that represent hidden relationships between users and items.

-> Trade-off Between Accuracy and Complexity:

    - A small k keeps the model simple and computationally efficient.

    - A large k captures more information but increases computational cost.

In [10]:
# Perform Singular Value Decomposition (SVD)
U, sigma, Vt = svds(user_item_matrix, k=5)  # Use 5 latent factors
sigma = np.diag(sigma)

# Predict missing values in the user-item interaction matrix
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
print(predicted_ratings)

[[ 2.65312733e-01  1.12952149e+00  6.97713224e+00 ... -2.60088884e-05
   3.93742476e-03  5.37218350e-03]
 [ 1.57454572e-02 -1.18215475e-01  6.27536449e-01 ... -2.21861853e-07
  -1.34607911e-05  5.58334177e-05]
 [ 1.02229382e-01  1.10505939e+00  2.86624061e+00 ... -5.36213087e-06
   2.12855028e-03  2.63966889e-03]
 ...
 [ 6.07792197e-04  2.71342314e-01  4.94860960e-03 ...  1.60058603e-06
   4.50940258e-04  3.68124028e-04]
 [ 1.15916336e+00 -1.78931679e-01  7.05070578e+01 ...  3.13084842e-04
  -5.18777739e-03  1.31591347e-03]
 [ 5.29173920e-02  7.79992091e-01  2.05154678e-01 ... -1.45931638e-05
   7.41191862e-04  1.64516783e-03]]


In [11]:
def svd_recommendations(user_id, user_item_matrix, predicted_ratings, top_n=100):
    user_row = user_index[user_id]
    user_predicted_ratings = predicted_ratings[user_row, :]
    
    # Sort products based on predicted rating, ignoring products the user has already purchased
    purchased_items = set(user_purchases[user_id])
    recommended_items = [
        (item_code, user_predicted_ratings[item_index[item_code]]) 
        for item_code in items['ItemCode'] if item_code not in purchased_items
    ]
    
    recommended_items.sort(key=lambda x: x[1], reverse=True)
    
    # Return top N recommended products
    top_unique_recommendations = []
    unique_recommended_items = set() 
    
    for item, rating in recommended_items:
        if item not in unique_recommended_items:
            top_unique_recommendations.append(item)
            unique_recommended_items.add(item)
        if len(recommended_items) >= top_n:
            break

    return top_unique_recommendations

In [12]:
# Test
print(svd_recommendations(274869, user_item_matrix, predicted_ratings))

[470043]


## Hybrid Recommendation

In [13]:
def hybrid_recommendations(user_id, user_purchases, cosine_sim, user_item_matrix, predicted_ratings, weight_cbf=0.7, weight_svd=0.3, top_n=100):
    # Get Content-Based Filtering recommendations
    cbf_recommendations = content_based_recommendations(user_id, user_purchases, cosine_sim)
    
    # Get SVD-based recommendations
    svd_recommendations_list = svd_recommendations(user_id, user_item_matrix, predicted_ratings)
    
    # Combine the two recommendations (weighted sum)
    combined_recommendations = {}
    
    for product_id in cbf_recommendations:
        combined_recommendations[product_id] = weight_cbf
    
    for product_id in svd_recommendations_list:
        if product_id not in combined_recommendations:
            combined_recommendations[product_id] = weight_svd
        else:
            combined_recommendations[product_id] += weight_svd
    
    # Sort recommendations based on their weights (importance from both methods)
    sorted_recommendations = sorted(combined_recommendations.items(), key=lambda x: x[1], reverse=True)
    return [product for product, _ in sorted_recommendations[:top_n]]


In [14]:
# Test
print(hybrid_recommendations(274869, user_purchases, cosine_sim, user_item_matrix, predicted_ratings))

[470043, 468993, 464898, 444423, 487431, 454671, 442386, 481299, 434196, 485394, 469014, 456729, 448539, 1781787, 440349, 444444, 471072, 462882, 493605, 1779750, 481320, 434217, 469035, 456750, 448560, 440370, 436275, 475188, 1783866, 493626, 442428, 481341, 464961, 452676, 483399, 475209, 1783887, 493647, 442449, 434259, 491610, 475230, 471135, 454755, 489573, 442470, 485478, 469098, 460908, 452718, 448623, 491631, 483441, 475251, 467061, 1783929, 493689, 442491, 434301, 469119, 460929, 456834, 452739, 448644, 491652, 483462, 471177, 454797, 1783950, 1779855, 493710, 469140, 456855, 467103, 1783971, 481446, 438438, 456876, 452781, 1011885, 444591, 483504, 475314, 471219, 450744, 446649, 1783992, 481467, 469182, 456897, 491715, 444612, 483525, 475335, 450765, 1775823, 481488, 434385, 1892562, 465108]


## Evaluation

In [22]:
def evaluate_model_topK(test_data, recommended_products, k=5):
    y_true = []
    y_pred = []
    top_k_hits = 0  # To count how many times we have a relevant product in the top K
    
    for user_id in test_data:
        true_relevant = test_data[user_id]
        recommendations = recommended_products[user_id]
        
        # Check if any of the top K recommended items are in the true relevant items
        relevant_in_top_k = any(item in true_relevant for item in recommendations[:k])
        if relevant_in_top_k:
            top_k_hits += 1
        
        y_true.extend([1 if product in true_relevant else 0 for product in recommendations])
        y_pred.extend([1] * len(recommendations))  # All recommended products are assumed to be positive cases
    
    # Calculate precision, recall, and F1-score
    precision = precision_score(y_true, y_pred,zero_division=0)
    recall = recall_score(y_true, y_pred,zero_division=0)
    f1 = f1_score(y_true, y_pred,zero_division=0)
    
    # Calculate Top-K Accuracy
    top_k_accuracy = top_k_hits / len(test_data)
    
    
    return precision, recall, f1, top_k_accuracy


In [16]:
# Split the user purchase data into train and test
train_data = {}
test_data = {}

for user_id, purchases in user_purchases.items():
    # Split the items into train and test sets, preserving the items and their quantities
    item_codes = list(purchases.keys())  # Get the ItemCode list
    item_quantities = list(purchases.values())  # Get the corresponding quantities

    if len(purchases) > 1:
        # Randomly split the item codes and their quantities into train and test sets
        train_item_codes, test_item_codes, train_quantities, test_quantities = train_test_split(
            item_codes, item_quantities, test_size=0.2, random_state=42
        )
    else:
        train_item_codes, test_item_codes = item_codes, []
        train_quantities, test_quantities = item_quantities, []
    
    # Rebuild the train and test data dictionaries with ItemCode -> NumberOfItemsPurchased
    train_data[user_id] = dict(zip(train_item_codes, train_quantities))
    test_data[user_id] = dict(zip(test_item_codes, test_quantities))

# Evaluate models
hybrid_recommendations_dict = {}
content_based_recommendations_dict = {}
svd_recommendations_dict = {}
for index, user_id in enumerate(train_data):
    if index % 200 == 0 or index == len(train_data) - 1:
        print(f'Processing {index}/{len(train_data)}: user_id={user_id}')
    content_based_recommendations_dict[user_id] = content_based_recommendations(user_id, user_purchases, cosine_sim)
    svd_recommendations_dict[user_id] = svd_recommendations(user_id, user_item_matrix, predicted_ratings)
    hybrid_recommendations_dict[user_id] = hybrid_recommendations(user_id, train_data, cosine_sim, user_item_matrix, predicted_ratings)

Processing 0/4166: user_id=259287
Processing 200/4166: user_id=264558
Processing 400/4166: user_id=270312
Processing 600/4166: user_id=276570
Processing 800/4166: user_id=282828
Processing 1000/4166: user_id=288645
Processing 1200/4166: user_id=294756
Processing 1400/4166: user_id=300783
Processing 1600/4166: user_id=306537
Processing 1800/4166: user_id=312690
Processing 2000/4166: user_id=318654
Processing 2200/4166: user_id=324618
Processing 2400/4166: user_id=330540
Processing 2600/4166: user_id=336651
Processing 2800/4166: user_id=342741
Processing 3000/4166: user_id=348642
Processing 3200/4166: user_id=354522
Processing 3400/4166: user_id=360612
Processing 3600/4166: user_id=366786
Processing 3800/4166: user_id=372519
Processing 4000/4166: user_id=378861
Processing 4165/4166: user_id=384027


In [23]:
# Evaluate 3 approaches
for top_k in [3,5,7,10,20]:
    content_precision, content_recall, content_f1, content_top_k_accuracy = evaluate_model_topK(test_data, content_based_recommendations_dict, k=top_k)
    svd_precision, svd_recall, svd_f1, svd_top_k_accuracy = evaluate_model_topK(test_data, svd_recommendations_dict, k=top_k)
    hybrid_precision, hybrid_recall, hybrid_f1, hybrid_top_k_accuracy = evaluate_model_topK(test_data, hybrid_recommendations_dict, k=top_k)

    # Print results in a nice format
    print(f"Results for Top-{top_k} Recommendations:\n")
    
    # Content-based
    print(f"Content-Based Model:")
    print(f"  Precision: {content_precision:.4f}")
    print(f"  Recall: {content_recall:.4f}")
    print(f"  F1-Score: {content_f1:.4f}")
    print(f"  Top-{top_k} Accuracy: {content_top_k_accuracy:.4f}")
    print()
    
    # SVD-based
    print(f"SVD-based Model:")
    print(f"  Precision: {svd_precision:.4f}")
    print(f"  Recall: {svd_recall:.4f}")
    print(f"  F1-Score: {svd_f1:.4f}")
    print(f"  Top-{top_k} Accuracy: {svd_top_k_accuracy:.4f}")
    print()
    
    # Hybrid model
    print(f"Hybrid Model:")
    print(f"  Precision: {hybrid_precision:.4f}")
    print(f"  Recall: {hybrid_recall:.4f}")
    print(f"  F1-Score: {hybrid_f1:.4f}")
    print(f"  Top-{top_k} Accuracy: {hybrid_top_k_accuracy:.4f}")
    print("="*50)  # Divider for clarity between top-k results

Results for Top-3 Recommendations:

Content-Based Model:
  Precision: 0.0382
  Recall: 1.0000
  F1-Score: 0.0736
  Top-3 Accuracy: 0.0982

SVD-based Model:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Top-3 Accuracy: 0.0000

Hybrid Model:
  Precision: 0.0316
  Recall: 1.0000
  F1-Score: 0.0612
  Top-3 Accuracy: 0.0807
Results for Top-5 Recommendations:

Content-Based Model:
  Precision: 0.0382
  Recall: 1.0000
  F1-Score: 0.0736
  Top-5 Accuracy: 0.1517

SVD-based Model:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Top-5 Accuracy: 0.0000

Hybrid Model:
  Precision: 0.0316
  Recall: 1.0000
  F1-Score: 0.0612
  Top-5 Accuracy: 0.1349
Results for Top-7 Recommendations:

Content-Based Model:
  Precision: 0.0382
  Recall: 1.0000
  F1-Score: 0.0736
  Top-7 Accuracy: 0.2028

SVD-based Model:
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
  Top-7 Accuracy: 0.0000

Hybrid Model:
  Precision: 0.0316
  Recall: 1.0000
  F1-Score: 0.0612
  Top-7 Accuracy: 0.1812
R