In [2]:
#imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

import re, ast
from scipy.sparse import csr_matrix, hstack, vstack, issparse

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

Matplotlib is building the font cache; this may take a moment.


In [3]:
df = pd.read_csv('../data/prep.csv')
print(f" Loaded! Shape: {df.shape}")
print(f" Columns: {df.columns.tolist()}")
print(f" Unique books: {df['parent_asin'].nunique():,}")
print(f" Unique users: {df['user_id'].nunique():,}")

 Loaded! Shape: (4624615, 9)
 Columns: ['parent_asin', 'price', 'product_title', 'categories', 'rating', 'user_id', 'review_title', 'text', 'merged_text']
 Unique books: 137,249
 Unique users: 2,766,656


In [4]:
def extract_unique_items(df):

    print("Extracting unique items and building metadata table...\n")

    # Validate required columns
    required_columns = ['parent_asin', 'price', 'rating', 'merged_text']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")

    # Aggregate at the item level
    item_df = (
        df.groupby('parent_asin')
        .agg(
            price = ('price', 'first'),
            avg_rating = ('rating', 'mean'),
            num_ratings = ('rating', 'count'),
            text = ('merged_text', 'first'),
        )
    ).reset_index()

    # Display summary and preview
    print(f"Extracted {item_df.shape[0]:,} unique items.")
    print("Preview of item metadata:")
    print(item_df.head())

    return item_df

In [5]:
item_df = extract_unique_items(df)

Extracting unique items and building metadata table...

Extracted 137,249 unique items.
Preview of item metadata:
  parent_asin  price  avg_rating  num_ratings  \
0  0007922582     51    2.000000            1   
1  0008288194     14    3.000000            1   
2  0028179714     15    4.666667            3   
3  0060501960      8    4.470588           17   
4  0063052164     20    5.000000            4   

                                                text  
0  The Sneetches and Other Stories [] Too small w...  
1  The Creativity Code ['Video Games', 'PC', 'Gam...  
2  The Autobiography of Miss Jane Pittman and Rel...  
3  Presidents' Day ['Video Games', 'PC', 'Games']...  
4  Stranger Planet AUTOGRAPHED / SIGNED BOOK ['Vi...  


In [6]:
def build_item_representation(df, max_features=10000, min_df=2, ngram_range=(1,2)):

    print("Building hybrid item representations (text + numeric)...\n")

    # TF-IDF vectorization for textual metadata
    tfidf = TfidfVectorizer(max_features=max_features, min_df=min_df, ngram_range=ngram_range, stop_words='english')
    tfidf_matrix = tfidf.fit_transform(item_df['text'])

    # Select and normalize numeric features
    numeric_features = ['price', 'avg_rating', 'num_ratings']
    scaler = StandardScaler()

    # Convert to numeric safely
    numeric_data = df[numeric_features].apply(pd.to_numeric, errors="coerce").fillna(0)
    numeric_scaled = scaler.fit_transform(numeric_data)
    print(f"Numeric features scaled (columns: {numeric_features})")

    # Convert to sparse format for concatenation
    numeric_sparse = np.nan_to_num(numeric_scaled)

    # Concatenate text and numeric representations
    hybrid_matrix = hstack([tfidf_matrix, numeric_sparse]).tocsr()
    print(f"Final hybrid matrix shape: {hybrid_matrix.shape}")

    # Maintain item lookup for interpretation
    tfidf_index = df["parent_asin"].reset_index(drop=True)
    print("Created lookup table linking vectors to item parent_asin.\n")
    print(tfidf_index.head())

    return hybrid_matrix, tfidf, tfidf_index

In [7]:
hybrid_matrix, tfidf, tfidf_index = build_item_representation(item_df)

Building hybrid item representations (text + numeric)...

Numeric features scaled (columns: ['price', 'avg_rating', 'num_ratings'])
Final hybrid matrix shape: (137249, 10003)
Created lookup table linking vectors to item parent_asin.

0    0007922582
1    0008288194
2    0028179714
3    0060501960
4    0063052164
Name: parent_asin, dtype: object


In [None]:
# receive an user... A user hopefully contains a list of reviews with ratings, or just a thumbs up and down.

# flask POST request gives us a list of parent_asins and the associated ratinsg
# Can't fully read it, but it eems like if the user doesn't rate, its not in the post request.

# test example for now:
user_reviews = [
    {'parent_asin': 'B00069EVOG', 'rating': 5},
    {'parent_asin': 'B00002S9MH', 'rating': 3},
    {'parent_asin': 'B000FH0MHO', 'rating': 4},
]

def return_recommended_items(user_reviews, tfidf_index, hybrid_matrix, top_k = 10, top_n=10):
    # two dictionaries: one to store weighted scores, and a total similairty sum, so I can normalize later
    # both are keyed by item_index : val
    scores = {}      
    sim_sums = {}
    
    # For each review the user has given:
    for review in user_reviews:
        parent_asin = review['parent_asin']
        rating = review['rating']
        
        # Double check validity
        if parent_asin in tfidf_index.values:

            # Get the row index of the item in the hybird matrix, then compute cosine similarity
            row_idx = tfidf_index[tfidf_index == parent_asin].index[0]
            sims = cosine_similarity(hybrid_matrix[row_idx], hybrid_matrix)[0]

            # Top_k determines how many similar items to consider for each item the user has rated
            # runtime gets longer for higher k vals, we can disscuss val later
            k = top_k
            
            # Get the indices of the top k similar items
            top_k_idx = np.argpartition(sims, -k)[-k:]
            top_k_idx = top_k_idx[np.argsort(sims[top_k_idx])[::-1]]
            top_k_sims = sims[top_k_idx]


            # calculate those similarity scores
            # formula I use is user rating of current item * similarity score
            for neighbor_idx, sim_val in zip(top_k_idx, top_k_sims):

                # Exclude self-similarity
                if neighbor_idx == row_idx:
                    continue
                weight = rating * sim_val
                scores[neighbor_idx] = scores.get(neighbor_idx, 0) + weight
                sim_sums[neighbor_idx] = sim_sums.get(neighbor_idx, 0) + abs(sim_val)

    # sort our scores, and also normalize. 
    # This normalzation order was given to me by ChatGPT, we can disscuss validly later.
    ranked_scores = {idx: score / sim_sums[idx] for idx, score in scores.items()}
    ranked_items = sorted(ranked_scores.items(), key=lambda x: x[1], reverse=True)

    # Now, just return the top_n items, excluding any the user has already reviewed
    recommended_items = []
    amount_to_return = top_n
    for idx, score in ranked_items[:top_n]:
        if tfidf_index[idx] not in [review['parent_asin'] for review in user_reviews]:
            recommended_items.append((tfidf_index[idx], float(score)))
        else:
            amount_to_return += 1
            
    return recommended_items


# TODO:
# find a way to tie these parent_asins 
# back to the title of the products, maybe need to keep title as part of prep_df

# TODO:prep
# create a user embedding and see if thats better. 
'''
user_vector = np.zeros(embedding_dim)
total_weight = 0

for review in user_reviews:
    idx = tfidf_index[tfidf_index == review['parent_asin']].index[0]
    item_vec = hybrid_matrix[idx]
    weight = review['rating']
    
    user_vector += weight * item_vec
    total_weight += weight

user_vector /= total_weight
sims = cosine_similarity(user_vector.reshape(1, -1), hybrid_matrix)[0]
'''

"\nuser_vector = np.zeros(embedding_dim)\ntotal_weight = 0\n\nfor review in user_reviews:\n    idx = tfidf_index[tfidf_index == review['parent_asin']].index[0]\n    item_vec = hybrid_matrix[idx]\n    weight = review['rating']\n\n    user_vector += weight * item_vec\n    total_weight += weight\n\nuser_vector /= total_weight\nsims = cosine_similarity(user_vector.reshape(1, -1), hybrid_matrix)[0]\n"

In [9]:
return_recommended_items(user_reviews, tfidf_index, hybrid_matrix, top_k=10, top_n=10)

[('B0002MHF2C', 5.0),
 ('B005CWFVJQ', 5.0),
 ('B00005OUIW', 5.0),
 ('B09ZXZH48T', 5.0),
 ('B01LYST149', 5.0),
 ('B0000E2XEG', 5.0),
 ('B0009QVE6O', 5.0),
 ('B000VJG6VM', 5.0),
 ('B0085Z8V5I', 5.0),
 ('4591153320', 4.0)]

In [10]:
user_reviews = {'parent_asin': 'B007X47EK0', 'rating': 5, 'parent_asin': 'B004A7FOIC', 'rating': 5},
recommended_items = return_recommended_items(user_reviews, tfidf_index, hybrid_matrix, top_k=10, top_n=10)

In [11]:
def get_items_details(recommended_items, item_df):
    # Convert recommended items to DataFrame
    rec_df = pd.DataFrame(recommended_items, columns=['parent_asin', 'score'])
    
    # Merge with item_df to get details
    detailed_rec_df = rec_df.merge(item_df, on='parent_asin', how='left')
    
    return detailed_rec_df

detailed_recommendations = get_items_details(recommended_items, item_df)
print(detailed_recommendations)

  parent_asin  score  price  avg_rating  num_ratings  \
0  B07NVTR5FD    5.0     22    4.333333           57   
1  B07HM9N443    5.0     19    4.352941           17   
2  B076CZJWWW    5.0     20    5.000000            1   
3  B000PDY2JW    5.0     28    5.000000            1   
4  B000VWRH80    5.0     27    5.000000            1   
5  B078QC63NX    5.0     24    4.513514           37   
6  B01M63WQUQ    5.0     25    5.000000            2   
7  B018LDDZN6    5.0     25    5.000000            1   
8  B018JPBOG6    5.0     28    5.000000            4   

                                                text  
0  Star Wars Battlefront II - Xbox One ['Video Ga...  
1  Star Wars Battlefront - Xbox (Renewed) ['Video...  
2  Star Wars Battlefront II 2100 Crystals - Xbox ...  
3  Star Wars Battlefront - Xbox ['Video Games', '...  
4  Star Wars Battlefront: Renegade Squadron (PSP)...  
5  Star Wars Battlefront II PS4 ['Video Games', '...  
6  EA Star Wars Battlefront (Xbox One) with Exclu...  