In [1]:
#imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

import re, ast
from scipy.sparse import csr_matrix, hstack, vstack, issparse

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [2]:
# Load the prepared review-level data.
# parent_asin is pinned to str so that ASINs that look numeric (e.g. '0007922582')
# keep their leading zeros instead of being type-inferred as integers.
df = pd.read_csv('../data/prep.csv', dtype={'parent_asin': str})
df.columns

Index(['parent_asin', 'price', 'rating', 'user_id', 'merged_text'], dtype='object')

In [3]:
# Collapse the review-level rows into one row per product, indexed by parent_asin:
# mean price, mean rating, number of ratings, and the first merged_text of each group.
item_df = df.groupby('parent_asin').agg(
    price=pd.NamedAgg(column='price', aggfunc='mean'),
    rating=pd.NamedAgg(column='rating', aggfunc='mean'),
    num_ratings=pd.NamedAgg(column='rating', aggfunc='count'),
    text=pd.NamedAgg(column='merged_text', aggfunc='first'),
)

item_df.head()

Unnamed: 0_level_0,price,rating,num_ratings,text
parent_asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7922582,51.0,2.0,1,The Sneetches and Other Stories Too small w...
8288194,14.0,3.0,1,The Creativity Code Video Games PC Gam...
28179714,15.0,4.666667,3,The Autobiography of Miss Jane Pittman and Rel...
60501960,8.0,4.470588,17,Presidents Day Video Games PC Games ...
63052164,20.0,5.0,4,Stranger Planet AUTOGRAPHED SIGNED BOOK Vi...


In [None]:
# TF-IDF transformation of each item's text (vocabulary capped at 5000 terms,
# English stop words removed).
# Missing text is replaced by '' first: TfidfVectorizer raises on NaN documents,
# which the 'first' aggregation produces when an item has no text at all.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(item_df['text'].fillna(''))
tfidf_matrix.shape

(137249, 5000)

In [5]:
# Standardise the numeric item features (currently only price) with StandardScaler.
numeric_features = ['price']
scaler = StandardScaler().fit(item_df[numeric_features])

numeric_scaled = scaler.transform(item_df[numeric_features])

In [16]:
# Build the hybrid feature matrix: the TF-IDF columns followed by the scaled numeric columns.
# np.nan_to_num turns NaNs in the scaled numerics (e.g. a missing price) into 0.
numeric_sparse = csr_matrix(np.nan_to_num(numeric_scaled))

# Ask for CSR explicitly: the recommender slices single rows out of this matrix,
# and CSR supports row indexing regardless of which format hstack would pick by default.
hybrid_matrix = hstack([tfidf_matrix, numeric_sparse], format='csr')

# Positional lookup: entry i is the parent_asin of row i of hybrid_matrix
# (both are built from item_df in the same row order).
tfidf_index = item_df.index.to_series().reset_index(drop=True)
hybrid_matrix.shape

(137249, 5001)

In [17]:
# Preview the position -> parent_asin lookup (entry i names row i of hybrid_matrix).
tfidf_index

0         0007922582
1         0008288194
2         0028179714
3         0060501960
4         0063052164
             ...    
137244    B0CKPC5CD1
137245    B0CKPNZ88R
137246    B0CKQHY5K4
137247    B0CKQNJB3T
137248    B0CKR257MD
Name: parent_asin, Length: 137249, dtype: object

In [None]:
# Receive a user. A user hopefully contains a list of reviews with ratings, or just a thumbs up/down.

# The Flask POST request gives us a list of parent_asins and the associated ratings.
# Can't fully read it, but it seems like if the user doesn't rate an item, it's not in the POST request.

# Test example for now, written as (parent_asin, rating) pairs.
# The first item is deliberately listed twice with different ratings.
user_reviews = [
    {'parent_asin': asin, 'rating': stars}
    for asin, stars in (
        ('B00002S9MH', 5),
        ('B00002S9MH', 3),
        ('B000FH0MHO', 4),
    )
]

def return_recommended_itmes(user_reviews, tfidf_index, hybrid_matrix, top_k=10, top_n=10):
    """Recommend items similar to the ones a user has rated.

    For every reviewed item that exists in the catalogue, the ``top_k`` most
    cosine-similar rows of ``hybrid_matrix`` are selected (the reviewed item
    itself is normally among them and is dropped, so it contributes at most
    ``top_k - 1`` neighbours). Each neighbour accumulates
    ``rating * similarity``; the final score of a candidate is that sum divided
    by the sum of ``abs(similarity)``, i.e. a similarity-weighted average of the
    ratings of the reviewed items that led to it. A candidate reached from a
    single reviewed item therefore scores exactly that item's rating.

    Parameters
    ----------
    user_reviews : iterable of dict
        Each dict has the keys ``'parent_asin'`` and ``'rating'``. Reviews whose
        ``parent_asin`` is not in ``tfidf_index`` are ignored.
    tfidf_index : sequence of str (e.g. pandas Series)
        ``tfidf_index`` position ``i`` is the parent_asin of row ``i`` of
        ``hybrid_matrix``. The lookup is positional, not label-based.
    hybrid_matrix : row-sliceable 2-D matrix (scipy CSR or NumPy array)
        Item feature matrix, one row per entry of ``tfidf_index``.
    top_k : int, default 10
        Number of most-similar rows inspected per reviewed item. Values larger
        than the number of items are clamped.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of (parent_asin, score) tuples
        Sorted by descending score, never containing an item the user already
        reviewed. May hold fewer than ``top_n`` entries when there are not
        enough candidates.
    """
    # Object dtype keeps the == comparison below element-wise for any key type.
    asins = np.asarray(tfidf_index, dtype=object)
    n_items = asins.shape[0]
    if n_items == 0 or top_k <= 0 or top_n <= 0:
        return []

    # argpartition raises if asked for more elements than exist, so clamp k.
    k = min(int(top_k), n_items)

    # Built once instead of once per ranked candidate.
    reviewed_asins = {review['parent_asin'] for review in user_reviews}

    # Two dictionaries keyed by row position: the accumulated weighted score and
    # the accumulated |similarity| used to normalise it afterwards.
    scores = {}
    sim_sums = {}

    # row position -> [(neighbour position, similarity), ...]; avoids recomputing
    # the full similarity vector when the same item is reviewed more than once.
    neighbor_cache = {}

    for review in user_reviews:
        parent_asin = review['parent_asin']
        rating = review['rating']

        matches = np.flatnonzero(asins == parent_asin)
        if matches.size == 0:
            # Unknown item: nothing to base a recommendation on.
            continue
        row_idx = int(matches[0])

        if row_idx not in neighbor_cache:
            # Slicing (rather than integer indexing) keeps the query 2-D for both
            # sparse and dense matrices.
            sims = cosine_similarity(hybrid_matrix[row_idx:row_idx + 1], hybrid_matrix)[0]

            # Pick the k largest similarities, then order them high -> low.
            top_k_idx = np.argpartition(sims, -k)[-k:]
            top_k_idx = top_k_idx[np.argsort(sims[top_k_idx])[::-1]]

            # Exclude self-similarity.
            neighbor_cache[row_idx] = [
                (int(idx), sims[idx]) for idx in top_k_idx if idx != row_idx
            ]

        # Weight of a neighbour = user's rating of the reviewed item * similarity.
        for neighbor_idx, sim_val in neighbor_cache[row_idx]:
            scores[neighbor_idx] = scores.get(neighbor_idx, 0) + rating * sim_val
            sim_sums[neighbor_idx] = sim_sums.get(neighbor_idx, 0) + abs(sim_val)

    # Normalise. Candidates whose total similarity is 0 carry no signal and would
    # otherwise produce a 0/0 = NaN score that corrupts the sort, so drop them.
    ranked_scores = {
        idx: score / sim_sums[idx]
        for idx, score in scores.items()
        if sim_sums[idx] > 0
    }
    ranked_items = sorted(ranked_scores.items(), key=lambda x: x[1], reverse=True)

    # Walk the full ranking (not just its first top_n entries) so that skipping
    # already-reviewed items does not shorten the result.
    recommended_items = []
    for idx, score in ranked_items:
        asin = asins[idx]
        if asin in reviewed_asins:
            continue
        recommended_items.append((asin, score))
        if len(recommended_items) >= top_n:
            break

    return recommended_items


# TODO:
# find a way to tie these parent_asins 
# back to the title of the products, maybe need to keep title as part of prep_df

In [19]:
# Run the recommender on the example user_reviews; the result is a list of (parent_asin, score) tuples.
return_recommended_itmes(user_reviews, tfidf_index, hybrid_matrix, top_k=10, top_n=10)

[('B000068WSA', np.float64(4.0)),
 ('B000HWSKNK', np.float64(4.0)),
 ('B00N52DN8Q', np.float64(4.0)),
 ('B001W30G44', np.float64(4.0)),
 ('B0015RCVRM', np.float64(4.0)),
 ('B018K31N68', np.float64(4.0)),
 ('B0001DB6J0', np.float64(4.0)),
 ('B000IN4V6S', np.float64(4.0)),
 ('B00004SV4Y', np.float64(4.0)),
 ('B0030VNLS4', np.float64(4.0))]