# NIRS: Neural-based recommender system with a focus on interpretability

In [40]:
import pandas as pd
import numpy as np
import gzip
import json
import random
import sklearn
import torch


def seed_everything(seed=42):
    # Seed the random number generator
    random.seed(seed)

    # Seed NumPy
    np.random.seed(seed)

    # Seed scikit-learn
    sklearn.utils.check_random_state(seed)

    # Seed PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    # Set pandas options
    pd.set_option('display.max_columns', None)  # Display all columns in pandas DataFrames
    pd.set_option('display.max_rows', None)  # Display all rows in pandas DataFrames
    pd.set_option('display.width', None)  # Disable column width restriction
    pd.set_option('display.expand_frame_repr', False)  # Prevent line wrapping in pandas DataFrames


seed_everything(seed=42)

## Setup data

In [41]:
# read the dataset from json
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l.strip())
    
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
    
  return pd.DataFrame.from_dict(df, orient='index')


def sample_data(reviews_df, products_df, min_reviews_count=10, max_users=1000, frac_sampled_products=0.1):
    # Sample a subset of users based on the number of reviews they have
    user_reviews_count = reviews_df['reviewerID'].value_counts()
    selected_users = user_reviews_count[user_reviews_count >= min_reviews_count].index[:max_users]
    reviews_subset: pd.DataFrame = reviews_df[reviews_df['reviewerID'].isin(selected_users)]

    # Sample a subset of products based on popularity or ratings
    # You can use salesRank or overall ratings for this purpose
    sampled_products: pd.DataFrame = products_df.sample(frac=frac_sampled_products, random_state=42)

    return reviews_subset, sampled_products

def count_nan_values(df):
    nan_counts = df.isna().sum()
    return nan_counts[nan_counts > 0]

def count_empty_strings(df):
    empty_string_counts = (df == '').sum()
    return empty_string_counts[empty_string_counts > 0]

In [42]:
df_reviews = getDF('data/Office_Products_5.json.gz').drop(['verified', 'reviewTime', 'style', 'image', 'vote'], axis=1)

In [43]:
df_reviews.shape

(800357, 7)

In [44]:
df_products = getDF('data/meta_Office_Products.json.gz').drop(["imageURL", "imageURLHighRes", 'tech1', 'tech2', 'fit'], axis=1)

In [45]:
count_nan_values(df_reviews)

reviewerName    140
reviewText      213
summary         129
dtype: int64

In [46]:
df_products.head()

Unnamed: 0,category,description,title,also_buy,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"[Office Products, Office & School Supplies, Ed...",[Sequential Spelling is based on the classic O...,Sequential Spelling Level 1 Bundle with Studen...,[],STL Distributors,[],"[>#439,654 in Office Products (See top 100), >...","[1935943065, 1935943073, B00IJH9Q4M, 002115021...",Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","August 15, 2014",$32.90,12624861,
1,"[Office Products, Office &amp; School Supplies...","[Unusual book, , ]","Mathematics, Applications and Concepts, Course...",[],bailey,[],"3,839,628 in Books (",[],Books,,,$8.62,78652669,
2,[],[Pearson MyHistoryLab Online Access Code for A...,Pearson MyHistoryLab Online Access Code for Am...,[],Pearson MyHistoryLab,[Pearson MyHistoryLab Online Access Code for A...,"[>#1,925,354 in Office Products (See top 100)]",[],Office Products,,"June 21, 2012",$0.99,136039847,
3,"[Office Products, Office & School Supplies, Ed...",[Corduroy the bear goes to the launderette wit...,A Pocket for Corduroy,"[0140501738, 0448421917, 0670063428, 042528875...",Ingram Book & Distributor,[9780140503524],"[>#422,894 in Office Products (See top 100), >...",[0140501738],Office Products,,"September 14, 2006",$0.95,140503528,
4,"[Office Products, Office & School Supplies, Ed...","[<div class=""aplus""> <div class=""leftImage"" st...",Social Entrepreneurship: What Everyone Needs t...,"[0195334760, 1613630328, 1422104060, 158648956...",Visit Amazon's David Bornstein Page,[],"110,732 in Books (","[0195334760, 1586489569, 1613630328, 142210406...",Books,,,,195396332,


In [47]:
count_nan_values(df_products)

details    7147
dtype: int64

In [48]:
count_empty_strings(df_products)

title                3
brand             4865
main_cat          1983
similar_item    194977
date             39165
price           142745
dtype: int64

In [49]:
df_products.shape

(315458, 14)

In [50]:
df_reviews_sampled, df_products_sampled = sample_data(df_reviews, df_products, min_reviews_count=10, frac_sampled_products=0.1)

In [51]:
print(f'Shape of the sampled reviews dataset: {df_reviews_sampled.shape}')
print(f'Shape of the sampled products dataset: {df_products_sampled.shape}')

Shape of the sampled reviews dataset: (45779, 7)
Shape of the sampled products dataset: (31546, 14)


In [52]:
def save_sampled_data(reviews_df, products_df, reviews_file, products_file):
    reviews_df.to_csv(reviews_file, index=False)
    products_df.to_csv(products_file, index=False)

save_sampled_data(df_reviews_sampled, df_products_sampled, 'data/reviews_sampled.csv', 'data/products_sampled.csv')

In [53]:
# test reading from the csv
df_reviews_sampled = pd.read_csv('data/reviews_sampled.csv')
df_products_sampled = pd.read_csv('data/products_sampled.csv')

In [16]:
from bs4 import BeautifulSoup


html = df_products.iloc[0]['tech2']
print(html)

if html is not None and html != '' and html != '[]' and html != '{}' \
    and html != 'No' and html != 'no' and html != 'N/A' and html != 'none' \
    and html != 'None' and html != 'NA' and html != 'na':
    
    # Parse the HTML
    soup = BeautifulSoup(html, 'html.parser')

    # Find all table rows ('tr' elements)
    rows = soup.find_all('tr')

    # Extract the key-value pairs
    data = {}
    for row in rows:
        th = row.find('th')  # Find the table header ('th') element
        td = row.find('td')  # Find the table data ('td') element
        
        if th and td:
            key = th.text.strip()  # Extract the key and remove leading/trailing whitespace
            value = td.text.strip()  # Extract the value and remove leading/trailing whitespace
            data[key] = value  # Store the key-value pair in the data dictionary

    # Print the extracted data
    for key, value in data.items():
        print(f'{key}: {value}')

nan


TypeError: object of type 'float' has no len()

In [76]:
df_products_filtered = df_products[df_products['main_cat'] == 'Office Products']
df_products_filtered.head()

Unnamed: 0,category,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,details
0,"[Office Products, Office & School Supplies, Ed...",[Sequential Spelling is based on the classic O...,,Sequential Spelling Level 1 Bundle with Studen...,[],,STL Distributors,[],"[>#439,654 in Office Products (See top 100), >...","[1935943065, 1935943073, B00IJH9Q4M, 002115021...",Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","August 15, 2014",$32.90,12624861,
2,[],[Pearson MyHistoryLab Online Access Code for A...,,Pearson MyHistoryLab Online Access Code for Am...,[],,Pearson MyHistoryLab,[Pearson MyHistoryLab Online Access Code for A...,"[>#1,925,354 in Office Products (See top 100)]",[],Office Products,,"June 21, 2012",$0.99,136039847,
3,"[Office Products, Office & School Supplies, Ed...",[Corduroy the bear goes to the launderette wit...,,A Pocket for Corduroy,"[0140501738, 0448421917, 0670063428, 042528875...",,Ingram Book & Distributor,[9780140503524],"[>#422,894 in Office Products (See top 100), >...",[0140501738],Office Products,,"September 14, 2006",$0.95,140503528,
5,"[Office Products, Office & School Supplies, Bo...",[A good helper to help you record your reading...,,EKLOEN Mixed Designs of Antiqued Bronze Colour...,"[B00BLY6POE, B00G8WV5U8, 0307591662, B019XJZHQ...",,EKLOEN,"[It is not only the bookmarks, but also art, I...","[>#43,748 in Office Products (See top 100), >#...",[],Office Products,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 2015",$17.50,245109919,
23,"[Office Products, Office & School Supplies, Fo...",[Unique DesignNovel Appearance<br> Electroplat...,,Jzcky Shzrp Unique Appearance Premium Quality ...,[],,Jzcky Shzrp,"[Unique Appearance Design, Material:Eco-friend...","[>#835,737 in Office Products (See top 100), >...",[],Office Products,,"February 19, 2016",,357051378,


## Analyze data

In [18]:
# Load the user reviews DataFrame
reviews_df = pd.read_csv('user_reviews.csv')

# Load the products dataset DataFrame
products_df = pd.read_csv('products.csv')

def get_user_reviews(user_id):
    # Filter the reviews DataFrame for a specific user
    user_reviews = reviews_df[reviews_df['user_id'] == user_id]

    # Join the reviews DataFrame with the products DataFrame using 'asin' as the key
    user_reviews = user_reviews.merge(products_df, on='asin')

    return user_reviews[['asin', 'product_name', 'review']]

# Example usage
user_id = '1234'
user_reviews = get_user_reviews(user_id)
print(f"User {user_id} has reviewed the following products:")
print(user_reviews)

FileNotFoundError: [Errno 2] No such file or directory: 'user_reviews.csv'

In [None]:
#!pip install textblob

In [4]:
import re
import string
from bs4 import BeautifulSoup
from unidecode import unidecode
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


class TextPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self._preprocess(text) for text in X]

    def _preprocess(self, text):
        # Lowercasing
        text = text.lower()
        # Remove accented characters
        text = unidecode(text)
        
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        
        # remove punctuation
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
        
        # remove double spaces
        text = re.sub(' +', ' ', text)
        
        # Tokenize text
        words = word_tokenize(text)
        # Remove stopwords and lemmatize
        words = [self.lemmatizer.lemmatize(
            word) for word in words if word not in self.stop_words]
        
        return ' '.join(words)

In [10]:
# 5 clusters built with the text features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

preprocessor = Pipeline([
  ('text_preprocessor', TextPreprocessor()),
  ('vectorizer', TfidfVectorizer()),
])

# Convert text features to numerical representations using TF-IDF vectorization
df_users = df
#clean rows where reviewText is NaN
df_users = df_users.dropna(subset=['reviewText'])

X = preprocessor.fit_transform(df_users['reviewText'])

print(X.shape)

# Calculate cosine similarity matrix
#cosine_sim = cosine_similarity(X)

# Apply K-means clustering
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

# Add cluster labels to the DataFrame
df_users['cluster'] = clusters

# Print the cluster assignments


(800089, 155949)


  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users['cluster'] = clusters


In [11]:
print(df_users[['reviewerName', 'cluster', 'reviewText']])

             reviewerName  cluster  \
0             cotton clay        2   
1                emankcin        2   
2           Starbucks Fan        2   
3        Caitlyn Jacobson        2   
4                E. Ervin        2   
...                   ...      ...   
800352               Anky        2   
800353                 DM        0   
800354  Verdant Treasures        2   
800355              C. F.        0   
800356     Tegan M. Reyes        2   

                                               reviewText  
0       kids like story BUT while i really wanted a bo...  
1       Bought this used and it came in great conditio...  
2       Every story and book about Corduroy is Fantast...  
3       I purchased this book for my first grade class...  
4       Having spent numerous years in an elementary s...  
...                                                   ...  
800352              Delivered on time and is as expected.  
800353                                      worked great.  
800

In [13]:
#show examples of each cluster
for i in range(k):
    print(f"Cluster {i} example:")
    print(df_users[df_users['cluster'] == i].head(5)[['reviewerName', 'cluster', 'reviewText']])
    print('\n')

Cluster 0 example:
        reviewerName  cluster                    reviewText
44   Jason W. Bishop        0                Great product.
138          Sue Kim        0              great condition.
159  Amazon Customer        0                 great product
176       T. Darling        0                        Great!
188         lawrence        0  Great product for the price.


Cluster 1 example:
       reviewerName  cluster                                 reviewText
29       BRANDON K.        1  Good product, does what it's supposed to.
115  rose ann ramos        1                               good product
136       Yesha Luo        1                                       good
153   terry lasyone        1                               Good product
192          Rafael        1                                       good


Cluster 2 example:
       reviewerName  cluster  \
0       cotton clay        2   
1          emankcin        2   
2     Starbucks Fan        2   
3  Caitlyn Jacobson

In [22]:
#!pip install transformers

In [5]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
import pandas as pd


In [34]:
from tqdm import tqdm
import torch
from transformers import DistilBertTokenizer, DistilBertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

def get_bert_embeddings(sentences, batch_size=32):
    model.eval()
    embeddings = []
    progress_bar = tqdm(range(0, len(sentences), batch_size), desc="Generating Embeddings")

    with torch.no_grad():
        for i in progress_bar:
            batch_sentences = sentences[i:i+batch_size]
            inputs = tokenizer(batch_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :]  # Use the embedding of the [CLS] token
            embeddings.append(batch_embeddings.cpu())  # Move embeddings back to CPU if needed

    return torch.cat(embeddings, dim=0)



In [40]:
# Convert the DataFrame column to a list of reviews
# Ensure all reviews are strings; this also handles NaN values gracefully
df_reviews['summary'] = df_reviews['summary'].dropna().astype(str)
#turn all elements to string
df_reviews['summary'] = df_reviews['summary'].astype(str)


#reviews = 1% of reviews
reviews = df_reviews['summary'][:int(len(df_reviews['summary'])* 0.1)]

#print(reviews)


preprocessor = TextPreprocessor()
preprocessed_reviews =  preprocessor.fit_transform(reviews)

#print(preprocessed_reviews)
print(len(preprocessed_reviews))
# Generate BERT embeddings

#show progress bar

embeddings = get_bert_embeddings(preprocessed_reviews)
print(embeddings)

80035


Generating Embeddings: 100%|██████████| 2502/2502 [12:35<00:00,  3.31it/s]  


tensor([[-0.4292, -0.3136,  0.0209,  ..., -0.0545,  0.2543,  0.1028],
        [-0.1332, -0.0620,  0.0943,  ..., -0.0172,  0.2085,  0.1851],
        [-0.3114, -0.1208,  0.0139,  ..., -0.1258,  0.2320,  0.2977],
        ...,
        [-0.2222,  0.1290, -0.1111,  ..., -0.0086,  0.4601,  0.3364],
        [ 0.0441, -0.0720,  0.0839,  ..., -0.0750,  0.1904,  0.0642],
        [-0.3814, -0.2903,  0.0730,  ..., -0.1287, -0.0366,  0.1214]])


In [78]:
# make a new dataframe where description column is a string instead of a list
#remove first and last character of the string
df_products_filtered['description'] = df_products_filtered['description'].astype(str)
df_products_filtered['description'] = df_products_filtered['description'].str[1:-1]
df_products_filtered['description'].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products_filtered['description'] = df_products_filtered['description'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products_filtered['description'] = df_products_filtered['description'].str[1:-1]


0     'Sequential Spelling is based on the classic O...
2     'Pearson MyHistoryLab Online Access Code for A...
3     'Corduroy the bear goes to the launderette wit...
5     'A good helper to help you record your reading...
23    'Unique DesignNovel Appearance<br> Electroplat...
Name: description, dtype: object

In [80]:
prod_descriptions = df_products_filtered['description'][:int(len(df_products_filtered['description'])* 0.1)]

preprocessor = TextPreprocessor()
preprocessed_descriptions =  preprocessor.fit_transform(prod_descriptions)

# Generate BERT embeddings
embeddings = get_bert_embeddings(preprocessed_descriptions)
print(embeddings)

Generating Embeddings: 100%|██████████| 7/7 [00:12<00:00,  1.80s/it]

tensor([[-0.3614,  0.0404, -0.0675,  ...,  0.0223,  0.1958,  0.1585],
        [-0.3093,  0.0895, -0.2680,  ..., -0.2148,  0.3053,  0.5857],
        [-0.4199, -0.2032, -0.0889,  ..., -0.3773,  0.1998, -0.0556],
        ...,
        [-0.3254, -0.2669,  0.2629,  ..., -0.2515,  0.1685, -0.0035],
        [-0.2522, -0.0139, -0.0895,  ..., -0.2284,  0.2832,  0.2796],
        [-0.6760, -0.2615,  0.1202,  ..., -0.0768, -0.1347,  0.5695]])





In [None]:
# Convert embeddings tensor to numpy array
X = embeddings.numpy()

# Apply K-means clustering
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(X)

# Add cluster labels to the DataFrame
df_users['cluster'] = clusters
