In [1]:
import pandas as pd
import numpy as np
import collections

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scipy
import math
import random
import sklearn
import string

import nltk
from nltk.corpus import stopwords #Manually download
#nltk.download("stopwords")

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

from util import rmse
from util import ndcg_k
from util import divSco_k
from util import gettail

# Minimum number of books a word must appear in to be kept in the vocabulary.
MIN_WORD_COUNT = 15

# Input tables: book metadata, the full ratings set, and its train/test splits.
reallyfinalbooks = pd.read_csv('reallyfinalbooks.csv')
reallyfinalratings = pd.read_csv('reallyfinalratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# User x book rating matrices, with both axes sorted by id.
traintable = (
    pd.pivot_table(data=train, index="newuser_id", columns="newbook_id", values="rating")
    .sort_index(axis=0)
    .sort_index(axis=1)
)
testtable = (
    pd.pivot_table(data=test, index="newuser_id", columns="newbook_id", values="rating")
    .sort_index(axis=0)
    .sort_index(axis=1)
)

# gettail is a project-local helper (util module); its result is read later
# through a "newbook_id" column.
tail = gettail(reallyfinalratings)

In [2]:
# Tokenise each book's description into a lowercase word list. Books whose
# description is the placeholder "Unknown" (or missing) fall back to the title.
# regex=True is passed explicitly: the default of str.replace differs between
# pandas versions, and with regex=False the character class below would be
# searched for as a literal string and no punctuation would be removed.
reallyfinalbooks["description_list"] = (reallyfinalbooks["description"]
                                        .replace("Unknown", np.nan)
                                        .fillna(reallyfinalbooks["title_without_series"])
                                        .str.replace(r'[^\w\s\']', "", regex=True)
                                        .str.lower()
                                        .str.split())
# Append the genre tags as extra tokens. "|" and "-" are literal separators
# here, so regex=False keeps "|" from being read as regex alternation.
reallyfinalbooks["description_list"] = (reallyfinalbooks["description_list"]
                                        + (reallyfinalbooks["genre"]
                                           .str.replace("|", " ", regex=False)
                                           .str.replace("-", " ", regex=False)
                                           .str.split()))

In [3]:
# Document frequency: set() makes each book contribute a given word at most
# once, so a word's count is the number of books whose token list contains it.
allwords = []
for l in reallyfinalbooks["description_list"]:
    allwords.extend(set(l))

# Build the frame from a Counter rather than Series.value_counts(): the latter
# names both the index and the values "count" on newer pandas, which makes
# sort_values(by="count") ambiguous.
wordcounts = (pd.Series(collections.Counter(allwords), name="count", dtype="int64")
              .to_frame()
              .sort_values(by="count", ascending=False))

In [4]:
# Vocabulary: words that occur in at least MIN_WORD_COUNT books, are longer
# than one character and are not English stopwords. The stopword list is
# loaded once into a set instead of being rebuilt and scanned for every word.
english_stopwords = set(stopwords.words("english"))
frequent_words = wordcounts.index[wordcounts["count"] >= MIN_WORD_COUNT]
wordbag = [word for word in frequent_words
           if len(word) > 1 and word not in english_stopwords]

In [5]:
%%time
# Inverse document frequency per vocabulary word: log(N / df), where N is the
# number of books and df the number of books containing the word.
# wordcounts["count"] already holds df (each book was counted once per word
# when it was built), so it is reused instead of rescanning every description
# for every word.
document_frequency = wordcounts.loc[wordbag, "count"]
iDFs = np.log(len(reallyfinalbooks) / document_frequency).tolist()
word_iDFs = pd.DataFrame({"word": wordbag, "iDF": iDFs}).set_index("word").sort_values(by="iDF")
word_iDFs

Wall time: 1min 44s


In [6]:
%%time
def getTF_iDF(descriptionlist, word_iDF):
    """Return the TF-iDF vector of one tokenised description.

    Args:
        descriptionlist: Iterable of word tokens for a single book.
        word_iDF: DataFrame indexed by vocabulary word with an "iDF" column.
            The output follows the row order of this frame.

    Returns:
        A list of floats, one per row of ``word_iDF``: the word's count among
        the in-vocabulary tokens, times its iDF, divided by the number of
        in-vocabulary tokens. If no token is in the vocabulary the vector is
        all zeros (instead of the 0/0 NaNs a plain division would give).
    """
    vocabulary = word_iDF.index
    # Only in-vocabulary tokens count towards the term-frequency denominator.
    wordlist = [word for word in descriptionlist if word in vocabulary]
    if not wordlist:
        return [0.0] * len(vocabulary)
    wordcounter = collections.Counter(wordlist)
    term_counts = np.array([wordcounter[word] for word in vocabulary], dtype=float)
    idf_values = np.asarray(word_iDF["iDF"], dtype=float)
    # Same operation order as count * iDF / length, applied to the whole vocabulary at once.
    return (term_counts * idf_values / len(wordlist)).tolist()

# One TF-iDF vector (a list of floats in word_iDFs row order) per book.
reallyfinalbooks["TF-iDF_vector"] = (
    reallyfinalbooks["description_list"].apply(getTF_iDF, args=(word_iDFs,))
)

Wall time: 6min 1s


In [7]:
%%time
def getCosine_Similarity(v1, v2):
    """Cosine similarity between ``v1`` and ``v2`` along the last axis.

    Args:
        v1: A vector of shape (D,) or a matrix of shape (N, D).
        v2: A vector of shape (D,); lists are accepted.

    Returns:
        A scalar when ``v1`` is a vector, otherwise an array of shape (N,)
        holding the similarity of each row of ``v1`` to ``v2``. Whenever
        either vector has zero norm the similarity is 0.0 rather than the
        NaN produced by dividing by zero.
    """
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    numerator = np.asarray(v1 @ v2, dtype=float)
    denominator = np.asarray(
        np.sqrt(np.sum(np.square(v1), axis=-1)) * np.sqrt(np.sum(np.square(v2), axis=-1))
    )
    similarity = np.zeros(np.broadcast(numerator, denominator).shape, dtype=float)
    # Divide only where the denominator is non-zero; other entries stay 0.0.
    np.divide(numerator, denominator, out=similarity, where=denominator != 0)
    # [()] unwraps a 0-d result to a scalar and leaves n-d arrays untouched.
    return similarity[()]

# (n_books, vocabulary_size) matrix of TF-iDF vectors, one row per book.
v1 = np.stack(reallyfinalbooks["TF-iDF_vector"])

# All-pairs cosine similarity in one matrix product: scale every row to unit
# length once, then take dot products between all rows. This avoids
# recomputing the norms of the whole matrix for every single book.
row_norms = np.sqrt(np.sum(np.square(v1), axis=-1))
has_direction = np.isfinite(row_norms) & (row_norms > 0)
safe_norms = np.where(has_direction, row_norms, 1.0)
# Rows with zero (or non-finite) norm have no direction; they become zero rows
# so their similarity to every book is 0 instead of NaN.
unit_vectors = np.where(has_direction[:, np.newaxis], v1 / safe_norms[:, np.newaxis], 0.0)

# Same mapping as before: book id -> array of similarities to every book, in
# reallyfinalbooks row order.
bookid2similarities = dict(zip(reallyfinalbooks["newbook_id"], unit_vectors @ unit_vectors.T))
del row_norms, has_direction, safe_norms, unit_vectors

Wall time: 34min 11s


In [8]:
%%time
# Book x book similarity frame: one row per dict key, columns labelled with
# the book ids in reallyfinalbooks order.
similarity_table = pd.DataFrame.from_dict(
    bookid2similarities,
    orient="index",
    columns=reallyfinalbooks["newbook_id"],
)
# Drop the dict reference now that the frame holds the data.
bookid2similarities = None

Wall time: 17.7 s


In [9]:
class TF_iDFModel:
    """Content-based rating predictor using a book-to-book similarity table.

    A book's predicted rating is the similarity-weighted average of the
    ratings given to the training books.
    """

    def __init__(self, similarity_table):
        """
        Args:
            similarity_table: DataFrame of pairwise similarities whose row and
                column labels are book ids.
        """
        self.similarity_table = similarity_table

    def fit(self, X, y):
        """
        Args:
            X: A list of training data bookids
            y: A list of training data ratings
        """
        # Rows of the similarity table for the rated books, in the order of X.
        self.traindatasimilarity = self.similarity_table.loc[X, :]
        self.traindatarating = np.array(y)

    def predict(self, X):
        """
        Args:
            X: A list of bookids waiting to be predicted
        Returns:
            A pandas Series of predicted ratings, one per entry of X and in
            the same order, indexed by book id. An entry is NaN when the
            training books carry no similarity weight for that book.
        """
        predictdatasimilarity = self.traindatasimilarity.loc[:, X]
        weights = np.asarray(predictdatasimilarity, dtype=float)
        # An undefined (NaN) similarity counts as zero weight, so one such
        # training book cannot turn every prediction into NaN.
        weights = np.where(np.isnan(weights), 0.0, weights)
        total_weights = weights.sum(axis=0)
        weighted_ratings = self.traindatarating @ weights
        predictions = np.full(total_weights.shape, np.nan)
        # Divide only where some weight exists; the rest stays NaN.
        np.divide(weighted_ratings, total_weights, out=predictions, where=total_weights != 0)
        return pd.Series(predictions, index=predictdatasimilarity.columns)

In [10]:
%%time
# Per user: fit the similarity model on that user's training ratings, predict
# a rating for every book, and rank the books by predicted rating.
userid2preds = {}
userid2pred_ratings = {}
userids = list(reallyfinalratings["newuser_id"].unique())

# Loop invariants.
all_bookids = reallyfinalbooks["newbook_id"]
rank_positions = np.arange(1, len(reallyfinalbooks) + 1)

for counter, userid in enumerate(userids):
    if (counter % 500 == 0):
        print(counter)
    userid2usertraindata = train[train["newuser_id"] == userid]
    m = TF_iDFModel(similarity_table)
    m.fit(userid2usertraindata["newbook_id"], userid2usertraindata["rating"])
    # predict() yields one value per requested book id, in request order.
    # Convert to a plain array so assign() places the values by position:
    # a Series indexed by book id would instead be aligned against the row
    # index of reallyfinalbooks and attach predictions to the wrong books
    # whenever that row index differs from newbook_id.
    pred_rating = np.asarray(m.predict(all_bookids))
    pred = (reallyfinalbooks[["newbook_id"]]
            .assign(pred_rating=pred_rating)
            .sort_values(by="pred_rating", ascending=False)
            .assign(rank=rank_positions)
            .sort_values(by="newbook_id")
            .set_index("newbook_id"))
    userid2preds[userid] = pred
    userid2pred_ratings[userid] = pred["pred_rating"]

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
Wall time: 7min 4s


In [11]:
%%time
# User x book matrix of predicted ratings. Each dict value is a Series indexed
# by book id; concatenating them as columns and transposing gives one row per
# user without walking the data element by element. The columns are then
# restricted to the book ids 1..n_books, as before.
pred_table = (pd.concat(userid2pred_ratings, axis=1)
              .T
              .reindex(columns=range(1, len(reallyfinalbooks) + 1)))
pred_table = pred_table.sort_index(axis=0).sort_index(axis=1)

Wall time: 3min 17s


In [12]:
%%time
# RMSE of the predictions against the train and the test rating matrices
# (rmse is a project-local helper from util).
train_rmse, test_rmse = (
    rmse(actual.values, pred_table.values) for actual in (traintable, testtable)
)

Wall time: 8.5 s


In [13]:
%%time
def _ratings_by_predicted_rank(ratings, userid):
    """Return one user's ratings from ``ratings`` ordered by that user's predicted rank."""
    user_rows = ratings[ratings["newuser_id"] == userid]
    ranked = user_rows.merge(userid2preds[userid], on="newbook_id", how="left")
    return ranked.sort_values(by="rank")["rating"]


userids = reallyfinalratings["newuser_id"].unique()
train_rs = [_ratings_by_predicted_rank(train, userid) for userid in userids]
test_rs = [_ratings_by_predicted_rank(test, userid) for userid in userids]

# Mean per-user score from the project-local ndcg_k helper.
train_ndgc = np.mean(list(map(ndcg_k, train_rs)))
test_ndgc = np.mean(list(map(ndcg_k, test_rs)))

Wall time: 1min 42s


In [14]:
%%time
# Mean per-user diversity score: divSco_k (project-local helper) receives each
# user's book ids ordered by predicted rank together with the tail book ids.
tail_bookids = tail["newbook_id"].values
diversityScore = np.mean([
    divSco_k(pred.sort_values(by="rank").index, tail_bookids)
    for pred in userid2preds.values()
])

Wall time: 13 s


In [15]:
# Summary of the evaluation metrics. The header names the model evaluated in
# this notebook (TF_iDFModel), and the ranking metric is labelled nDCG to
# match the ndcg_k helper that computes it.
print("TF-iDF Model")
print("RMSE for train data: {:.3f}, test data: {:.3f}".format(train_rmse, test_rmse))
print("nDCG for train data: {:.3f}, test data: {:.3f}".format(train_ndgc, test_ndgc))
print("Diversity Score: {:.3f}".format(diversityScore))

NB Model
RMSE for train data: 0.910, test data: 0.917
nDGC for train data: 0.616, test data: 0.690
Diversity Score: 0.130
