In [1]:
import pandas as pd
import numpy as np
import collections

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scipy
import math
import random
import sklearn
import string

import nltk
from nltk.corpus import stopwords #Manually download
#nltk.download("stopwords")

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# A word enters the vocabulary only if it occurs at least this many times
# across all book descriptions.
MIN_WORD_COUNT = 25
# Naive Bayes variant fitted per user. Alternative value:
#MODEL = "GaussianNB"
MODEL = "MultinomialNB"

# Raw inputs: book metadata, the full ratings table and its train/test split.
reallyfinalbooks = pd.read_csv('reallyfinalbooks.csv')
reallyfinalratings = pd.read_csv('reallyfinalratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# User x book rating matrices (NaN where a user has not rated a book),
# with rows and columns in ascending id order.
_PIVOT_SPEC = dict(index="newuser_id", columns="newbook_id", values="rating")
traintable = train.pivot_table(**_PIVOT_SPEC).sort_index(axis=0).sort_index(axis=1)
testtable = test.pivot_table(**_PIVOT_SPEC).sort_index(axis=0).sort_index(axis=1)

In [2]:
def rmse(y,h):
    """Root-mean-square error over the entries present in both tables.

    Args:
        y: real_table y (array-like; NaN marks a missing rating)
        h: predicted_table h (array-like of the same shape as y)
    Returns:
        RMSE computed over the positions where ``y - h`` is not NaN, or NaN
        when there is no such position.
    """
    diff = np.asarray(y, dtype=float) - np.asarray(h, dtype=float)
    # Boolean masking flattens the array and drops every position where
    # either table has no value.
    diff = diff[~np.isnan(diff)]
    if diff.size == 0:
        # Nothing to compare: report NaN instead of dividing by zero.
        return np.nan

    # Vectorised mean; the builtin sum() would walk the array in Python.
    return np.sqrt(np.mean(diff ** 2))

def dcg_k(r, k):
    """Discounted Cumulative Gain (DCG)

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG of the first k ratings: sum_i 2**r_i / log2(i + 1) with i starting
        at 1. Note the gain is 2**r, not the 2**r - 1 used in some definitions.
        Returns 0.0 for an empty r.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the equivalent conversion.
    r = np.asarray(r, dtype=float)[:k]
    # Positions 1..n are discounted by log2(2)..log2(n + 1).
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2)))



def ndcg_k(r, k=20):
    """Normalized Discounted Cumulative Gain (NDCG)

    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG of r divided by the DCG of r sorted in descending order (the
        ideal ranking); 0. when the ideal DCG is zero.
    """
    ideal_dcg = dcg_k(sorted(r, reverse=True), k)
    return dcg_k(r, k) / ideal_dcg if ideal_dcg else 0.

def divSco_k(r, tail, k=10):
    """Diversity Score

    Args:
        r: bookids in Predicted Rank Order (1st element is top recommendation)
        tail: list of less popular/less known books
        k: Number of results to consider
    Returns:
        Diversity Score: the number of the first k entries of r that are in
        tail, divided by k (the divisor is k even if r has fewer than k entries).
    """
    # Membership in a list/array is a linear scan per lookup; build a set once
    # so each of the k lookups is O(1). Other container types are left alone
    # so their own `in` semantics are preserved.
    tail_lookup = tail
    if isinstance(tail, (list, tuple, np.ndarray)):
        try:
            tail_lookup = set(tail)
        except TypeError:
            # Unhashable elements (e.g. a 2-D array): fall back to the original container.
            tail_lookup = tail

    count = sum(1 for bookid in r[:k] if bookid in tail_lookup)
    return count / k

# Split books into a popular "Head" and a long "Tail": count ratings per book,
# order books from most to least rated, and label a book "Head" while the
# cumulative share of all ratings stays below 0.95.
tailcomp = (
    reallyfinalratings[["newbook_id", "rating"]]
    .groupby("newbook_id")
    .agg(len)
    .rename(columns={"rating":"count"})
    .sort_values(by='count', ascending=False)
    .reset_index()
)
tot = sum(tailcomp['count'])
tailcomp['popshare'] = tailcomp['count'] / tot
tailcomp['popsharecumsum'] = tailcomp['popshare'].cumsum()
tailcomp['category'] = tailcomp['popsharecumsum'].map(lambda share: 'Head' if share < 0.95 else "Tail")
tail = tailcomp.loc[tailcomp['category'].eq('Tail')]

In [3]:
# Tokenise each book's text into a list of lowercase words.
# The description is used unless it is missing or the placeholder "Unknown",
# in which case the title (without series) stands in for it.
reallyfinalbooks["description_list"] = (reallyfinalbooks["description"]
                                        # np.NaN was removed in NumPy 2.0; np.nan is the supported spelling.
                                        .replace("Unknown", np.nan)
                                        .fillna(reallyfinalbooks["title_without_series"])
                                        # Books with neither description nor title get an empty token
                                        # list instead of NaN, which later list operations cannot handle.
                                        .fillna("")
                                        # Strip everything except word characters, whitespace and apostrophes.
                                        # regex=True is required: pandas >= 2.0 treats the pattern as a
                                        # literal string by default, which would strip nothing.
                                        .str.replace(r'[^\w\s\']', "", regex=True)
                                        .str.lower()
                                        .str.split())

In [4]:
# Flatten every book's token list into one corpus-wide list of words.
allwords = [word for words in reallyfinalbooks["description_list"] for word in words]

# One row per distinct word (the index) with its corpus frequency in "count",
# most frequent first.
# The result is explicitly named "count" and the index name is cleared:
# pandas >= 2.0 names the value_counts() result "count" and the index after the
# source Series, so building it from a column called "count" makes
# sort_values(by="count") ambiguous between index and column.
wordcounts = (pd.Series(allwords, dtype=object)
              .value_counts()
              .rename("count")
              .rename_axis(None)
              .to_frame()
              .sort_values(by="count", ascending=False))

In [5]:
# Vocabulary for the bag-of-words features: words that occur at least
# MIN_WORD_COUNT times, are longer than one character and are not English
# stopwords. Order follows wordcounts (most frequent first).

# Load the stopword list once into a set; calling stopwords.words() inside the
# loop re-reads the corpus and does a linear list scan for every candidate word.
english_stopwords = set(stopwords.words("english"))
frequent_words = wordcounts[wordcounts["count"] >= MIN_WORD_COUNT].index
wordbag = [word for word in frequent_words
           if len(word) > 1 and word not in english_stopwords]

In [6]:
%%time
def deslist2desvector(wb, l):
    """Turn a token list into a bag-of-words count vector.

    Args:
        wb: vocabulary (sequence of words); fixes the order of the vector entries
        l: list of tokens of one description
    Returns:
        A one-element list holding a 1-D array whose i-th entry is the number
        of times wb[i] occurs in l. The list wrapper makes each result a single
        row, so np.concatenate over many results stacks them into a matrix.
    """
    # Count all tokens in one pass instead of calling l.count() once per
    # vocabulary word; a Counter returns 0 for words that are absent.
    token_counts = collections.Counter(l)
    return [np.array([token_counts[word] for word in wb])]

# Bag-of-words vector (word counts over `wordbag`) for every book.
reallyfinalbooks["description_vector"] = reallyfinalbooks["description_list"].map(
    lambda tokens: deslist2desvector(wordbag, tokens)
)

Wall time: 1min 4s


In [7]:
%%time
# Fit one Naive Bayes model per user on the bag-of-words vectors of the books
# that user rated in `train`, then score every book in the catalogue.
#   userid2preds[userid]        -> DataFrame indexed by newbook_id with
#                                  columns pred_rating and rank (1 = best)
#   userid2pred_ratings[userid] -> the pred_rating column of that frame
userid2preds = {}
userid2pred_ratings = {}
userids = list(reallyfinalratings["newuser_id"].unique())

# Loop-invariant inputs, built once instead of once per user:
# the feature matrix of the whole catalogue (one row per book) ...
all_book_features = np.concatenate(reallyfinalbooks["description_vector"].tolist())
# ... and the only book columns the per-user merge needs.
book_vectors = reallyfinalbooks[["newbook_id", "description_vector"]]
ranks = np.arange(1, len(reallyfinalbooks)+1)

for counter, userid in enumerate(userids):
    if (counter % 500 == 0):
        print(counter)
    userid2usertraindata = train[train["newuser_id"] == userid].merge(book_vectors, on="newbook_id", how="left")
    m = MultinomialNB(alpha = 0.1) if MODEL == "MultinomialNB" else GaussianNB()
    m.fit(np.concatenate(userid2usertraindata["description_vector"].tolist()), userid2usertraindata["rating"])
    proba = m.predict_proba(all_book_features)
    # Rating value attached to each probability column; the predicted rating
    # is the expectation sum_c P(c | book) * c. Columns follow m.classes_.
    # For a user with a single distinct rating the number of columns depends
    # on the model/scikit-learn version (one column, or two with a padding
    # column), so size the vector from `proba` and give any extra column the
    # value 0 rather than hard-coding a two-element vector.
    userclass = np.zeros(proba.shape[1])
    userclass[:len(m.classes_)] = m.classes_
    pred =  (reallyfinalbooks[["newbook_id"]]
            .assign(pred_rating=(proba @ userclass))
            .sort_values(by="pred_rating", ascending=False)
            # Rows are now ordered best-first, so position + 1 is the rank.
            .assign(rank=ranks)
            .sort_values(by="newbook_id")
            .set_index("newbook_id"))
    userid2preds[userid] = pred
    userid2pred_ratings[userid] = pred["pred_rating"]

0
500
1000
1500
2000
2500


  self.class_log_prior_ = (np.log(self.class_count_) -


3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
Wall time: 1h 41min 58s


In [8]:
%%time
# User x book matrix of predicted ratings: one row per user, one column per
# book label 1..len(reallyfinalbooks), both axes in ascending order.
# NOTE(review): this presumes newbook_id values are exactly 1..len(reallyfinalbooks) - confirm.
book_labels = range(1, len(reallyfinalbooks)+1)
pred_table = (
    pd.DataFrame.from_dict(userid2pred_ratings, orient="index", columns=book_labels)
    .sort_index(axis=0)
    .sort_index(axis=1)
)

Wall time: 4min 21s


In [19]:
%%time
# RMSE between actual and predicted ratings on the train and test splits.
# The prediction table is first aligned to each rating table by user and book
# label. Comparing raw .values is purely positional, so it is only correct
# when both tables contain exactly the same users and books in the same order.
train_rmse = rmse(traintable.values,
                  pred_table.reindex(index=traintable.index, columns=traintable.columns).values)
test_rmse = rmse(testtable.values,
                 pred_table.reindex(index=testtable.index, columns=testtable.columns).values)

Wall time: 3.71 s


In [15]:
%%time
userids = reallyfinalratings["newuser_id"].unique()


def _ratings_in_rank_order(ratings, userid):
    """Return one user's true ratings from `ratings`, ordered by that user's predicted rank."""
    user_rows = ratings[ratings["newuser_id"] == userid]
    ranked = user_rows.merge(userid2preds[userid], on="newbook_id", how="left").sort_values(by="rank")
    return ranked["rating"]


# Per-user true ratings in predicted rank order, for each split.
train_rs = [_ratings_in_rank_order(train, userid) for userid in userids]
test_rs = [_ratings_in_rank_order(test, userid) for userid in userids]

# Mean NDCG over all users.
train_ndgc = np.mean([ndcg_k(r) for r in train_rs])
test_ndgc = np.mean([ndcg_k(r) for r in test_rs])

Wall time: 2min 42s


In [17]:
%%time
# Mean, over all users, of the share of tail books among each user's
# top-ranked recommendations (divSco_k with its default k).
tail_book_ids = tail["newbook_id"].values
per_user_diversity = [
    divSco_k(pred.sort_values(by="rank").index, tail_book_ids)
    for pred in userid2preds.values()
]
diversityScore = np.mean(per_user_diversity)

Wall time: 23.8 s


In [20]:
# Summary of the evaluation metrics, one metric per line.
report_lines = (
    "NB Model",
    "RMSE for train data: {:.3f}, test data: {:.3f}".format(train_rmse, test_rmse),
    "nDGC for train data: {:.3f}, test data: {:.3f}".format(train_ndgc, test_ndgc),
    "Diversity Score: {:.3f}".format(diversityScore),
)
for line in report_lines:
    print(line)

NB Model
RMSE for train data: 0.057, test data: 1.194
nDGC for train data: 1.000, test data: 0.807
Diversity Score: 0.092
