In [1]:
import pandas as pd
import numpy as np
import collections

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scipy
import math
import random
import sklearn
import string

reallyfinalbooks = pd.read_csv('reallyfinalbooks.csv')
reallyfinalratings = pd.read_csv('reallyfinalratings.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

traintable = pd.pivot_table(data=train, index="newuser_id", columns="newbook_id", values="rating").sort_index(axis=0).sort_index(axis=1)
testtable = pd.pivot_table(data=test, index="newuser_id", columns="newbook_id", values="rating").sort_index(axis=0).sort_index(axis=1)

In [2]:
traintable

newbook_id,1,2,3,4,5,6,7,8,9,10,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,8000
newuser_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,4.0,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,5.0,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,4.0,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [3]:
(len(reallyfinalratings["newuser_id"].unique()),
 len(train["newuser_id"].unique()),
 len(test["newuser_id"].unique()),
 len(reallyfinalratings["newbook_id"].unique()),
 len(train["newbook_id"].unique()),
 len(test["newbook_id"].unique()),
)

(11219, 11219, 11219, 8000, 8000, 8000)

In [4]:
def rmse(y,h):
    """RMSE
    Args:
        y: real_table y
        h: predicted_table h
    Returns:
        RMSE
    """
    a = y-h
    a = a.reshape(a.size)
    a = a[~np.isnan(a)]

    return np.sqrt(sum(a**2)/len(a))

In [5]:
def dcg_k(r, k):
    """ Discounted Cumulative Gain (DGC)  
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        DCG
    """
  
    r = np.asfarray(r)[:k]
    return np.sum(2**r / np.log2(np.arange(2, r.size + 2)))      



def ndcg_k(r, k=20):
    """Normalized Discounted Cumulative Gain (NDCG)
    Args:
        r: True Ratings in Predicted Rank Order (1st element is top recommendation)
        k: Number of results to consider
    Returns:
        NDCG
    """
    dcg_max = dcg_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_k(r, k) / dcg_max

# def mean_ndcg(rs):
#     """Mean NDCG for all users
#     Args:
#         rs: Iterator / For each user: True Ratings in Predicted Rank Order
#     Returns:
#         Mean NDCG
#     """
#     return np.mean([ndcg_k(r, len(r)) for r in rs])

In [6]:
def divSco_k(r, tail, k=10):
    """Diversity Score
    Args:
        r: bookids in Predicted Rank Order (1st element is top recommendation)
        tail: list of less popular/less known books
        k: Number of results to consider
    Returns:
        Diversity Score
    """
    count = 0
    for bookid in r[:k]:
        if bookid in tail:
            count += 1
    return count / k

In [7]:
tailcomp = reallyfinalratings[["newbook_id", "rating"]].groupby("newbook_id").agg(len).rename(columns={"rating":"count"}).sort_values(by='count', ascending=False).reset_index()
tot = sum(tailcomp['count'])
tailcomp['popshare']= [x/tot for x in tailcomp['count']]
tailcomp['popsharecumsum']= tailcomp['popshare'].cumsum()
tailcomp['category']= ['Head' if x<0.95 else "Tail" for x in tailcomp['popsharecumsum']]
tail = tailcomp[tailcomp['category'] == 'Tail']

In [8]:
def aggfunc(s):
    if s.name == "count":
        return len(s)
    elif s.name == "avg_rating":
        return np.mean(s)
populartmp = train.drop(columns=["newuser_id"]).rename(columns={"rating":"avg_rating"}).assign(count = np.ones(len(train))).astype({"count":"int64"})
popular = populartmp.groupby("newbook_id").agg(aggfunc).sort_values(by="count", ascending=False).reset_index()
popular

Unnamed: 0,newbook_id,avg_rating,count
0,1780,4.414065,2659
1,6766,4.368724,2609
2,473,3.908739,2323
3,5115,4.311528,2186
4,6742,3.574783,1959
5,7521,3.880492,1707
6,6743,4.117891,1688
7,6592,4.250460,1629
8,5699,3.555767,1578
9,2299,4.374273,1547


In [9]:
#pridected ratings for all users
PRFAU = popular.rename(columns={"avg_rating": "pred_rating"}).drop(columns="count").assign(rank=np.arange(1, len(popular)+1)).sort_values(by="newbook_id").set_index("newbook_id")
PRFAU

Unnamed: 0_level_0,pred_rating,rank
newbook_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.085470,3641
2,3.730769,6456
3,3.855072,7492
4,3.371429,4269
5,3.810811,2557
6,3.908333,3507
7,4.432432,6952
8,4.135135,6912
9,3.724138,5564
10,3.893300,405


In [10]:
userids = reallyfinalratings["newuser_id"].unique()
preds = {}
for userid in userids:
    preds[userid] = PRFAU

userids = reallyfinalratings["newuser_id"].unique()
pred_ratings = {}
for userid in userids:
    pred_ratings[userid] = PRFAU["pred_rating"].values

In [11]:
%%time
pred_table = pd.DataFrame.from_dict(pred_ratings, orient="index", columns=range(1, len(reallyfinalbooks)+1))
pred_table = pred_table.sort_index(axis=0).sort_index(axis=1)

Wall time: 26.5 s


In [12]:
pred_table

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,7991,7992,7993,7994,7995,7996,7997,7998,7999,8000
1,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
2,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
3,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
4,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
5,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
7,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
8,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
9,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
10,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636
11,3.08547,3.730769,3.855072,3.371429,3.810811,3.908333,4.432432,4.135135,3.724138,3.8933,...,4.342466,3.843434,3.657534,4.056075,3.927438,4.17284,3.53,4.303797,4.345794,4.476636


In [13]:
%%time
train_rmse = rmse(traintable.values, pred_table.values)
test_rmse = rmse(testtable.values, pred_table.values)

Wall time: 3.78 s


In [14]:
%%time
userids = reallyfinalratings["newuser_id"].unique()
train_rs = []
test_rs = []
for userid in userids:
    train_rs.append(train[train["newuser_id"] == userid].merge(preds[userid], on="newbook_id", how="left").sort_values(by="rank")["rating"])
    test_rs.append(test[test["newuser_id"] == userid].merge(preds[userid], on="newbook_id", how="left").sort_values(by="rank")["rating"])

train_ndgc = np.mean([ndcg_k(r) for r in train_rs])
test_ndgc = np.mean([ndcg_k(r) for r in test_rs])

Wall time: 1min 23s


In [15]:
%%time
diversityScore = np.mean([divSco_k(pred.sort_values(by="rank").index,tail["newbook_id"].values) for pred in list(preds.values())])

Wall time: 11.2 s


In [16]:
print("Popularity Model")
print("RMSE for train data: {:.3f}, test data: {:.3f}".format(train_rmse, test_rmse))
print("nDGC for train data: {:.3f}, test data: {:.3f}".format(train_ndgc, test_ndgc))
print("Diversity Score: {:.3f}".format(diversityScore))

Popularity Model
RMSE for train data: 0.945, test data: 0.950
nDGC for train data: 0.662, test data: 0.795
Diversity Score: 0.000
