In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
%matplotlib inline
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.prediction_algorithms.knns import KNNBasic

import warnings; warnings.simplefilter('ignore')

### Assign the tables: 

In [2]:
games = pd.read_csv('../../../games_detailed_info.csv')
bgg_reviews = pd.read_csv('../../../bgg-15m-reviews.csv')
testezinho = pd.read_csv('../../../2020-08-19.csv')

### Bring useful functions to the data exploration

In [3]:
def basic_info(data):
    """ Gets basic information to understand more of the data, also useful to delete columns that we know are categorical"""
    print("Dataset shape is: ", data.shape)
    print("Dataset size is: ", data.size)
    print(data.duplicated().sum())
    print("Dataset columns are: ", data.columns)
    print("Dataset info is: ", data.info())
    categorical = []
    numerical = []
    for i in data.columns:
        if data[i].dtype == object:
            categorical.append(i)
        else:
            numerical.append(i)
    print("Categorical variables are:\n ", categorical)
    print("Numerical variables are:\n ", numerical)
    return categorical, numerical

In [4]:
def columns_w_nan(data):
    """ Returns the columns that have over 30% of its data as missing value """
    temp = []
    temp_perce = []
    for x in data.columns:
        if data[x].isna().sum() > 0:
            percentage_of_data = data[x].isna().sum()/(list(data.shape)[0])
            print("Row: {} : is made {}% of NaN Values.".format(x, percentage_of_data.round(3)))
            if percentage_of_data > 0.3:
                temp.append(x)
                temp_perce.append(percentage_of_data)
    if len(temp) > 0:
        print('')            
        print(temp)
        print('')
        print(temp_perce)

In [5]:
def columns_repeat(data, data1, data2):
    """" This function has as objective to:
    1 - Find possible columns to merge dataframes
    2 - Find columns that have to be deleted before the merging
    """
    data_columns = []
    data1_columns = []
    data2_columns = []
    data_data1 = []
    data_data2 = []
    data_data3 = []
    for x in data.columns:
        data_columns.append(x)
    for y in data1.columns:
        data1_columns.append(y)
    for z in data2.columns:
        data2_columns.append(z)
     
    for a in data_columns:
        if a in data1_columns:
            data_data1.append(a)
            
    for b in data_columns:
        if b in data2_columns:
            data_data2.append(b)
    
    for c in data1_columns:
        if c in data2_columns:
            data_data3.append(c)
        
    return(data_data1, data_data2, data_data3)

### EDA "games"

In [6]:
basic_info(games)

Dataset shape is:  (19230, 56)
Dataset size is:  1076880
0
Dataset columns are:  Index(['Unnamed: 0', 'type', 'id', 'thumbnail', 'image', 'primary',
       'alternate', 'description', 'yearpublished', 'minplayers', 'maxplayers',
       'suggested_num_players', 'suggested_playerage',
       'suggested_language_dependence', 'playingtime', 'minplaytime',
       'maxplaytime', 'minage', 'boardgamecategory', 'boardgamemechanic',
       'boardgamefamily', 'boardgameexpansion', 'boardgameimplementation',
       'boardgamedesigner', 'boardgameartist', 'boardgamepublisher',
       'usersrated', 'average', 'bayesaverage', 'Board Game Rank',
       'Strategy Game Rank', 'Family Game Rank', 'stddev', 'median', 'owned',
       'trading', 'wanting', 'wishing', 'numcomments', 'numweights',
       'averageweight', 'boardgameintegration', 'boardgamecompilation',
       'Party Game Rank', 'Abstract Game Rank', 'Thematic Rank',
       'War Game Rank', 'Customizable Rank', 'Children's Game Rank',
       '

(['type',
  'thumbnail',
  'image',
  'primary',
  'alternate',
  'description',
  'suggested_num_players',
  'suggested_playerage',
  'suggested_language_dependence',
  'boardgamecategory',
  'boardgamemechanic',
  'boardgamefamily',
  'boardgameexpansion',
  'boardgameimplementation',
  'boardgamedesigner',
  'boardgameartist',
  'boardgamepublisher',
  'boardgameintegration',
  'boardgamecompilation'],
 ['Unnamed: 0',
  'id',
  'yearpublished',
  'minplayers',
  'maxplayers',
  'playingtime',
  'minplaytime',
  'maxplaytime',
  'minage',
  'usersrated',
  'average',
  'bayesaverage',
  'Board Game Rank',
  'Strategy Game Rank',
  'Family Game Rank',
  'stddev',
  'median',
  'owned',
  'trading',
  'wanting',
  'wishing',
  'numcomments',
  'numweights',
  'averageweight',
  'Party Game Rank',
  'Abstract Game Rank',
  'Thematic Rank',
  'War Game Rank',
  'Customizable Rank',
  "Children's Game Rank",
  'RPG Item Rank',
  'Accessory Rank',
  'Video Game Rank',
  'Amiga Rank',
  'Co

In [7]:
columns_w_nan(games)

Row: thumbnail : is made 0.001% of NaN Values.
Row: image : is made 0.001% of NaN Values.
Row: alternate : is made 0.6% of NaN Values.
Row: description : is made 0.0% of NaN Values.
Row: suggested_playerage : is made 0.122% of NaN Values.
Row: suggested_language_dependence : is made 0.134% of NaN Values.
Row: boardgamecategory : is made 0.011% of NaN Values.
Row: boardgamemechanic : is made 0.081% of NaN Values.
Row: boardgamefamily : is made 0.233% of NaN Values.
Row: boardgameexpansion : is made 0.751% of NaN Values.
Row: boardgameimplementation : is made 0.787% of NaN Values.
Row: boardgamedesigner : is made 0.022% of NaN Values.
Row: boardgameartist : is made 0.281% of NaN Values.
Row: Strategy Game Rank : is made 0.892% of NaN Values.
Row: Family Game Rank : is made 0.896% of NaN Values.
Row: boardgameintegration : is made 0.926% of NaN Values.
Row: boardgamecompilation : is made 0.964% of NaN Values.
Row: Party Game Rank : is made 0.97% of NaN Values.
Row: Abstract Game Rank : is

In [8]:
# Dropping the columns that have over 30% of its data missing 
# (actually we aim to 30, but the one with least is 60% getting to 100% in this list)
games1 = games.drop(['alternate', 'boardgameexpansion', 'boardgameimplementation', 
                     'Strategy Game Rank', 'Family Game Rank', 'boardgameintegration', 
                     'boardgamecompilation', 'Party Game Rank', 'Abstract Game Rank',
                     'Thematic Rank', 'War Game Rank', 'Customizable Rank', "Children's Game Rank", 
                     'RPG Item Rank', 'Accessory Rank', 'Video Game Rank', 'Amiga Rank', 'Commodore 64 Rank',
                     'Arcade Rank', 'Atari ST Rank', 'Unnamed: 0'], axis=1)

In [9]:
games1.head(3)

Unnamed: 0,type,id,thumbnail,image,primary,description,yearpublished,minplayers,maxplayers,suggested_num_players,...,Board Game Rank,stddev,median,owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,boardgame,30549,https://cf.geekdo-images.com/thumb/img/HEKrtpT...,https://cf.geekdo-images.com/original/img/j-pf...,Pandemic,"In Pandemic, several virulent diseases have br...",2008,2,4,"[OrderedDict([('@numplayers', '1'), ('result',...",...,91,1.32632,0,144727,2191,640,8571,15778,5232,2.4148
1,boardgame,822,https://cf.geekdo-images.com/thumb/img/kqE4YJS...,https://cf.geekdo-images.com/original/img/o4p6...,Carcassonne,Carcassonne is a tile-placement game in which ...,2000,2,5,"[OrderedDict([('@numplayers', '1'), ('result',...",...,173,1.30369,0,140066,1587,539,6286,17720,7304,1.9158
2,boardgame,13,https://cf.geekdo-images.com/thumb/img/g8LvJsd...,https://cf.geekdo-images.com/original/img/A-0y...,Catan,"In Catan (formerly The Settlers of Catan), pla...",1995,3,4,"[OrderedDict([('@numplayers', '1'), ('result',...",...,381,1.47908,0,144656,1825,474,5310,17859,7187,2.3264


In [10]:
# We run the function again to a much cleaner output
columns_w_nan(games1)

Row: thumbnail : is made 0.001% of NaN Values.
Row: image : is made 0.001% of NaN Values.
Row: description : is made 0.0% of NaN Values.
Row: suggested_playerage : is made 0.122% of NaN Values.
Row: suggested_language_dependence : is made 0.134% of NaN Values.
Row: boardgamecategory : is made 0.011% of NaN Values.
Row: boardgamemechanic : is made 0.081% of NaN Values.
Row: boardgamefamily : is made 0.233% of NaN Values.
Row: boardgamedesigner : is made 0.022% of NaN Values.
Row: boardgameartist : is made 0.281% of NaN Values.


In [None]:
# The ones here with the biggest number are boardgameartist and boardgamefamily, we are going to keep those for now

# The most interesting columns here would be boardgamedesigner, boardgamecategory, suggested_playerage and averageweight
# Also look at minplayers-maxplayers

In [125]:
# Create a function to get games by specific designer

In [126]:
df_by_designer = games1[games1.boardgamedesigner == "['Matt Leacock']"]

In [135]:
list(df_by_designer['primary'].values)

['Pandemic',
 'Forbidden Island',
 'Forbidden Desert',
 'Roll Through the Ages: The Bronze Age',
 'Pandemic: The Cure',
 'Forbidden Sky',
 'Thunderbirds',
 'Era: Medieval Age',
 'Knit Wit',
 'Chariot Race',
 'Mole Rats in Space',
 'Pandemic: Hot Zone – North America',
 'Lunatix Loop']

### EDA "bgg_reviews"

In [12]:
basic_info(bgg_reviews)

Dataset shape is:  (15823269, 6)
Dataset size is:  94939614
0
Dataset columns are:  Index(['Unnamed: 0', 'user', 'rating', 'comment', 'ID', 'name'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15823269 entries, 0 to 15823268
Data columns (total 6 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   user        object 
 2   rating      float64
 3   comment     object 
 4   ID          int64  
 5   name        object 
dtypes: float64(1), int64(2), object(3)
memory usage: 724.3+ MB
Dataset info is:  None
Categorical variables are:
  ['user', 'comment', 'name']
Numerical variables are:
  ['Unnamed: 0', 'rating', 'ID']


(['user', 'comment', 'name'], ['Unnamed: 0', 'rating', 'ID'])

In [13]:
columns_w_nan(bgg_reviews)

Row: user : is made 0.0% of NaN Values.
Row: comment : is made 0.811% of NaN Values.

['comment']

[0.8107203385090653]


In [14]:
bgg_reviews.tail(3)

Unnamed: 0.1,Unnamed: 0,user,rating,comment,ID,name
15823266,15823266,BunkerBill,7.0,,281515,Company of Heroes
15823267,15823267,Hattori Hanzo,6.0,,281515,Company of Heroes
15823268,15823268,Richie2000,1.0,,281515,Company of Heroes


In [15]:
# Dropping the columns
bgg_reviews = bgg_reviews.drop(columns=['Unnamed: 0'])

In [16]:
bgg_reviews.head(3)

Unnamed: 0,user,rating,comment,ID,name
0,Torsten,10.0,,30549,Pandemic
1,mitnachtKAUBO-I,10.0,Hands down my favorite new game of BGG CON 200...,30549,Pandemic
2,avlawn,10.0,I tend to either love or easily tire of co-op ...,30549,Pandemic


In [17]:
bgg_reviews['rating'].value_counts()

7.00000    3566154
8.00000    3002815
6.00000    2346047
9.00000    1424911
5.00000    1077762
            ...   
3.53562          1
4.43100          1
6.89150          1
1.71687          1
7.29160          1
Name: rating, Length: 10172, dtype: int64

In [18]:
a = bgg_reviews['name'].value_counts()

In [19]:
a

Pandemic                                        96234
Carcassonne                                     96221
Catan                                           96185
7 Wonders                                       79870
Dominion                                        74954
                                                ...  
Diary of a Wimpy Kid: Zoo-Wee Mama Card Game       30
Revolución                                         30
Ophiuchus: The Thirteenth Constellation            30
Crazier Eights: Camelot                            30
Taurus                                             30
Name: name, Length: 18984, dtype: int64

In [20]:
a = pd.DataFrame(a)

In [21]:
a.head(3)

Unnamed: 0,name
Pandemic,96234
Carcassonne,96221
Catan,96185


In [22]:
a.columns = ['n_reviews']

In [23]:
a.value_counts()
# Investigate high number of low number of reviews
## Small games?
## Glitch?

n_reviews
30           276
31           261
32           248
33           238
34           234
            ... 
2350           1
2349           1
2347           1
2346           1
96234          1
Length: 2858, dtype: int64

### EDA "testezinho"

In [24]:
basic_info(testezinho)

Dataset shape is:  (19330, 10)
Dataset size is:  193300
0
Dataset columns are:  Index(['Unnamed: 0', 'ID', 'Name', 'Year', 'Rank', 'Average', 'Bayes average',
       'Users rated', 'URL', 'Thumbnail'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19330 entries, 0 to 19329
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     19330 non-null  int64  
 1   ID             19330 non-null  int64  
 2   Name           19330 non-null  object 
 3   Year           19330 non-null  int64  
 4   Rank           19330 non-null  int64  
 5   Average        19330 non-null  float64
 6   Bayes average  19330 non-null  float64
 7   Users rated    19330 non-null  int64  
 8   URL            19330 non-null  object 
 9   Thumbnail      19317 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.5+ MB
Dataset info is:  None
Categorical variables are:
  ['Name', 'URL', 'Thumbnai

(['Name', 'URL', 'Thumbnail'],
 ['Unnamed: 0',
  'ID',
  'Year',
  'Rank',
  'Average',
  'Bayes average',
  'Users rated'])

In [25]:
columns_w_nan(testezinho)

Row: Thumbnail : is made 0.001% of NaN Values.


In [26]:
testezinho.head(3)

Unnamed: 0.1,Unnamed: 0,ID,Name,Year,Rank,Average,Bayes average,Users rated,URL,Thumbnail
0,90,30549,Pandemic,2008,91,7.62,7.518,96186,/boardgame/30549/pandemic,https://cf.geekdo-images.com/micro/img/0m3-oqB...
1,172,822,Carcassonne,2000,173,7.42,7.311,96181,/boardgame/822/carcassonne,https://cf.geekdo-images.com/micro/img/z0tTaij...
2,380,13,Catan,1995,381,7.16,7.001,96171,/boardgame/13/catan,https://cf.geekdo-images.com/micro/img/e0y6Bog...


In [27]:
testezinho = testezinho.drop(columns=['Unnamed: 0', 'Thumbnail'])

### Our final situation for the datasets:

In [28]:
testezinho.head(2)

Unnamed: 0,ID,Name,Year,Rank,Average,Bayes average,Users rated,URL
0,30549,Pandemic,2008,91,7.62,7.518,96186,/boardgame/30549/pandemic
1,822,Carcassonne,2000,173,7.42,7.311,96181,/boardgame/822/carcassonne


In [29]:
bgg_reviews.head(2)

Unnamed: 0,user,rating,comment,ID,name
0,Torsten,10.0,,30549,Pandemic
1,mitnachtKAUBO-I,10.0,Hands down my favorite new game of BGG CON 200...,30549,Pandemic


In [30]:
games1.head(2)

Unnamed: 0,type,id,thumbnail,image,primary,description,yearpublished,minplayers,maxplayers,suggested_num_players,...,Board Game Rank,stddev,median,owned,trading,wanting,wishing,numcomments,numweights,averageweight
0,boardgame,30549,https://cf.geekdo-images.com/thumb/img/HEKrtpT...,https://cf.geekdo-images.com/original/img/j-pf...,Pandemic,"In Pandemic, several virulent diseases have br...",2008,2,4,"[OrderedDict([('@numplayers', '1'), ('result',...",...,91,1.32632,0,144727,2191,640,8571,15778,5232,2.4148
1,boardgame,822,https://cf.geekdo-images.com/thumb/img/kqE4YJS...,https://cf.geekdo-images.com/original/img/o4p6...,Carcassonne,Carcassonne is a tile-placement game in which ...,2000,2,5,"[OrderedDict([('@numplayers', '1'), ('result',...",...,173,1.30369,0,140066,1587,539,6286,17720,7304,1.9158


In [31]:
# Fazer uma tabela com pesos baseados em Owned, trading e etc. Por exemplo, Owned significa que a pessoa ainda tem e 
# quer mostrar que tem, entao +0.3, trading significa que o jogo talvez nao tenha tanta replayability/showcase, entao seria -0.5

Ideas:

I use the TMDB Ratings to come up with our Top Movies Chart. I will use IMDB's weighted rating formula to construct my chart. Mathematically, it is represented as follows:

Weighted Rating (WR) =  (vv+m.R)+(mv+m.C) 
where,

v is the number of votes for the movie
m is the minimum votes required to be listed in the chart
R is the average rating of the movie
C is the mean vote across the whole report
The next step is to determine an appropriate value for m, the minimum votes required to be listed in the chart. We will use 95th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 95% of the movies in the list.

I will build our overall Top 250 Chart and will define a function to build charts for a particular genre. Let's begin!

- Make 3 types of recommenders
- Decide which one is the best
- Build the same recommenders with weighted rating
- Make a function that get more than one movie


Level-ups:

- Flask deployment
- Sentiment Analysis / NLP on the Manual/Review?
- Feedback system 
- PCA?


LAS or Surprise

- Find a way to replace names for ID's

### Making the rec system

In [32]:
reviews_new = bgg_reviews.sort_values(by=['user'])

In [33]:
reviews_new.isna().sum()

user             66
rating            0
comment    12828246
ID                0
name              0
dtype: int64

In [34]:
reviews_new.shape

(15823269, 5)

In [35]:
15823269-66

15823203

In [36]:
reviews_new = reviews_new[0:15823203]

In [37]:
reviews_new.isna().sum()

user              0
rating            0
comment    12828180
ID                0
name              0
dtype: int64

To check reviews based on a user:

In [38]:
reviews_new[reviews_new['user'] == 'Æleksandr Þræð']

Unnamed: 0,user,rating,comment,ID,name
13385410,Æleksandr Þræð,8.0,"Interesting mix of skill, luck and beautiful c...",5451,Hanafuda
13387541,Æleksandr Þræð,8.0,This version of the Chez Geek Empire is a hoot...,9962,Chez Goth
8112769,Æleksandr Þræð,10.0,Excellent game mixing Luck and Strategy with o...,2093,Mahjong
12539839,Æleksandr Þræð,6.0,Excellent ancient game with different strategy...,2932,Hnefatafl
15185475,Æleksandr Þræð,7.0,Fun light harted game of vampire slaying. Tho...,5554,Vampire
9947640,Æleksandr Þræð,9.5,What's better than fighting in large Humanoid ...,1540,BattleTech
3715637,Æleksandr Þræð,8.0,a more 'refined' form of Crazy-8's. Good time...,2223,UNO
13514829,Æleksandr Þræð,7.0,Learned the Game at the Civ. Museum in Ottawa ...,2399,Senet
9162339,Æleksandr Þræð,10.0,Great Late night or Beer & Pretzels game. Ver...,553,Chez Geek
12353561,Æleksandr Þræð,9.0,"A more in depth game than Illuminati, and the ...",1552,Illuminati: New World Order


In [39]:
reviews_new1 = reviews_new.drop(columns=['comment', 'name'])

In [40]:
reviews_new1.head(3)

Unnamed: 0,user,rating,ID
12589584,Fu_Koios,9.0,112092
15241668,Fu_Koios,9.0,223033
5812906,beastvol,7.0,278


In [41]:
### At this point I should set a threshhold for Minimum of contribuitions/reviews

### Transform Users into ID's to respect the user's privacy and to make it easier to "see"

In [42]:
reviews_new1['user_id'] = reviews_new1['user'].astype('category').cat.codes

In [43]:
reviews_new1.head(4)

Unnamed: 0,user,rating,ID,user_id
12589584,Fu_Koios,9.0,112092,0
15241668,Fu_Koios,9.0,223033,0
5812906,beastvol,7.0,278,1
11577025,beastvol,5.0,12004,1


In [44]:
reviews_new2 = reviews_new1.drop(columns='user')

In [45]:
reviews_new2.tail(4)

Unnamed: 0,rating,ID,user_id
13514829,7.0,2399,351047
9162339,10.0,553,351047
12353561,9.0,1552,351047
7487111,7.0,590,351047


In [46]:
reviews_new2.columns = ['rating', 'game_id', 'user_id']

In [47]:
# Drop the users with under X number of reviews?

In [48]:
reader = Reader(rating_scale=(1,10))
svd = SVD()

In [49]:
data = Dataset.load_from_df(reviews_new2, reader)

In [50]:
#cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

In [51]:
type(data)

surprise.dataset.DatasetAutoFolds

### Subset smaller dataframe

In [52]:
reviews_new3 = reviews_new2[0:100000]

In [53]:
reviews_new3.shape

(100000, 3)

In [54]:
reviews_new3.tail(3)

Unnamed: 0,rating,game_id,user_id
12740681,8.0,155255,2437
2857632,7.0,110327,2437
8798057,8.0,152162,2437


In [55]:
reviews_new3.rating.value_counts()

7.00    22278
8.00    18865
6.00    14877
9.00     8961
5.00     7169
        ...  
6.68        1
9.79        1
7.93        1
7.46        1
5.53        1
Name: rating, Length: 543, dtype: int64

In [56]:
reviews_new3.user_id.value_counts()

988     841
139     753
2373    688
845     671
552     655
       ... 
1717      1
1438      1
1822      1
159       1
1887      1
Name: user_id, Length: 2438, dtype: int64

In [57]:
reviews_new3.head(3)

Unnamed: 0,rating,game_id,user_id
12589584,9.0,112092,0
15241668,9.0,223033,0
5812906,7.0,278,1


In [58]:
cols = ['user_id', 'game_id', 'rating']

In [59]:
reviews_new3 = reviews_new3[cols]

In [60]:
reviews_new3.head(3)

Unnamed: 0,user_id,game_id,rating
12589584,0,112092,9.0
15241668,0,223033,9.0
5812906,1,278,7.0


In [61]:
data = Dataset.load_from_df(reviews_new3, reader)

In [62]:
dataset = data.build_full_trainset()

In [63]:
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  2438 

Number of items:  11115


In [64]:
sim_cos = {'name':'cosine', 'user_based':False}

In [66]:
# basic = KNNBasic(sim_options=sim_cos)
# basic.fit(trainset)

In [67]:
#basic.sim

In [68]:
#predictions = basic.test(testset)

In [69]:
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=6, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 6 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Mean    Std     
RMSE (testset)    1.3751  1.3627  1.3533  1.3706  1.3601  1.3460  1.3613  0.0098  
MAE (testset)     1.0224  1.0122  1.0102  1.0146  1.0093  1.0009  1.0116  0.0064  
Fit time          5.06    4.36    4.27    4.33    4.22    4.27    4.42    0.29    
Test time         0.10    0.08    0.09    0.08    0.14    0.08    0.09    0.02    


{'test_rmse': array([1.3751481 , 1.36272094, 1.35329267, 1.37064317, 1.36012992,
        1.34599578]),
 'test_mae': array([1.02244104, 1.01219852, 1.01016731, 1.01459226, 1.00927821,
        1.0009335 ]),
 'fit_time': (5.057368278503418,
  4.362335443496704,
  4.269955396652222,
  4.32862114906311,
  4.221954584121704,
  4.271955251693726),
 'test_time': (0.09699797630310059,
  0.08192610740661621,
  0.0859987735748291,
  0.08192801475524902,
  0.1360030174255371,
  0.0840001106262207)}

In [70]:
trainset = data.build_full_trainset()

In [71]:
svd.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x206022e2910>

In [72]:
svd.predict('3', '34241241241414', verbose=True)

user: 3          item: 34241241241414 r_ui = None   est = 7.04   {'was_impossible': False}


Prediction(uid='3', iid='34241241241414', r_ui=None, est=7.0350295792, details={'was_impossible': False})

In [73]:
reviews_new3[reviews_new3['user_id'] == 2437]

Unnamed: 0,user_id,game_id,rating
10123139,2437,195544,6.8
2416334,2437,129622,7.2
3172735,2437,3076,7.8
12940741,2437,431,6.5
12849704,2437,173761,7.0
...,...,...,...
4817517,2437,432,7.0
10484189,2437,251658,6.5
12740681,2437,155255,8.0
2857632,2437,110327,7.0


In [74]:
svd.predict('2437', '112092', 3, verbose=True)

user: 2437       item: 112092     r_ui = 3.00   est = 7.04   {'was_impossible': False}


Prediction(uid='2437', iid='112092', r_ui=3, est=7.0350295792, details={'was_impossible': False})

In [75]:
# Trying to make a different rec system

In [76]:
reviews_new4 = reviews_new3

In [77]:
reviews_new4.head(3)

Unnamed: 0,user_id,game_id,rating
12589584,0,112092,9.0
15241668,0,223033,9.0
5812906,1,278,7.0


In [78]:
# Graph the most common rating
# Average number of ratings per user

In [79]:
# Get the names of the top 10 most reviewed game (#1 is pandemic, #2 Catan and so on)
reviews_new4['game_id'].value_counts().head()

30549    657
13       641
822      616
68448    535
36218    504
Name: game_id, dtype: int64

In [80]:
# Users with the most review
# N 1 is the user 988, n2 is 139 and so on
reviews_new4['user_id'].value_counts().head()

988     841
139     753
2373    688
845     671
552     655
Name: user_id, dtype: int64

In [81]:
indices = pd.Series(reviews_new4.index, index=reviews_new4['game_id'])

In [82]:
def hybrid(user_id, game_id):
    idx = indices[game_id]
    tmdbId = reviews_new4.loc[game_id]
    #print(idx)
    movie_id = reviews_new4.loc[game_id]
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    game_indices = [i[0] for i in sim_scores]
    
    bgs = reviews_new4.iloc[game_indices][['game_id', 'rating']]
    bgs['est'] = bgs['game_id'].apply(lambda x: svd.predict(user_id, indices_map.loc[x]['game_id']).est)
    bgs = bgs.sort_values('est', ascending=False)
    return bgs.head(10)

In [83]:
clustering = reviews_new4.sort_values(by='game_id')

In [84]:
clustering

Unnamed: 0,user_id,game_id,rating
8971751,60,1,7.70
8941883,316,1,8.00
8772935,1629,1,9.00
8981678,763,1,7.50
8842517,380,1,8.25
...,...,...,...
14778289,1263,305682,7.50
14778251,1432,305682,8.00
15473712,1507,307305,6.00
15461371,2227,313850,7.00


In [85]:
clustering = clustering.set_index('game_id')

In [86]:
clustering = clustering.pivot_table(values='rating', index=clustering.index, columns='user_id', aggfunc='first')

In [87]:
clustering.isna().sum()

user_id
0       11113
1       11106
2       11101
3       11110
4       11114
        ...  
2433    11113
2434    11110
2435    11093
2436    11009
2437    11034
Length: 2438, dtype: int64

In [88]:
clustering

user_id,0,1,2,3,4,5,6,7,8,9,...,2428,2429,2430,2431,2432,2433,2434,2435,2436,2437
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,5.0,,...,,,,9.0,10.0,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,4.0,,...,,,,10.0,1.0,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,8.0,,,,,,7.0,,...,,,,1.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303057,,,,,,,,,,,...,,,,,,,,,,
303650,,,,,,,,,,,...,,,,,,,,,,
305682,,,,,,,,,,,...,,,,,,,,,,
307305,,,,,,,,,,,...,,,,,,,,,,


Rows = Users
Columns = Boardgames

In [90]:
clustering1 = clustering.fillna(0)

In [91]:
u, s, vt = svds(clustering1, k=8)

In [92]:
u.dot(np.diag(s).dot(vt))

array([[-2.65199027e-04,  2.13156769e-01,  2.03206268e-01, ...,
         1.02077730e-01,  4.72614313e-01, -3.02648740e-02],
       [-1.33866881e-04,  1.96626154e-02,  4.08406813e-02, ...,
         9.67691117e-03,  7.21559698e-02,  1.76199537e-02],
       [ 2.07973521e-03,  2.90072927e-01,  1.04198573e-01, ...,
         1.46370535e-01,  1.30219581e+00,  5.94359154e-01],
       ...,
       [-4.95463619e-05,  2.42649093e-03,  1.48328183e-03, ...,
        -2.00496740e-03, -6.12406311e-03,  7.94225042e-03],
       [ 1.35933107e-05, -2.48996766e-03,  2.42854513e-03, ...,
         2.67933375e-03,  1.16359572e-02,  8.59494868e-03],
       [ 1.55786262e-21, -7.56402149e-19, -7.92762495e-19, ...,
        -2.47150604e-19, -1.42118011e-18,  2.51016040e-19]])

In [93]:
users_prediction = np.round(u.dot(np.diag(s).dot(vt)))

In [94]:
users_prediction[0][440]

-0.0

- Rows = Users 
- Columns = Boardgames

In [95]:
clustering.shape

(11115, 2438)

In [96]:
# list(clustering.columns)

In [97]:
# list(range(0, 2438))

In [98]:
columns_index = dict(zip(list(clustering.columns), list(range(0, 2438))))

In [99]:
# Fazer o for loop pra pegar previsoes baseado no array "user_prediction"

In [100]:
testset = trainset.build_anti_testset()

In [101]:
predictions = svd.test(testset)

In [102]:
predictions

[Prediction(uid=0, iid=278, r_ui=7.0350295792, est=7.248734085463866, details={'was_impossible': False}),
 Prediction(uid=0, iid=12004, r_ui=7.0350295792, est=7.140395052233282, details={'was_impossible': False}),
 Prediction(uid=0, iid=9209, r_ui=7.0350295792, est=7.621029250206767, details={'was_impossible': False}),
 Prediction(uid=0, iid=13, r_ui=7.0350295792, est=7.4307862640390985, details={'was_impossible': False}),
 Prediction(uid=0, iid=118, r_ui=7.0350295792, est=7.936779701690626, details={'was_impossible': False}),
 Prediction(uid=0, iid=823, r_ui=7.0350295792, est=7.52410223440353, details={'was_impossible': False}),
 Prediction(uid=0, iid=3076, r_ui=7.0350295792, est=8.0940395962586, details={'was_impossible': False}),
 Prediction(uid=0, iid=18602, r_ui=7.0350295792, est=8.238026392002881, details={'was_impossible': False}),
 Prediction(uid=0, iid=5737, r_ui=7.0350295792, est=7.61381598984027, details={'was_impossible': False}),
 Prediction(uid=0, iid=15045, r_ui=7.035029

# First Simple Model

In [104]:
print(accuracy.rmse(predictions))

RMSE: 0.6028
0.6028279531336033


In [114]:
svd.predict(34, 3415)

Prediction(uid=34, iid=3415, r_ui=None, est=7.508444262990429, details={'was_impossible': False})

### Second try