In [41]:
import pandas as pd
import numpy as np

In [42]:
grid = pd.read_csv('./Data/final_game_user_grid_06_07_19.csv')

In [43]:
grid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1387492 entries, 0 to 1387491
Data columns (total 7 columns):
user_name       1387492 non-null object
game_name       1385813 non-null object
game_genres     1383564 non-null object
language        1387490 non-null object
started_at      1387492 non-null object
viewer_count    1387492 non-null int64
max             1387492 non-null int64
dtypes: int64(2), object(5)
memory usage: 74.1+ MB


In [13]:
grid = grid.dropna()

In [14]:
grid = grid.dropna(how='any',axis=0) 

In [15]:
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1383562 entries, 0 to 1387491
Data columns (total 7 columns):
user_name       1383562 non-null object
game_name       1383562 non-null object
game_genres     1383562 non-null object
language        1383562 non-null object
started_at      1383562 non-null object
viewer_count    1383562 non-null int64
max             1383562 non-null int64
dtypes: int64(2), object(5)
memory usage: 84.4+ MB


In [15]:
# To begin, we're calculating the metric we'll be using to compare users and games. Since we're focused on the streamers, we don't have the traditional "I like this movie so I will rate it a 4.5 out of 5.0" ratings. Instead we're calculating how many viewers each streamer attracted with a particular game compared to their max viewer potential over the last week. For each user, one game is their ultimate streaming "5 out of 5" benchmark, and all other games they play are compared to that one, and normalized to ratings between 1 and 5. We are also using all the genres for each game to pinpoint how successful various genres and games have been for each streamer.

In [16]:
max_value_username = pd.DataFrame(grid.groupby('user_name')['max'].max().reset_index())

In [17]:
max_val_dict = max_value_username.groupby('user_name')['max'].apply(list).to_dict()

In [18]:
grid['max_game'] = grid['user_name'].map(max_val_dict)

In [19]:
grid['max_game_int'] = grid.max_game.str[0].astype(int)
grid = grid.drop('max_game', axis = 1)

In [20]:
grid['score'] = grid['max']/grid['max_game_int']

Unnamed: 0,user_name,game_name,game_genres,language,started_at,viewer_count,max,max_game_int,score
1122503,Chisaihato,Apex Legends,FPS,en,2019-06-03 05:00:25,16,16,62,0.258065
1254963,Anime_Boyfriend,Apex Legends,Shooter,en,2019-06-07 00:58:26,30,30,30,1.0
174748,Thrash2469,Counter-Strike: Global Offensive,FPS,en,2019-06-04 00:30:24,11,11,11,1.0


In [22]:
from sklearn.preprocessing import minmax_scale
grid['scaled_score'] = minmax_scale(grid['score'], feature_range=(1, 5))

In [23]:
grid = grid.dropna()

In [13]:
# now we have a listing for each user pairing them with each game they play, what genre it belongs to, and how many people watched them play each game compared to the max viewers they ever got for a stream during the week we examined.

In [26]:
grid.groupby('user_name')['scaled_score'].count().reset_index().sort_values('scaled_score', ascending=False)[:5]

Unnamed: 0,user_name,scaled_score
27473,Fonbet_RocketLeague,378
77300,StreamerHouse,375
123375,luke4316live,333
110114,gaules,315
27472,Fonbet_RLH,270


In [27]:
grid.groupby('game_genres')['scaled_score'].count().reset_index().sort_values('scaled_score', ascending=False)[:5]

Unnamed: 0,game_genres,scaled_score
0,Action,211633
29,Shooter,165931
7,FPS,131286
24,RPG,118595
15,MMORPG,88901


In [28]:
grid.groupby('game_name')['scaled_score'].count().reset_index().sort_values('scaled_score', ascending=False)[:5]

Unnamed: 0,game_name,scaled_score
128,Grand Theft Auto V,37860
180,Minecraft,35715
243,Rocket League,35043
37,Black Desert Online,33723
97,Escape From Tarkov,33366


In [29]:
min_number_scores = 5
filter_users = grid['user_name'].value_counts() > min_number_scores
filter_users = filter_users[filter_users].index.tolist()

In [30]:
grid_new = grid[(grid['user_name'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(grid.shape))
print('The new data frame shape:\t{}'.format(grid_new.shape))

The original data frame shape:	(1364027, 10)
The new data frame shape:	(1137673, 10)


In [10]:
# After reshaping the grid, we are going to extract the recommendations for game genre, game titles, and games similar to those already rated by the streamers as a three-pronged recommender approach.

In [5]:
import pickle

In [33]:
pickle.dump(grid_new, open("./Data/final_grid_06_07_19.pkl", "wb" ) )

In [6]:
grid_new = pickle.load( open( "./Data/final_grid_06_07_19.pkl", "rb" ) )

In [7]:
grid_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1137673 entries, 0 to 1387491
Data columns (total 10 columns):
user_name       1137673 non-null object
game_name       1137673 non-null object
game_genres     1137673 non-null object
language        1137673 non-null object
started_at      1137673 non-null object
viewer_count    1137673 non-null int64
max             1137673 non-null int64
max_game_int    1137673 non-null int64
score           1137673 non-null float64
scaled_score    1137673 non-null float64
dtypes: float64(2), int64(3), object(5)
memory usage: 95.5+ MB


In [7]:
grid_new.head(5)

Unnamed: 0,user_name,game_name,game_genres,language,started_at,viewer_count,max,max_game_int,score,scaled_score
0,龜狗,Grand Theft Auto V,Action,zh,2019-06-06 07:30:59,1735,1735,1735,1.0,5.0
1,龜狗,Grand Theft Auto V,Open World,zh,2019-06-06 07:30:59,1735,1735,1735,1.0,5.0
2,龜狗,Grand Theft Auto V,Driving/Racing Game,zh,2019-06-06 07:30:59,1735,1735,1735,1.0,5.0
3,龜狗,Grand Theft Auto V,Open World,zh,2019-06-06 07:30:59,1589,1589,1735,0.91585,4.663401
4,龜狗,Grand Theft Auto V,Action,zh,2019-06-06 07:30:59,1589,1589,1735,0.91585,4.663401


In [11]:
# preparing for genre recommendations based on viewership scores:

In [8]:
genres = grid_new.groupby(by = ['user_name', 'game_genres'])['scaled_score'].agg([np.mean])
games = grid_new.groupby(by = ['user_name', 'game_name'])['scaled_score'].agg([np.mean])

In [9]:
genres.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
user_name,game_genres,Unnamed: 2_level_1
00NothingLabs,Fighting,2.421053
00NothingLabs,Open World,1.578947
00NothingLabs,RPG,1.578947
00NothingLabs,Shooter,1.578947
00elu00,Action,4.0


In [456]:
games.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean
user_name,game_name,Unnamed: 2_level_1
00NothingLabs,Mortal Kombat 11,2.421053
00NothingLabs,Tom Clancy's The Division 2,1.578947
00elu00,Dead by Daylight,4.666667
00elu00,Deathgarden,2.0
01joga,PUBG MOBILE,3.75


In [10]:
genres.columns = genres.columns.map(''.join)
games.columns = games.columns.map(''.join)

In [11]:
genres = genres.reset_index()
games = games.reset_index()

In [466]:
genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183217 entries, 0 to 183216
Data columns (total 3 columns):
user_name      183217 non-null object
game_genres    183217 non-null object
mean           183217 non-null float64
dtypes: float64(1), object(2)
memory usage: 4.2+ MB


In [467]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93606 entries, 0 to 93605
Data columns (total 3 columns):
user_name    93606 non-null object
game_name    93606 non-null object
mean         93606 non-null float64
dtypes: float64(1), object(2)
memory usage: 2.1+ MB


In [468]:
genres.head(5)

Unnamed: 0,user_name,game_genres,mean
0,00NothingLabs,Fighting,2.421053
1,00NothingLabs,Open World,1.578947
2,00NothingLabs,RPG,1.578947
3,00NothingLabs,Shooter,1.578947
4,00elu00,Action,4.0


In [343]:
games.head(5)

Unnamed: 0,user_name,game_name,mean
0,00NothingLabs,Mortal Kombat 11,2.421053
1,00NothingLabs,Tom Clancy's The Division 2,1.578947
2,00elu00,Dead by Daylight,4.666667
3,00elu00,Deathgarden,2.0
4,01joga,PUBG MOBILE,3.75


In [31]:
#using Surprise to predict genres/games for a streamer based on their existing genre ratings:

In [12]:
import surprise
from surprise import Dataset, accuracy, Reader, NMF, NormalPredictor, BaselineOnly, CoClustering, SlopeOne, SVD, KNNBaseline
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

In [13]:
reader = Reader(rating_scale=(1, 5))
genre_data = Dataset.load_from_df(genres[['user_name', 'game_genres', 'mean']], reader)

In [14]:
game_data = Dataset.load_from_df(games[['user_name', 'game_name', 'mean']], reader)

In [15]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SlopeOne(), NormalPredictor(), BaselineOnly(), NMF(), CoClustering(), SVD()]:


    # Perform cross validation
    results = cross_validate(algorithm, genre_data, measures=['RMSE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SlopeOne,0.582375,0.573649,0.371326
NMF,0.652415,11.83246,0.290575
SVD,0.6851,7.95684,0.276936
BaselineOnly,0.735609,0.448382,0.196
CoClustering,0.78234,7.239472,0.331366
NormalPredictor,1.103746,0.204488,0.243219


In [345]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SlopeOne(), NormalPredictor(), BaselineOnly(), NMF(), CoClustering(), SVD()]:


    # Perform cross validation
    results = cross_validate(algorithm, game_data, measures=['RMSE'], cv=5, verbose=True)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

Evaluating RMSE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9578  0.9627  0.9627  0.9575  0.9611  0.9604  0.0023  
Fit time          0.50    0.53    0.53    0.53    0.51    0.52    0.01    
Test time         0.17    0.14    0.13    0.14    0.13    0.14    0.01    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1552  1.1548  1.1527  1.1556  1.1477  1.1532  0.0029  
Fit time          0.09    0.12    0.12    0.12    0.11    0.11    0.01    
Test time         0.14    0.11    0.10    0.10    0.10    0.11    0.01    
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std   

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.798742,0.287553,0.102698
SVD,0.804327,4.396158,0.106202
SlopeOne,0.960365,0.520149,0.143846
NMF,0.976619,7.758592,0.191688
CoClustering,1.027581,5.492498,0.09684
NormalPredictor,1.153218,0.111542,0.11117


In [43]:
# Based on the results, we will proceed with SlopeOne for genre predictions. More information about SlopeOne can be found here: https://arxiv.org/abs/cs/0702144

In [346]:
# Based on the results, we will proceed with BaselineOnly for game predictions.

In [16]:
algo_genre = SlopeOne()
cross_validate(algo_genre, genre_data, measures=['RMSE'], cv=7, verbose=True)

Evaluating RMSE of algorithm SlopeOne on 7 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Mean    Std     
RMSE (testset)    0.5701  0.5765  0.5663  0.5753  0.5635  0.5708  0.5655  0.5697  0.0046  
Fit time          0.52    0.61    0.60    0.61    0.60    0.57    0.56    0.58    0.03    
Test time         0.23    0.34    0.23    0.23    0.34    0.23    0.33    0.27    0.05    


{'test_rmse': array([0.57014546, 0.57649142, 0.56626513, 0.57528173, 0.5635193 ,
        0.57076932, 0.56554553]),
 'fit_time': (0.5240540504455566,
  0.6088509559631348,
  0.5955979824066162,
  0.6124591827392578,
  0.6031978130340576,
  0.5743191242218018,
  0.562340259552002),
 'test_time': (0.22563791275024414,
  0.34035491943359375,
  0.23003911972045898,
  0.22739100456237793,
  0.33681726455688477,
  0.22581791877746582,
  0.33237695693969727)}

In [17]:
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo_games = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo_games, game_data, measures=['RMSE'], cv=7, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 7 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Mean    Std     
RMSE (testset)    0.8027  0.7937  0.7926  0.7967  0.8050  0.7989  0.7933  0.7976  0.0045  
Fit time          0.14    0.17    0.17    0.17    0.18    0.21    0.17    0.17    0.02    
Test time         0.07    0.06    0.06    0.19    0.19    0.18    0.20    0.14    0.06    


{'test_rmse': array([0.80266606, 0.79368568, 0.79255473, 0.79669737, 0.80504617,
        0.79887923, 0.79334187]),
 'fit_time': (0.1444869041442871,
  0.17248296737670898,
  0.17104792594909668,
  0.1702878475189209,
  0.18003511428833008,
  0.20632290840148926,
  0.17349600791931152),
 'test_time': (0.06609511375427246,
  0.06372213363647461,
  0.06406188011169434,
  0.19368600845336914,
  0.18844985961914062,
  0.18450093269348145,
  0.1974625587463379)}

In [19]:
genre_trainset, genre_testset = train_test_split(genre_data, test_size=0.25)
algo_genre = SlopeOne()
genre_predictions = algo_genre.fit(genre_trainset).test(genre_testset)
accuracy.rmse(genre_predictions)

RMSE: 0.5929


0.5928608441524472

In [20]:
game_trainset, game_testset = train_test_split(game_data, test_size=0.25)
algo_games = BaselineOnly(bsl_options=bsl_options)
game_predictions = algo_games.fit(game_trainset).test(game_testset)
accuracy.rmse(game_predictions)

Estimating biases using als...
RMSE: 0.7951


0.7950523433682255

In [376]:
pickle.dump(algo_genre, open("./Data/SlopeOne_genre_model.pkl", "wb" ) )
algo_genre = pickle.load( open( "./Data/SlopeOne_genre_model.pkl", "rb" ) )

In [377]:
pickle.dump(algo_games, open("./Data/BaselineOnly_game_model.pkl", "wb" ) )
algo_games = pickle.load( open( "./Data/BaselineOnly_game_model.pkl", "rb" ) )

In [378]:
pickle.dump(genre_predictions, open("./Data/SlopeOne_genre_model_predictions.pkl", "wb" ) )
genre_predictions = pickle.load( open( "./Data/SlopeOne_genre_model_predictions.pkl", "rb" ) )

In [379]:
pickle.dump(game_predictions, open("./Data/BaselineOnly_game_model_predictions.pkl", "wb" ) )
game_predictions = pickle.load( open( "./Data/BaselineOnly_game_model_predictions.pkl", "rb" ) )

In [380]:
pickle.dump(genres, open("./Data/genres.pkl", "wb" ) )
genres = pickle.load( open( "./Data/genres.pkl", "rb" ) )

In [381]:
pickle.dump(games, open("./Data/games.pkl", "wb" ) )
games = pickle.load( open( "./Data/games.pkl", "rb" ) )

In [23]:
streamer_name = input('What is your streamer name? ')
streamer_genres = list(input ('Which game genres do you currently stream? ').split(', '))
streamer_games = list(input ('Which games do you currently steam? ').split(', '))

What is your streamer name? 龜狗
Which game genres do you currently stream? Driving/Racing Game, Action
Which games do you currently steam? Grand Theft Auto V


In [24]:
streamer_name, streamer_genres, streamer_games

('龜狗', ['Driving/Racing Game', 'Action'], ['Grand Theft Auto V'])

In [51]:
#Making genre/game predictions for a specific streamer based on their user name:

In [25]:
genres.head()

Unnamed: 0,user_name,game_genres,mean
0,00NothingLabs,Fighting,2.421053
1,00NothingLabs,Open World,1.578947
2,00NothingLabs,RPG,1.578947
3,00NothingLabs,Shooter,1.578947
4,00elu00,Action,4.0


In [26]:
def display_current_genres(streamer_name):
    user_genres = list(genres[genres['user_name']==streamer_name]['game_genres'])
    return user_genres
recorder_genres_list = display_current_genres(streamer_name)
full_genres = set(recorder_genres_list + streamer_genres)
full_genres

{'Action', 'Driving/Racing Game', 'IRL', 'MOBA', 'Open World'}

In [27]:
def display_current_games(streamer_name):
    user_games = list(games[games['user_name']==streamer_name]['game_name'])
    return user_games
recorder_games_list = display_current_games(streamer_name)
full_games = set(recorder_games_list + streamer_games)
full_games

{'Grand Theft Auto V', 'Just Chatting', 'League of Legends'}

####  Predicting Genres for Streamers (user-based similarities) ####

In [3]:
from collections import defaultdict

In [385]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [31]:
iids_genre = genres['game_genres'].unique()
iids_genre_to_predict = np.setdiff1d(iids_genre, full_genres)

In [32]:
iids_game = games['game_name'].unique()
iids_game_to_predict = np.setdiff1d(iids_game, full_games)

In [33]:
iids_genre_to_predict

array(['Action', 'Adventure Game', 'Card & Board Game', 'Compilation',
       'Creative', 'Driving/Racing Game', 'Educational Game', 'FPS',
       'Fighting', 'Flight Simulator', 'Gambling Game', 'Hidden Objects',
       'Horror', 'IRL', 'Indie Game', 'MMORPG', 'MOBA', 'Metroidvania',
       'NONE', 'Open World', 'Platformer', 'Point and Click', 'Puzzle',
       'RETROGAMEPLACEHOLDER', 'RPG', 'RTS', 'Rhythm & Music Game',
       'Roguelike', 'Series: Souls', 'Shooter', 'Simulation',
       'Sports Game', 'Stealth', 'Strategy', 'Survival', 'Visual Novel'],
      dtype=object)

In [34]:
iids_game_to_predict

array(['7 Days to Die', 'A Hat in Time', 'A Plague Tale: Innocence',
       'A Way Out', 'ARK', 'ASMR', 'ATLAS', 'Agar.io',
       'Age of Empires II', 'Age of Mythology', 'Albion Online',
       'Alien: Isolation', 'Always On', 'American Truck Simulator',
       'Anno 1800', 'Apex Legends', 'Arena of Valor', 'Arma 3', 'Art',
       "Assassin's Creed II", "Assassin's Creed III",
       "Assassin's Creed Odyssey", 'Assetto Corsa Competizione',
       'Astroneer', 'Auto Chess', 'BaBa Is You', 'Bad North',
       'Banjo-Kazooie', 'Barotrauma', 'Battalion 1944', 'Battle Brothers',
       'Battlefield 1', 'Battlefield 4', 'Battlefield V', 'Beat Saber',
       'BioShock', 'Black Desert Online', 'Blackjack', 'Blood & Truth',
       'Bloodborne', 'Borderlands 2', 'Brawl Stars', 'Brawlhalla',
       'Call of Duty: Black Ops 4', 'Call of Duty: Black Ops II',
       'Call of Duty: Black Ops III', 'Call of Duty: Modern Warfare 3',
       'Call of Duty: Modern Warfare Remastered', 'Call of Duty: WW

In [35]:
genre_testset_personal = [[streamer_name, iid, 0.] for iid in iids_genre_to_predict]
game_testset_personal = [[streamer_name, iid, 0.] for iid in iids_game_to_predict]

In [36]:
genre_testset_personal

[['龜狗', 'Action', 0.0],
 ['龜狗', 'Adventure Game', 0.0],
 ['龜狗', 'Card & Board Game', 0.0],
 ['龜狗', 'Compilation', 0.0],
 ['龜狗', 'Creative', 0.0],
 ['龜狗', 'Driving/Racing Game', 0.0],
 ['龜狗', 'Educational Game', 0.0],
 ['龜狗', 'FPS', 0.0],
 ['龜狗', 'Fighting', 0.0],
 ['龜狗', 'Flight Simulator', 0.0],
 ['龜狗', 'Gambling Game', 0.0],
 ['龜狗', 'Hidden Objects', 0.0],
 ['龜狗', 'Horror', 0.0],
 ['龜狗', 'IRL', 0.0],
 ['龜狗', 'Indie Game', 0.0],
 ['龜狗', 'MMORPG', 0.0],
 ['龜狗', 'MOBA', 0.0],
 ['龜狗', 'Metroidvania', 0.0],
 ['龜狗', 'NONE', 0.0],
 ['龜狗', 'Open World', 0.0],
 ['龜狗', 'Platformer', 0.0],
 ['龜狗', 'Point and Click', 0.0],
 ['龜狗', 'Puzzle', 0.0],
 ['龜狗', 'RETROGAMEPLACEHOLDER', 0.0],
 ['龜狗', 'RPG', 0.0],
 ['龜狗', 'RTS', 0.0],
 ['龜狗', 'Rhythm & Music Game', 0.0],
 ['龜狗', 'Roguelike', 0.0],
 ['龜狗', 'Series: Souls', 0.0],
 ['龜狗', 'Shooter', 0.0],
 ['龜狗', 'Simulation', 0.0],
 ['龜狗', 'Sports Game', 0.0],
 ['龜狗', 'Stealth', 0.0],
 ['龜狗', 'Strategy', 0.0],
 ['龜狗', 'Survival', 0.0],
 ['龜狗', 'Visual Novel

In [37]:
genre_predictions = algo_genre.test(genre_testset_personal)
game_predictions = algo_games.test(game_testset_personal)

In [38]:
genre_predictions[:3]

[Prediction(uid='龜狗', iid='Action', r_ui=0.0, est=3.3634996708756173, details={'was_impossible': False}),
 Prediction(uid='龜狗', iid='Adventure Game', r_ui=0.0, est=3.3609682666656693, details={'was_impossible': False}),
 Prediction(uid='龜狗', iid='Card & Board Game', r_ui=0.0, est=3.118685128563536, details={'was_impossible': False})]

In [39]:
game_predictions[:3]

[Prediction(uid='龜狗', iid='7 Days to Die', r_ui=0.0, est=3.488430105861643, details={'was_impossible': False}),
 Prediction(uid='龜狗', iid='A Hat in Time', r_ui=0.0, est=3.4908061229726934, details={'was_impossible': False}),
 Prediction(uid='龜狗', iid='A Plague Tale: Innocence', r_ui=0.0, est=3.189140341292573, details={'was_impossible': False})]

In [480]:
top_n_genres = get_top_n(genre_predictions)
top_n_games = get_top_n(game_predictions)

In [481]:
top_n_genres

defaultdict(list,
            {'龜狗': [('Hidden Objects', 5),
              ('Gambling Game', 3.6368098440144547),
              ('Survival', 3.5519132591178697),
              ('Flight Simulator', 3.50188920909382),
              ('Platformer', 3.4945232688514736),
              ('Creative', 3.4886907616846927),
              ('Driving/Racing Game', 3.4699744712345177),
              ('Series: Souls', 3.4098002976098907),
              ('IRL', 3.4052196384712934),
              ('Open World', 3.399279093516809)]})

In [482]:
top_n_games

defaultdict(list,
            {'龜狗': [('The Jackbox Party Pack 3', 4.017429204878657),
              ('Silent Hill', 3.9244264580051684),
              ('Minecraft', 3.916206247256748),
              ('Pokémon Ultra Sun/Ultra Moon', 3.9092810425061604),
              ('Team Fortress 2', 3.9083244156306503),
              ('Satisfactory', 3.8917342868059484),
              ('Trials Rising', 3.8833243302074334),
              ("Don't Starve Together", 3.8816117997251114),
              ('Detroit: Become Human', 3.8761385962225434),
              ('Just Dance 2019', 3.8760945013930908)]})

In [487]:
for uid, user_ratings in top_n_genres.items():
    print('For ' + uid + ', the recommended genres are:'+ str([iid for (iid, _) in user_ratings]))
genre_user_based_list = [iid for (iid, _) in user_ratings]
genre_user_based_list

For 龜狗, the recommended genres are:['Hidden Objects', 'Gambling Game', 'Survival', 'Flight Simulator', 'Platformer', 'Creative', 'Driving/Racing Game', 'Series: Souls', 'IRL', 'Open World']


['Hidden Objects',
 'Gambling Game',
 'Survival',
 'Flight Simulator',
 'Platformer',
 'Creative',
 'Driving/Racing Game',
 'Series: Souls',
 'IRL',
 'Open World']

In [488]:
for uid, user_ratings in top_n_games.items():
    print('For ' + uid + ', the recommended games are:'+ str([iid for (iid, _) in user_ratings]))

For 龜狗, the recommended games are:['The Jackbox Party Pack 3', 'Silent Hill', 'Minecraft', 'Pokémon Ultra Sun/Ultra Moon', 'Team Fortress 2', 'Satisfactory', 'Trials Rising', "Don't Starve Together", 'Detroit: Become Human', 'Just Dance 2019']


In [489]:
game_user_based_list = [iid for (iid, _) in user_ratings]
game_user_based_list

['The Jackbox Party Pack 3',
 'Silent Hill',
 'Minecraft',
 'Pokémon Ultra Sun/Ultra Moon',
 'Team Fortress 2',
 'Satisfactory',
 'Trials Rising',
 "Don't Starve Together",
 'Detroit: Become Human',
 'Just Dance 2019']

#### Predicting Similar Genres Based on Current Genre/Game (item-based similarity) #### 

In [196]:
import io  # needed because of weird encoding of u.item file
from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir

In [490]:
genre_group = grid_new.groupby(by = ['game_genres', 'user_name'])['scaled_score'].agg([np.mean])
game_group = grid_new.groupby(by = ['game_name', 'user_name'])['scaled_score'].agg([np.mean])

In [491]:
genre_group = genre_group.reset_index()
game_group = game_group.reset_index()

In [410]:
pickle.dump(genre_group, open("./Data/genre_group.pkl", "wb" ) )
genre_group = pickle.load( open( "./Data/genre_group.pkl", "rb" ) )

pickle.dump(game_group, open("./Data/game_group.pkl", "wb" ) )
game_group = pickle.load( open( "./Data/game_group.pkl", "rb" ) )

In [493]:
genre_group = pickle.load( open( "./Data/genre_group.pkl", "rb" ) )
reader = Reader(rating_scale=(1, 5))
genre_data = Dataset.load_from_df(genre_group[['user_name', 'game_genres', 'mean']], reader)
genre_trainset = genre_data.build_full_trainset()

game_group = pickle.load( open( "./Data/game_group.pkl", "rb" ) )
reader = Reader(rating_scale=(1, 5))
game_data = Dataset.load_from_df(game_group[['user_name', 'game_name', 'mean']], reader)
game_trainset = game_data.build_full_trainset()

In [494]:
#Fit the KNN algorithm to the data
sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0, 'user_based': False  # no shrinkage
               }
algo_genre_group = KNNBaseline(sim_options=sim_options, verbose = False)
algo_genre_group.fit(genre_trainset)


algo_game_group = KNNBaseline(sim_options=sim_options, verbose = False)
algo_game_group.fit(game_trainset)

<surprise.prediction_algorithms.knns.KNNBaseline at 0x14578c0f0>

In [495]:
#produce the list of genres/games needed to be evaluated

genre_inner_id_list = []
for genre in full_genres:
    inner = algo_genre_group.trainset.to_inner_iid(genre)
    genre_inner_id_list.append(inner)

game_inner_id_list = []
for game in full_games:
    inner = algo_game_group.trainset.to_inner_iid(game)
    game_inner_id_list.append(inner)

ValueError: Item Grand Theft Auto 5 is not part of the trainset.

In [423]:
# Retrieve inner ids of the nearest neighbors of the game in question.
genre_neighbors_list = []
for inner in genre_inner_id_list:
    genre_neighbors = algo_genre_group.get_neighbors(inner, k=3)
    genre_neighbors_list.append(genre_neighbors)

game_neighbors_list = []
for inner in game_inner_id_list:
    game_neighbors = algo_game_group.get_neighbors(inner, k=3)
    game_neighbors_list.append(game_neighbors)

In [424]:
print(genre_neighbors_list)
print(game_neighbors_list)

[[28, 15, 19], [6, 28, 1]]
[[251, 161, 67]]


In [425]:
# prioritize closest neighbors to all original genres/games mentioned

genre_final_list = []
for item in genre_neighbors_list:
    genre_final_list.append(item[0])
    genre_final_list.append(item[1])

game_final_list = []
for item in game_neighbors_list:
    game_final_list.append(item[0])
    game_final_list.append(item[1])

In [426]:
print(genre_final_list)
print(game_final_list)

[28, 15, 6, 28]
[251, 161]


In [427]:
#come up with a way to weigh the most frequent neighbors in all genres/games and combine with user-recommended ones.

In [429]:
genres = [algo_genre_group.trainset.to_raw_iid(iiid) for iiid in set(genre_final_list)]

games = [algo_game_group.trainset.to_raw_iid(iiid) for iiid in set(game_final_list)]
print('The nearest neighbors of your current genres are:' + str(genres))
print('The nearest neighbors of your current games are:' + str(games))

The nearest neighbors of your current genres are:['Series: Souls', 'Educational Game', 'MMORPG']
The nearest neighbors of your current games are:['Makers & Crafting', 'Sonic the Hedgehog']


In [335]:
print (', '.join(item for item in set(genres + user_based_list)))

Fighting, Gambling Game, Platformer, Visual Novel, Point and Click, Strategy, MMORPG, Sports Game, Survival, MOBA, Driving/Racing Game, Creative, Series: Souls, FPS, Adventure Game, RETROGAMEPLACEHOLDER


In [471]:
# combined outputs of both methods to produce results common in both
print("We recommend you stream the following genres: " + ', '.join(item for item in set(genres + genre_user_based_list)))
print("We recommend you stream the following games: " + ', '.join(item for item in set(games + game_user_based_list)))


ValueError: Unable to coerce to Series, length must be 3: given 10