In [3]:
import pandas as pd

In [4]:
grid = pd.read_csv('./Data/full_game_user_grid.csv')

In [5]:
grid = grid.drop(columns = 'viewer_count', axis = 1)

In [6]:
grid = grid.dropna()

In [7]:
grid['user_name'].nunique()

164397

In [8]:
grid = grid.dropna(how='any',axis=0) 

In [9]:
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323774 entries, 0 to 1326954
Data columns (total 6 columns):
user_name      1323774 non-null object
game_name      1323774 non-null object
game_genres    1323774 non-null object
language       1323774 non-null object
started_at     1323774 non-null object
max            1323774 non-null int64
dtypes: int64(1), object(5)
memory usage: 70.7+ MB


In [11]:
max_value_username = pd.DataFrame(grid.groupby('user_name')['max'].max().reset_index())

In [12]:
max_val_dict = max_value_username.groupby('user_name')['max'].apply(list).to_dict()

In [13]:
grid['max_game'] = grid['user_name'].map(max_val_dict)
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323774 entries, 0 to 1326954
Data columns (total 7 columns):
user_name      1323774 non-null object
game_name      1323774 non-null object
game_genres    1323774 non-null object
language       1323774 non-null object
started_at     1323774 non-null object
max            1323774 non-null int64
max_game       1323774 non-null object
dtypes: int64(1), object(6)
memory usage: 80.8+ MB


In [16]:
grid['max_game_int'] = grid.max_game.str[0].astype(int)
grid = grid.drop('max_game', axis = 1)
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323774 entries, 0 to 1326954
Data columns (total 7 columns):
user_name       1323774 non-null object
game_name       1323774 non-null object
game_genres     1323774 non-null object
language        1323774 non-null object
started_at      1323774 non-null object
max             1323774 non-null int64
max_game_int    1323774 non-null int64
dtypes: int64(2), object(5)
memory usage: 80.8+ MB


In [17]:
grid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1323774 entries, 0 to 1326954
Data columns (total 7 columns):
user_name       1323774 non-null object
game_name       1323774 non-null object
game_genres     1323774 non-null object
language        1323774 non-null object
started_at      1323774 non-null object
max             1323774 non-null int64
max_game_int    1323774 non-null int64
dtypes: int64(2), object(5)
memory usage: 80.8+ MB


In [18]:
grid['score'] = grid['max']/grid['max_game_int']
grid.sample(3)

Unnamed: 0,user_name,game_name,game_genres,language,started_at,max,max_game_int,score
532677,MrWilsonWilson,ARK,Action,en,2019-06-03 08:35:29,38,63,0.603175
177421,therealpyro2468,NBA 2K19,Sports Game,en,2019-06-05 11:53:49,0,5,0.0
151182,tottecsgo,Counter-Strike: Global Offensive,FPS,sv,2019-06-06 08:42:47,21,48,0.4375


In [19]:
from sklearn.preprocessing import minmax_scale
grid['scaled_score'] = minmax_scale(grid['score'], feature_range=(1, 5))

In [20]:
grid = grid.dropna()

In [21]:
grid.groupby('user_name')['scaled_score'].count().reset_index().sort_values('scaled_score', ascending=False)[:10]

Unnamed: 0,user_name,scaled_score
26697,Fonbet_RocketLeague,363
75147,StreamerHouse,363
119848,luke4316live,318
106982,gaules,305
26696,Fonbet_RLH,264
87152,WithoutAim,243
21372,DreamHackCS,240
57698,OgamingSC2,240
119026,livekiss,236
22301,ESL_CSGO,232


In [22]:
grid.groupby('game_genres')['scaled_score'].count().reset_index().sort_values('scaled_score', ascending=False)

Unnamed: 0,game_genres,scaled_score
0,Action,202098
29,Shooter,158760
7,FPS,126001
24,RPG,113292
15,MMORPG,85305
19,Open World,65098
33,Strategy,55020
30,Simulation,54030
1,Adventure Game,51842
2,Card & Board Game,45239


In [23]:
min_number_scores = 5
filter_users = grid['user_name'].value_counts() > min_number_scores
filter_users = filter_users[filter_users].index.tolist()

In [24]:
grid_new = grid[(grid['user_name'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(grid.shape))
print('The new data frame shape:\t{}'.format(grid_new.shape))

The original data frame shape:	(1304800, 9)
The new data frame shape:	(1083894, 9)


In [26]:
import pickle

In [27]:
pickle.dump(grid_new, open("./Data/grid_new.pkl", "wb" ) )

In [None]:
grid_new = pickle.load( open( "./Data/grid_new", "rb" ) )

In [28]:
grid_new = pickle.load( open( "./Data/grid_new", "rb" ) )

In [133]:
import surprise
from surprise import Dataset, accuracy, Reader
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

In [134]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(grid_new[['user_name', 'game_genres', 'scaled_score']], reader)

In [136]:
from surprise import NormalPredictor, BaselineOnly, SVD, SVDpp, SlopeOne, NMF, KNNBaseline,KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:


    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')  

Estimating biases using als...


In [None]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

In [None]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

In [73]:
trainset, testset = train_test_split(data, test_size=.25)

# Train the algorithm on the trainset, and predict ratings for the testset
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


In [74]:
surprise.accuracy.fcp(predictions, verbose=True)

FCP:  0.4923


0.4923476410542206

In [None]:
dump.dump('./dump_file', predictions, algo)
predictions, algo = dump.load('./dump_file')

####  Predicting Genres for Streamers (user-based similarities) ####

In [77]:
from collections import defaultdict

In [78]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [79]:
top_n = get_top_n(predictions, n=10)

In [80]:
get_top_n(pred)

defaultdict(list,
            {'opium930': [('Euro Truck Simulator 2', 0.9039090529960911)],
             'MrCruickums': [('Battlefield V', 0.9385727610519978)],
             'KendricSwissh': [('World of Warcraft', 0.9964585977214553),
              ('Total War: Three Kingdoms', 0.95813917972864)],
             '_리시안_': [('Just Chatting', 0.9506039242572153)],
             'Yimmo_': [('Mario Party 3', 1),
              ('The Legend of Zelda: Ocarina of Time', 0.886493458435246)],
             'Vordie': [('Food & Drink', 0.8974084655763193)],
             'FPSFLX': [("PLAYERUNKNOWN'S BATTLEGROUNDS", 0.9002812291126725)],
             'JeebusSC': [('Path of Exile', 0.9747030047149603)],
             '초보서포터': [('Lost Ark Online', 0.9066265519575645)],
             'MrFaragrim': [('Magic: The Gathering', 0.9761284723069288)],
             'MayChan': [('Art', 0.9387469758530322)],
             'Zepherina': [('FINAL FANTASY XIV Online', 0.9846527467128627)],
             'RussTTrumbone': [('

In [None]:
#for uid, user_ratings in top_n.items():
    #print(uid, [iid for (iid, _) in user_ratings])

#### Predicting Similar Genres Based on Current Genre (item-based similarity) #### 

In [None]:
# First, train the algortihm to compute the similarities between items
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> genre
rid_to_name, name_to_rid = read_genre_names()

# Retrieve inner id of the genre you are looking for
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)