# Project

Michael Wendell

## Setup

Load in required imports for data organization and graphical formatting.

In [1]:
from pathlib import Path
import re

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from lenskit import topn

In [4]:
import gzip

In [5]:
import seedbank
seedbank.initialize(20230306)
rng = seedbank.numpy_rng()

### Data Load
Data retrieved from formatted source on kaggle.
https://www.kaggle.com/datasets/antonkozyriev/game-recommendations-on-steam?select=recommendations.csv

In [4]:
ratings = pd.read_csv('data/recommendations.csv')
ratings

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,975370,0,0,2022-12-12,True,36.3,21265,0
1,304390,4,0,2017-02-17,False,11.5,1166,1
2,1085660,2,0,2019-11-17,True,336.5,97101,2
3,703080,0,0,2022-09-23,True,27.4,99068,3
4,526870,0,0,2021-01-10,True,7.9,9721,4
...,...,...,...,...,...,...,...,...
11480207,311690,0,0,2020-09-17,True,16.1,3250115,11480207
11480208,1449850,0,0,2022-02-01,False,0.7,42845,11480208
11480209,960090,3,0,2020-11-26,True,200.2,190650,11480209
11480210,311690,0,0,2018-01-07,True,5.7,4429817,11480210


In [5]:
ratings[['app_id', 'user_id']].nunique()

app_id        1874
user_id    6053352
dtype: int64

First, we will drop data we aren't using and change rating to a integer value.

In [26]:
ratings = ratings.drop(['helpful', 'funny', 'date', 'review_id'], axis=1)

In [27]:
ratings['rating'] = ratings['is_recommended'] * 1

In [28]:
ratings

Unnamed: 0,app_id,is_recommended,hours,user_id,rating
0,975370,True,36.3,21265,1
1,304390,False,11.5,1166,0
2,1085660,True,336.5,97101,1
3,703080,True,27.4,99068,1
4,526870,True,7.9,9721,1
...,...,...,...,...,...
11480207,311690,True,16.1,3250115,1
11480208,1449850,False,0.7,42845,0
11480209,960090,True,200.2,190650,1
11480210,311690,True,5.7,4429817,1


In [29]:
playtime = ratings.groupby(['app_id']).mean(['hours'])
playtime

Unnamed: 0_level_0,is_recommended,hours,user_id,rating
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
30,0.904965,85.698440,3.401339e+06,0.904965
60,0.855105,16.416272,3.345418e+06,0.855105
70,0.964501,36.690691,3.149683e+06,0.964501
220,0.982737,39.984859,3.155729e+06,0.982737
400,0.997305,20.179704,3.257392e+06,0.997305
...,...,...,...,...
2179380,0.960000,9.900000,2.353501e+06,0.960000
2190390,0.833333,64.933333,3.349545e+06,0.833333
2208920,0.578642,46.661707,3.077660e+06,0.578642
2211280,0.677419,10.758065,2.732957e+06,0.677419


There are 1874 different games in the dataset.

In [30]:
rev_num = ratings.groupby(['user_id'])['rating'].count()
rev_num

user_id
0          2
1          1
2          1
3          2
4          2
          ..
6230639    1
6230640    1
6230641    1
6230642    1
6230643    1
Name: rating, Length: 6053352, dtype: int64

There are 6,053,352 different users in the dataset.

I want to look at whether hours spent in a game creates a more impacful rating, so below applies a weighted rating based off the user recommending a game and the hours played in comparison to the average for all user that played the game.

In [31]:
for index, row in ratings.iterrows():
    if (row['is_recommended'] == False):
        ratings.at[index, 'rating'] = 0
    elif (row['hours'] >= playtime.loc[row['app_id']]['hours']):
        ratings.at[index, 'rating'] = 5
    elif (row['hours'] >= 0.8 * playtime.loc[row['app_id']]['hours']):
         ratings.at[index, 'rating'] = 4
    elif(row['hours'] >= 0.5 * playtime.loc[row['app_id']]['hours']):
         ratings.at[index, 'rating'] = 3
    elif(row['hours'] >= 0.2 * playtime.loc[row['app_id']]['hours']):
         ratings.at[index, 'rating'] = 2
    else:
         ratings.at[index, 'rating'] = 1

In [32]:
ratings

Unnamed: 0,app_id,is_recommended,hours,user_id,rating
0,975370,True,36.3,21265,2
1,304390,False,11.5,1166,0
2,1085660,True,336.5,97101,4
3,703080,True,27.4,99068,2
4,526870,True,7.9,9721,1
...,...,...,...,...,...
11480207,311690,True,16.1,3250115,1
11480208,1449850,False,0.7,42845,0
11480209,960090,True,200.2,190650,5
11480210,311690,True,5.7,4429817,1


Rename the columns for usabillity.

In [34]:
ratings = ratings.rename(columns={'app_id':'item', 'user_id':'user'})

The metadata for game genres was stored in a seperate file which is read below, orginized in a usablle format and stored.

In [29]:
metadata = pd.read_json('data/games_metadata.json', lines=True)
metadata = metadata.drop(['description'], axis=1)
metadata = metadata.rename(columns={'app_id':'item', 'tags':'genre'})
metadata = metadata.explode('genre')
metadata.to_parquet('data/genres.parquet', index=False, compression='zstd')

The cells below re-read the files and alter the value types in an attempt to lower the size, so pytorch could run with more features.

In [9]:
genres =  pd.read_parquet('data/genres.parquet')
devs =  pd.read_parquet('data/devs_full.parquet')

In [19]:
genres['genre_id'] = pd.factorize(genres['genre_id'])[0]

In [21]:
genres.to_parquet('data/genres.parquet', index=False, compression='zstd')

In [22]:
devs['author_id'] = pd.factorize(devs['author_id'])[0]

In [23]:
dev_rat = pd.DataFrame(allratings.item.unique(), columns=['item'])
devs2 = pd.merge(devs, dev_rat, on='item', how='inner')
devs2.to_parquet('data/devs.parquet', index=False, compression='zstd')

In [25]:
genres =  pd.read_parquet('data/genres.parquet')
devs['item'] = devs['item'].astype(np.int32)
devs['author_id'] = devs['author_id'].astype(np.int32)

genres['item'] = genres['item'].astype(np.int32)
genres['genre_id'] = genres['genre_id'].astype(np.int32)

devs.to_parquet('data/devs.parquet', index=False, compression='zstd')
genres.to_parquet('data/genres.parquet', index=False, compression='zstd')

### Splitting Data

Here we split the data to create testing and training sets, so we can then evaluate the performance of each algorithm.

In [24]:
from sklearn.model_selection import train_test_split

In [18]:
allratings = pd.read_parquet('data/rating_weighted.parquet')

In [26]:
allratings = allratings.drop(columns=['hours', 'is_recommended'])
allratings

Unnamed: 0,item,user,rating
0,975370,21265,2
1,304390,1166,0
2,1085660,97101,4
3,703080,99068,2
4,526870,9721,1
...,...,...,...
11480207,311690,3250115,1
11480208,1449850,42845,0
11480209,960090,190650,5
11480210,311690,4429817,1


In [27]:
allratings['item'] = allratings['item'].astype(np.int32)
allratings['user'] = allratings['user'].astype(np.int32)

In [28]:
train, test  = train_test_split(allratings, test_size=0.40, random_state=42)

In [29]:
train_users = pd.DataFrame(train.groupby('user')['item'].count())
train_users = train_users.reset_index()
train_users = train_users.drop(['item'], axis=1)

In [30]:
test = pd.merge(test, train_users, on='user', how='inner')

We merge the test set with the training users so we can make sure that we are only using users that we have at least one rating for in the training data.

In [31]:
test

Unnamed: 0,item,user,rating
0,1290000,1477295,3
1,526870,1477295,5
2,252490,365647,5
3,239030,365647,4
4,420290,1379285,3
...,...,...,...
2555222,1449850,1688188,0
2555223,252490,4540833,5
2555224,546560,2835221,3
2555225,1056640,1019106,5


In [32]:
test[['user', 'item']].nunique()

user    1359562
item       1858
dtype: int64

There are still 1,359,562 users in the testing set and 1,858 games, so we will select 25,000 for each dev set and 50,000 for the eval.

In [33]:
USERS_PER_SAMPLE = 25000
users = np.unique(test['user'])

In [34]:
users = rng.permutation(users)

The dev and eval sets are then created and stored as well as the training data.

In [35]:
dev_users = users[:USERS_PER_SAMPLE]
eval_users = users[USERS_PER_SAMPLE:(USERS_PER_SAMPLE*2)]

In [36]:
dev_actions = test[test['user'].isin(dev_users)]
eval_actions = test[test['user'].isin(eval_users)]

In [37]:
dev_actions = dev_actions[['item', 'user', 'rating']]
eval_actions = eval_actions[['item', 'user', 'rating']]

In [38]:
dev_actions.to_parquet('data/a3-dev-actions.parquet', index=False, compression='zstd')

In [39]:
eval_actions.to_parquet('data/a3-eval-actions.parquet', index=False, compression='zstd')

In [40]:
train.to_parquet('data/a3-train-actions.parquet', index=False, compression='zstd')

A combination of both the dev and eval was also saved for future use.

In [41]:
frames = [dev_actions, eval_actions]
rating = pd.concat(frames)

In [42]:
rating.to_parquet('data/ratings.parquet', index=False, compression='zstd')

Now all data needed for training and recommendation is now organized and saved.