In [2]:
import pandas as pd

user_ratings = pd.read_csv( '.\\user_ratings.csv',)
user_ratings['Username'] = user_ratings['Username'].astype('str')
user_ratings.sort_values(by=['Username'])

#remove duplicate ratings of a game by same user - keep latest occurence
user_ratings.drop_duplicates(subset=['Username', 'BGGId'], keep='last', inplace=True)

In [3]:
#inspect the low occurence of user ratings

threshold = 10

value_counts = user_ratings['Username'].value_counts()
print('Number of users that rated less than 10 games:', len(value_counts[value_counts < threshold]))
print('Number of ratings before filtering these values:', len(user_ratings))

#filter these values away
filtered = user_ratings[user_ratings['Username'].isin(value_counts.index[value_counts >= threshold])]
print('Number of ratings after filtering:', len(filtered))
print('Ratings deleted:', len(user_ratings) - len(filtered))


#inspect the low occurence of game ratings

value_counts = filtered['BGGId'].value_counts()
print('Number of games with less than 10 ratings:', len(value_counts[value_counts < threshold]))
print('Number of ratings before filtering these values:', len(filtered))

#filter these values away
filtered2 = filtered[filtered['BGGId'].isin(value_counts.index[value_counts >= threshold])]
print('Number of ratings after filtering:', len(filtered2))
print('Ratings deleted:', len(filtered) - len(filtered2))

Number of users that rated less than 10 games: 186817
Number of ratings before filtering these values: 18909528
Number of ratings after filtering: 18340336
Ratings deleted: 569192
Number of games with less than 10 ratings: 6
Number of ratings before filtering these values: 18340336
Number of ratings after filtering: 18340293
Ratings deleted: 43


In [4]:
#check again - some deletions may have lowered occurrences

value_counts = filtered2['Username'].value_counts()
print('Number of users that rated less than 10 games:', len(value_counts[value_counts < threshold]))
print('Number of ratings before filtering these values:', len(filtered2))

#filter these values away
filtered3 = filtered2[filtered2['Username'].isin(value_counts.index[value_counts >= threshold])]
print('Number of ratings after filtering:', len(filtered3))
print('Entries deleted:', len(filtered2) - len(filtered3))


#check again - some deletions may have lowered occurrences

value_counts = filtered3['BGGId'].value_counts()
print('Number of games with less than 10 ratings:', len(value_counts[value_counts < threshold]))
print('Number of ratings before filtering these values:', len(filtered))

#filter these values away
filtered4 = filtered3[filtered3['BGGId'].isin(value_counts.index[value_counts >= threshold])]
print('Number of ratings after filtering:', len(filtered4))
print('Entries deleted:', len(filtered3) - len(filtered4))

Number of users that rated less than 10 games: 1
Number of ratings before filtering these values: 18340293
Number of ratings after filtering: 18340284
Entries deleted: 9
Number of games with less than 10 ratings: 0
Number of ratings before filtering these values: 18340336
Number of ratings after filtering: 18340284
Entries deleted: 0


In [5]:
#save the result - preprocessing complete

#filtered4.to_csv('.\\user_ratings_cleaned.csv', index=False)

In [7]:
#filter deleted games away from all others files of dataset

print('Number of games before filtering:', user_ratings['Username'].nunique())
print('Number of games before filtering:', filtered4['Username'].nunique())



Number of games before filtering: 411375
Number of games before filtering: 224557


In [25]:
artists = pd.read_csv( '.\\themes.csv',)
orig_len = len(artists)
cond = artists['BGGId'].isin(filtered4['BGGId'])
cond = [not item for item in cond]
artists.drop(artists[cond].index, inplace = True)
print('Entries deleted:', orig_len - len(artists))
artists.to_csv('.\\themes_cleaned.csv', index=False)

Entries deleted: 6
