In [1]:
import os
from app.utils.universal_path import universal_path

data_dir = universal_path('../../data')
output_dir = universal_path('../../data/cleaned')

In [2]:
import pandas as pd

user_ratings = pd.read_csv(os.path.join(data_dir, 'user_ratings.csv'))
user_ratings

Unnamed: 0,BGGId,Rating,Username
0,213788,8.0,Tonydorrf
1,213788,8.0,tachyon14k
2,213788,8.0,Ungotter
3,213788,8.0,brainlocki3
4,213788,8.0,PPMP
...,...,...,...
18942210,165521,3.0,rseater
18942211,165521,3.0,Bluefox86
18942212,165521,3.0,serginator
18942213,193488,1.0,CaptainCattan


In [3]:
print(f'Number of user ratings: {len(user_ratings)}')

Number of user ratings: 18942215


In [4]:
# remove NaNs
user_ratings = user_ratings.dropna(subset=['Username'])
num_user_ratings = len(user_ratings)
print(f'Number of user ratings after deleting NaNs: {num_user_ratings}')

user_ratings['Username'] = user_ratings['Username'].astype('str')

# remove duplicate ratings of a game by the same user - keep only the latest occurrence
user_ratings = user_ratings.drop_duplicates(subset=['Username', 'BGGId'], keep='last')

num_user_ratings_after_deleting_duplicates = len(user_ratings)
num_deleted = num_user_ratings - num_user_ratings_after_deleting_duplicates
deleted_percent = num_deleted / num_user_ratings * 100
print(f'Number of user ratings after deleting duplicates: {num_user_ratings_after_deleting_duplicates} ({num_deleted} deleted ({deleted_percent:.2f}%))')

Number of user ratings after deleting NaNs: 18942152


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_ratings['Username'] = user_ratings['Username'].astype('str')


Number of user ratings after deleting duplicates: 18909465 (32687 deleted (0.17%))


In [5]:
# inspect the low occurrence of user ratings

threshold = 10
iterations = 2

for i in range(iterations):
    print(f'Iteration {i+1}')
    
    num_user_ratings_before = len(user_ratings)
    print(f'\tNumber of user ratings: {num_user_ratings_before}')

    num_ratings_per_user = user_ratings['Username'].value_counts()
    users_to_delete = num_ratings_per_user[num_ratings_per_user < threshold].index
    print(f'\tNumber of users who rated less than {threshold} games: {len(users_to_delete)}')
    
    user_ratings = user_ratings[~user_ratings['Username'].isin(users_to_delete)]
    num_user_ratings_after_deleting_users = len(user_ratings)
    print(f'\tNumber of user ratings after deleting users with low number of ratings: {num_user_ratings_after_deleting_users}')
    print(f'\tNumber of user ratings deleted: {num_user_ratings_before - num_user_ratings_after_deleting_users}')

    num_ratings_per_game = user_ratings['BGGId'].value_counts()
    games_to_delete = num_ratings_per_game[num_ratings_per_game < threshold].index
    print(f'\tNumber of games with less than {threshold} ratings: {len(games_to_delete)}')
    
    user_ratings = user_ratings[~user_ratings['BGGId'].isin(games_to_delete)]
    num_user_ratings_after_deleting_games = len(user_ratings)
    print(f'\tNumber of user ratings after deleting games with low number of ratings: {num_user_ratings_after_deleting_games}')
    print(f'\tNumber of user ratings deleted: {num_user_ratings_after_deleting_users - num_user_ratings_after_deleting_games}')

Iteration 1
	Number of user ratings: 18909465
	Number of users who rated less than 10 games: 186817
	Number of user ratings after deleting users with low number of ratings: 18340273
	Number of user ratings deleted: 569192
	Number of games with less than 10 ratings: 6
	Number of user ratings after deleting games with low number of ratings: 18340230
	Number of user ratings deleted: 43
Iteration 2
	Number of user ratings: 18340230
	Number of users who rated less than 10 games: 1
	Number of user ratings after deleting users with low number of ratings: 18340221
	Number of user ratings deleted: 9
	Number of games with less than 10 ratings: 0
	Number of user ratings after deleting games with low number of ratings: 18340221
	Number of user ratings deleted: 0


In [6]:
# save the result - preprocessing complete
user_ratings.to_csv(os.path.join(output_dir, 'user_ratings.csv'), index=False)

In [7]:
# filter deleted games away from all others files of dataset

for filename in os.listdir(data_dir):
    if filename.endswith('.csv') and filename != 'user_ratings.csv':
        df = pd.read_csv(os.path.join(data_dir, filename))
        orig_len = len(df)
        df = df[df['BGGId'].isin(user_ratings['BGGId'])]
        print(f'Number of entries deleted from {filename}: {orig_len - len(df)}')
        df.to_csv(os.path.join(output_dir, filename), index=False)


Number of entries deleted from publishers_reduced.csv: 6
Number of entries deleted from mechanics.csv: 6
Number of entries deleted from artists_reduced.csv: 6
Number of entries deleted from ratings_distribution.csv: 6
Number of entries deleted from games.csv: 6
Number of entries deleted from designers_reduced.csv: 6
Number of entries deleted from themes.csv: 6
Number of entries deleted from subcategories.csv: 6
