In [17]:
import pandas as pd

In [18]:
# Load the raw board-game table and key it by game_id.
# 'year_published' is renamed to 'Year_of_Release' so the column name matches
# the video-game table when the two frames are unified later.
df = (
    pd.read_csv("./data/board_games.csv")
    .set_index('game_id')
    .rename(columns={'year_published': 'Year_of_Release'})
)

#Split categories into columns
# n=2 caps the split at three pieces: games listing more than three
# categories keep the remainder (still comma-separated) in 'other_category'.
# reindex pads with all-NaN columns when no game lists three categories, so
# the three-column assignment always receives exactly three columns and
# never raises "Columns must be same length as key".
df[['principal_category', 'secondary_category', 'other_category']] = (
    df['category']
    .str.split(',', n=2, expand=True)
    .reindex(columns=range(3))
)

#delete columns that are not used further on
columns_to_drop = ["description", "image", "thumbnail", "compilation", "family", "mechanic"]
df.drop(columns=columns_to_drop, inplace=True)

#manage null values --> set to 'unknown'
# fillna is the vectorised equivalent of testing every cell with pd.isnull.
columns_null_values = ['artist', 'category', 'designer', 'publisher']
df[columns_null_values] = df[columns_null_values].fillna('unknown')

# 'expansion' is reduced to a flag: 'no' where the value is missing, 'yes' otherwise.
df['expansion'] = df['expansion'].isna().map({True: 'no', False: 'yes'})

# Remove games whose minimum player count or minimum playtime is recorded as 0.
zero_value_ids = df.index[(df['min_players'] == 0) | (df['min_playtime'] == 0)]
df.drop(index=zero_value_ids, inplace=True)

# Keep only the first row for each game name.
df.drop_duplicates(subset=['name'], keep='first', inplace=True)


#Having the average rating and the number of users that have rated the game
#we create a Weighted Average variable so we can compare game ratings more fairly

# Prior used by calculate_weighted_average. Both names must exist before the
# function is first called, otherwise this cell raises NameError on a fresh
# kernel (Restart & Run All).
base_rating = df['average_rating'].mean()  # mean rating over the cleaned games
base_votes = 500                           # weight (in votes) given to base_rating


def calculate_weighted_average(row):
    """Blend a game's own average rating with the overall mean rating.

    Computes
        (average_rating * users_rated + base_rating * base_votes)
        / (users_rated + base_votes)
    so a game with few votes is pulled toward ``base_rating`` while a game
    with many votes keeps a value close to its own average.

    Parameters
    ----------
    row : mapping with 'average_rating' and 'users_rated'
        A single row (Series) or a whole DataFrame; only ``[]`` lookups and
        arithmetic are used, so both work.

    Returns
    -------
    The weighted average: a scalar for a single row, a Series for a DataFrame.

    Notes
    -----
    Reads the module-level ``base_rating`` and ``base_votes``.
    """
    avg_rating = row['average_rating']
    num_votes = row['users_rated']
    return (avg_rating * num_votes + base_rating * base_votes) / (num_votes + base_votes)


# Pass the whole frame so the arithmetic runs column-wise in one step
# instead of calling the function once per row.
df['WeightedAverage'] = calculate_weighted_average(df)


In [19]:
# Load the video-game table and clean it in one pass:
#   - drop rows missing 'Name' or 'Year_of_Release'
#   - keep one row per (Name, Platform, Year_of_Release)
#   - store the release year as int64
df1 = (
    pd.read_csv("./data/video_games.csv")
    .dropna(subset=['Name', 'Year_of_Release'])
    .drop_duplicates(subset=['Name', 'Platform', 'Year_of_Release'])
    .astype({'Year_of_Release': 'int64'})
)

# Disabled: replace nulls in the remaining descriptive/score columns with 'unknown'.
#columns_null_values_df1 = ['Publisher', 'Developer','Rating','Critic_Score','Critic_Count','User_Score','User_Count']
#df1[columns_null_values_df1] = df1[columns_null_values_df1].apply(lambda columns: columns.apply(lambda x: 'unknown' if pd.isnull(x) else x))

# Globals read by calculate_weighted_average (board games, not video games).
# NOTE(review): they are assigned here, after the cell that calls the
# function; they belong with the board-game cleaning, before that call.
base_rating = df['average_rating'].mean()
base_votes = 500


In [20]:
# Export the cleaned board games.
# NOTE(review): game_id was moved into the index earlier, so index=False
# leaves it out of the exported file — confirm that is intended.
df.to_csv(
    './data/cleaned_board_games.csv',
    index=False,
    encoding='utf-8',
)

In [21]:
# Export the cleaned video games without the row index.
df1.to_csv(
    './data/cleaned_video_games.csv',
    index=False,
    encoding='utf-8',
)