In [1]:
# Import pandas (data handling) and re (regular expressions)
import pandas as pd
import re

# Steam_store_data Load, transform, clean

In [2]:
#### Read in and preview the Steam store data
steam_store_raw_df = pd.read_csv('original_datasets/steam.csv')
print(steam_store_raw_df.shape)
steam_store_raw_df.head()

(27075, 18)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [3]:
# Strip a stray whitespace character from every column header; it was found in the
# 'categories' header (presumably a tab -- TODO confirm against the raw CSV)
steam_store_raw_df.columns=steam_store_raw_df.columns.str.replace('	','')

# Drop the 'categories', 'genres', 'steamspy_tags' and 'platforms' columns.
# Presumably the separately loaded steamspy tag dataset gives a more detailed account
# of the tag information; no reason is recorded for dropping 'platforms' -- TODO confirm.
steam_store_raw_df = steam_store_raw_df.drop(['categories','genres','steamspy_tags','platforms'], axis=1)

steam_store_raw_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,0,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,0,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,0,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,0,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,0,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
# Normalise the Steam game names into plain lower-case ASCII words separated by
# single spaces, so they can later be compared with the Twitch names.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently turn the regex steps into no-ops.
steam_store_raw_df['name'] = (
    steam_store_raw_df['name']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
    # Remove everything that is neither a word character nor a space
    .str.replace(r'[^\w ]+', '', regex=True)
    # Collapse every run of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading and trailing spaces
    .str.strip(' ')
)

In [5]:
# Regex -> replacement map that turns a trailing roman numeral (2-14) into digits.
# Every pattern is anchored to the end of the name and requires a leading space, and
# every replacement keeps that space so the number stays a separate word.
# Fixes: ' iii$' was listed twice (mapping "iii" to 4 and leaving "iv" unhandled), and
# the replacements for 11-14 were missing the leading space.
rom_to_int_dict = {
    r' ii$': ' 2',
    r' iii$': ' 3',
    r' iv$': ' 4',
    r' v$': ' 5',
    r' vi$': ' 6',
    r' vii$': ' 7',
    r' viii$': ' 8',
    r' ix$': ' 9',
    r' x$': ' 10',
    r' xi$': ' 11',
    r' xii$': ' 12',
    r' xiii$': ' 13',
    r' xiv$': ' 14',
}

In [6]:
# Roman numeral to integer conversion: each key of rom_to_int_dict is applied as a
# regular expression to the cleaned Steam names
steam_store_raw_df['name'] = steam_store_raw_df['name'].replace(rom_to_int_dict, regex=True)

In [7]:
# Substrings that block matching between steam and twitch and are therefore removed
# from the names. The entries are later joined with '|' into one regex alternation.
_remove_candidates = (
    ' hd', ' remastered', ' directors cut edition', ' game of the year enhanced',
    ' aot wings of freedom', ' aot 2', ' expansion', ' hd edition',
    ' board game with online multiplayer', ' gold edition', ' game of the year edition', ' creative survival',
    ' ultimate edition', ' complete edition', ' special edition', ' enhanced edition', ' definitive edition',
    ' windows edition', ' redux', ' classic hd', ' millennium edition', ' deluxe edition', ' steam edition',
    ' directors cut', ' the telltale series', ' a realm reborn', ' night of the raven', ' a telltale game series',
    '00db', ' bfg edition', ' sol', ' guacamelee super turbo championship edition', ' lightspeed edition',
    ' the complete edition', ' a criterion game', ' ultimate hd edition', ' hd remastered',
    ' warlord edition', ' arcade edition',
)

# A regex alternation takes the first alternative that matches at a position, so a
# short entry listed before a longer one with the same prefix (' hd' before
# ' hd edition') made the longer one unreachable and left ' edition' behind.
# Ordering longest-first (ties alphabetical, duplicates removed) avoids that.
# NOTE(review): entries match as raw substrings with no word boundary, so ' sol' also
# matches inside e.g. ' solid' -- confirm this is intended.
remove_list = tuple(sorted(set(_remove_candidates), key=lambda entry: (-len(entry), entry)))

In [8]:
# Remove all unwanted values from the remove_list. The entries are joined into a
# single regex alternation, so regex=True must be explicit: pandas >= 2.0 would
# otherwise search for the whole 'a|b|c' text literally and remove nothing.
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace('|'.join(remove_list), '', regex=True)

In [9]:
# Name fragments that block matching between steam and twitch, mapped to their
# replacement. Keys are applied as regular expressions to names that have already
# been lower-cased and stripped of punctuation, so keys must not contain punctuation.
# Fixes: 'eternal return: black survival' contained a colon and could never match the
# cleaned names; the duplicated 'puyo puyotetris' entry is listed once.
# NOTE(review): ' remastered' is removed from the names before this dictionary is
# applied, so the 'x x2 remastered' key looks unreachable -- confirm.
replace_dict = {
    'catherine classic': 'catherine',
    'creativedestruction': 'creative destruction',
    'eternal return black survival': 'eternal return',
    'heroes of might and magic 3 the shadow of death': 'heroes of might and magic 3',
    'orcs must die unchained': 'orcs must die',
    'puyo puyotetris': 'puyo puyo tetris',
    'life is strange episode 1': 'life is strange',
    'resident evil 4 biohazard 4': 'resident evil 4',
    'resident evil 2 biohazard re2': 'resident evil 2',
    'resident evil 6 biohazard 6': 'resident evil 6',
    'resident evil 7 biohazard 7': 'resident evil 7 biohazard',
    'rollercoaster tycoon 2 triple thrill pack': 'rollercoaster tycoon 2',
    'romance of the three kingdoms 13 13': 'romance of the three kingdoms 13',
    'shadowverse ccg': 'shadowverse',
    'soulworker anime action mmo': 'soulworker',
    'super seducer how to talk': 'super seducer',
    'the bards tale 5 barrows deep': 'the bards tale 5',
    'x x2 remastered': 'xx2 remastered',
    'hitman 2 silent assassin': 'hitman 2',
    'hitman episode 3 marrakesh': 'hitman 3',
    'la noire the': 'la noire',
    'life is strange before the storm episode 1 awake': 'life is strange before the storm',
    'microsoft flight simulator 2004 a century of flight': 'microsoft flight simulator',
    'sid meiers civilization vi rise fall': 'sid meiers civilization vi rise and fall',
}

In [10]:
# Replace every occurrence of a replace_dict key (applied as a regular expression)
# in the Steam names with the corresponding value
steam_store_raw_df['name'] = steam_store_raw_df['name'].replace(replace_dict, regex=True)

In [11]:
# Rows whose cleaned 'name' repeats an earlier row, ordered by name for inspection
dup_rows_steam = (
    steam_store_raw_df
    .loc[steam_store_raw_df.duplicated(subset=['name'])]
    .sort_values(by='name')
)
dup_rows_steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
22155,855670,,2018-06-01,1,Wu Zhang,Wu Zhang,0,0,2,2,0,0,0-20000,2.09
21395,832520,,2018-07-30,0,Winturn Games,Winturn Games,0,0,15,5,0,0,0-20000,2.09
25409,970530,,2018-11-27,0,DoubleC Games,DoubleC Games,0,0,11,2,0,0,0-20000,1.69
21123,822470,,2018-09-25,0,Smm,Smm,0,0,13,3,0,0,0-20000,0.79
25475,973560,,2018-11-23,0,且御龙行,且御龙行,0,0,4,4,0,0,0-20000,15.49


In [12]:
# Keep only the first row for each cleaned name. The preview above shows games whose
# name is empty after cleaning.
# NOTE(review): only the first five duplicates were previewed -- confirm that no
# distinct titles collapse to the same cleaned name.
steam_store_raw_df = steam_store_raw_df.drop_duplicates(subset="name")
print(steam_store_raw_df.shape)

(26859, 14)


In [13]:
# Prefix every Steam store column with 'st_' so its origin is easy to identify downstream
steam_store_raw_df = steam_store_raw_df.add_prefix('st_')

# twitch_raw_df Load, transform, clean

In [14]:
#### Read in the Twitch game data and print its shape
twitch_raw_df = pd.read_csv('original_datasets/Twitch_game_data.csv')
print(twitch_raw_df.shape)

(12750, 12)


In [15]:
# Make all names lower case
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.lower()
# Remove all non-ASCII characters. regex=True is explicit because pandas >= 2.0
# treats the pattern as a literal string by default, which would make this a no-op.
# (Both steps are repeated at the start of the next cell.)
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r'[^\x00-\x7f]', '', regex=True)

In [16]:
# Normalise the Twitch game names with the same steps used for the Steam names.
# regex=True is passed explicitly on every str.replace: pandas >= 2.0 treats the
# pattern as a literal string by default, which would silently disable these steps.
twitch_raw_df['Game'] = (
    twitch_raw_df['Game']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
    # Remove everything that is neither a word character nor a space
    .str.replace(r'[^\w ]+', '', regex=True)
    # Collapse every run of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading and trailing spaces
    .str.strip(' ')
)

# Roman numeral to integer conversion
twitch_raw_df['Game'] = twitch_raw_df['Game'].replace(rom_to_int_dict, regex=True)
# Remove all unwanted values from the remove_list (joined into one regex alternation)
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace('|'.join(remove_list), '', regex=True)
# Replace all keys from the replace_dict with their values within the dataframe
twitch_raw_df['Game'] = twitch_raw_df['Game'].replace(replace_dict, regex=True)

In [17]:
# Prefix every Twitch column with 'tw_' so its origin is easy to identify downstream
twitch_raw_df = twitch_raw_df.add_prefix('tw_')

# steamspy_tag Load, transform, clean

In [18]:
# Read in the steamspy tag data and print its shape
steamspy_tag_raw_df = pd.read_csv('original_datasets/steamspy_tag_data.csv')
print(steamspy_tag_raw_df.shape)
# Prefix every column with 'st_spy_' so its origin is easy to identify downstream
steamspy_tag_raw_df = steamspy_tag_raw_df.add_prefix('st_spy_')

(29022, 372)


In [19]:
# Preview the prefixed steamspy tag columns
steamspy_tag_raw_df.head()

Unnamed: 0,st_spy_appid,st_spy_1980s,st_spy_1990s,st_spy_2.5d,st_spy_2d,st_spy_2d_fighter,st_spy_360_video,st_spy_3d,st_spy_3d_platformer,st_spy_3d_vision,...,st_spy_warhammer_40k,st_spy_web_publishing,st_spy_werewolves,st_spy_western,st_spy_word_game,st_spy_world_war_i,st_spy_world_war_ii,st_spy_wrestling,st_spy_zombies,st_spy_e_sports
0,10,144,564,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,550
1,20,0,71,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,122,0,0,0
3,40,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50,0,77,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Give the Steam and Twitch tables the same name column, 'Game'
steam_store_raw_df = steam_store_raw_df.rename(columns={'st_name': 'Game'})
twitch_raw_df = twitch_raw_df.rename(columns={'tw_Game': 'Game'})

In [21]:
# Give the Steam store and steamspy tag tables the same key column, 'appid'
steam_store_raw_df = steam_store_raw_df.rename(columns={'st_appid': 'appid'})
steamspy_tag_raw_df = steamspy_tag_raw_df.rename(columns={'st_spy_appid': 'appid'})

# FINAL MERGE

In [22]:
# Collect the two Steam tables that share the 'appid' key.
# NOTE(review): this list is not used by the merge in the next cell -- confirm whether
# it is still needed.
steam_dfs = [steam_store_raw_df, steamspy_tag_raw_df]

In [23]:
# Join the Steam store table with the steamspy tag table on the shared 'appid' key
# (merge performs an inner join by default)
from functools import reduce
steam_merge_df = steam_store_raw_df.merge(steamspy_tag_raw_df, on='appid')

In [24]:
# Show the (rows, columns) of the merged table
print(steam_merge_df.shape)

(26859, 385)


In [25]:
# Show the shape again and preview the first rows of the merged table
print(steam_merge_df.shape)
steam_merge_df.head()

(26859, 385)


Unnamed: 0,appid,Game,st_release_date,st_english,st_developer,st_publisher,st_required_age,st_achievements,st_positive_ratings,st_negative_ratings,...,st_spy_warhammer_40k,st_spy_web_publishing,st_spy_werewolves,st_spy_western,st_spy_word_game,st_spy_world_war_i,st_spy_world_war_ii,st_spy_wrestling,st_spy_zombies,st_spy_e_sports
0,10,counterstrike,2000-11-01,1,Valve,Valve,0,0,124534,3339,...,0,0,0,0,0,0,0,0,0,550
1,20,team fortress classic,1999-04-01,1,Valve,Valve,0,0,3318,633,...,0,0,0,0,0,0,0,0,0,0
2,30,day of defeat,2003-05-01,1,Valve,Valve,0,0,3416,398,...,0,0,0,0,0,5,122,0,0,0
3,40,deathmatch classic,2001-06-01,1,Valve,Valve,0,0,1273,267,...,0,0,0,0,0,0,0,0,0,0
4,50,halflife opposing force,1999-11-01,1,Gearbox Software,Valve,0,0,5250,288,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Collect all column names of the merged table in a list, so a later loop can visit
# each column
column_names_list = steam_merge_df.columns.to_list()

In [31]:
# Accumulator for the names of the columns selected for dropping later
li = []


def columns_ready_to_drop(col_name, df=None, threshold=0.20):
    """Record ``col_name`` in ``li`` when the column is int64 and mostly non-positive.

    A column qualifies when its dtype is int64 and the number of rows holding a
    value greater than 0 is at most ``threshold`` times the number of rows. With the
    default 0.20, a column is recorded if 20% or fewer of its rows are positive;
    raising the threshold records (and later drops) more columns.

    Parameters
    ----------
    col_name : str
        Name of the column to inspect.
    df : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level ``steam_merge_df``.
    threshold : float, optional
        Largest share of positive rows for which the column is still recorded.

    Returns
    -------
    list
        The shared accumulator ``li``; a name is added at most once, so re-running
        the calling loop does not create duplicates.
    """
    frame = steam_merge_df if df is None else df
    column = frame[col_name]

    # Only int64 columns are considered; everything else is ignored.
    if column.dtypes != "int64":
        return li

    # Vectorised count of rows with a value above zero (no per-row Python lambda).
    positive_rows = int((column > 0).sum())

    if positive_rows <= threshold * len(frame) and col_name not in li:
        li.append(col_name)
    return li

In [32]:
# Run the check on every column of the merged table; qualifying names accumulate in li
for col in column_names_list:
    columns_ready_to_drop(col)
    
# Show the collected column names
print(li)

['st_required_age', 'st_spy_1980s', 'st_spy_1990s', 'st_spy_2.5d', 'st_spy_2d', 'st_spy_2d_fighter', 'st_spy_360_video', 'st_spy_3d', 'st_spy_3d_platformer', 'st_spy_3d_vision', 'st_spy_4_player_local', 'st_spy_4x', 'st_spy_6dof', 'st_spy_atv', 'st_spy_abstract', 'st_spy_action_rpg', 'st_spy_action_adventure', 'st_spy_addictive', 'st_spy_agriculture', 'st_spy_aliens', 'st_spy_alternate_history', 'st_spy_america', 'st_spy_animation_&_modeling', 'st_spy_anime', 'st_spy_arcade', 'st_spy_arena_shooter', 'st_spy_artificial_intelligence', 'st_spy_assassin', 'st_spy_asynchronous_multiplayer', 'st_spy_atmospheric', 'st_spy_audio_production', 'st_spy_bmx', 'st_spy_base_building', 'st_spy_baseball', 'st_spy_based_on_a_novel', 'st_spy_basketball', 'st_spy_batman', 'st_spy_battle_royale', 'st_spy_beat_em_up', 'st_spy_beautiful', 'st_spy_benchmark', 'st_spy_bikes', 'st_spy_blood', 'st_spy_board_game', 'st_spy_bowling', 'st_spy_building', 'st_spy_bullet_hell', 'st_spy_bullet_time', 'st_spy_crpg', 's

In [33]:
# Drop every column collected in li, i.e. the int64 columns in which at most 20% of
# the rows hold a value greater than 0.
# Cut-offs of 80%, 50% and 45% non-zero rows were tried but dropped too many columns,
# so 20% is used for now; this can be changed.
steam_merge_df = steam_merge_df.drop(columns=li)
steam_merge_df

Unnamed: 0,appid,Game,st_release_date,st_english,st_developer,st_publisher,st_achievements,st_positive_ratings,st_negative_ratings,st_average_playtime,st_median_playtime,st_owners,st_price,st_spy_action,st_spy_adventure,st_spy_casual,st_spy_indie,st_spy_simulation,st_spy_singleplayer,st_spy_strategy
0,10,counterstrike,2000-11-01,1,Valve,Valve,0,124534,3339,17612,317,10000000-20000000,7.19,2681,0,0,0,0,0,329
1,20,team fortress classic,1999-04-01,1,Valve,Valve,0,3318,633,277,62,5000000-10000000,3.99,208,15,0,0,0,0,0
2,30,day of defeat,2003-05-01,1,Valve,Valve,0,3416,398,187,34,5000000-10000000,3.99,99,0,0,0,0,16,0
3,40,deathmatch classic,2001-06-01,1,Valve,Valve,0,1273,267,258,184,5000000-10000000,3.99,85,0,0,0,0,0,0
4,50,halflife opposing force,1999-11-01,1,Gearbox Software,Valve,0,5250,288,624,415,5000000-10000000,3.99,211,87,0,0,0,148,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26854,1065230,room of pandora,2019-04-24,1,SHEN JIAWEI,SHEN JIAWEI,7,3,0,0,0,0-20000,2.09,0,21,21,21,0,0,0
26855,1065570,cyber gun,2019-04-23,1,Semyon Maximov,BekkerDev Studio,0,8,1,0,0,0-20000,1.69,21,20,0,21,0,0,0
26856,1065650,super star blast,2019-04-24,1,EntwicklerX,EntwicklerX,24,0,1,0,0,0-20000,3.99,21,0,21,21,0,0,0
26857,1066700,new yankee 7 deer hunters,2019-04-17,1,Yustas Game Studio,Alawar Entertainment,0,2,0,0,0,0-20000,5.19,0,20,21,21,0,0,0


In [34]:
# Show the final shape and write the reduced table to CSV for the ML model.
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default -- confirm the downstream reader expects that.
print(steam_merge_df.shape)
steam_merge_df.to_csv('for_ml_model.csv')

(26859, 20)
