In [1]:
# Import pandas (data handling) and re (regular expressions)
import pandas as pd
import re

# Steam_store_data Load, transform, clean

In [2]:
# Load the raw Steam store export, then show its (rows, columns) shape and
# the first few records as a quick sanity check.
steam_store_raw_df = pd.read_csv(filepath_or_buffer='original_datasets/steam.csv')
print(steam_store_raw_df.shape)
steam_store_raw_df.head()

(27075, 18)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [3]:
# A stray whitespace character was found in the 'categories' column header;
# strip it from every header so the column names below match exactly.
steam_store_raw_df.columns = steam_store_raw_df.columns.str.replace('	', '')

# Drop 'categories', 'genres', 'steamspy_tags' and 'platforms'.
# Per the original note, a more detailed account of the tag information is
# available elsewhere (a separate steamspy tag dataset is loaded later on).
redundant_steam_columns = ['categories', 'genres', 'steamspy_tags', 'platforms']
steam_store_raw_df = steam_store_raw_df.drop(columns=redundant_steam_columns)

steam_store_raw_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,0,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,0,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,0,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,0,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,0,0,5250,288,624,415,5000000-10000000,3.99


In [4]:
# Normalise the Steam game names so they can be string-matched against the
# other datasets.
#
# Every pattern below is a regular expression, so regex=True is passed
# explicitly: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently turn these replacements into no-ops.
steam_store_raw_df['name'] = (
    steam_store_raw_df['name']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
    # Remove everything that is not a word character or a space
    .str.replace(r'[^\w ]+', '', regex=True)
    # Collapse runs of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading and trailing spaces
    .str.strip(r' ')
)

In [None]:
# Regex pattern -> replacement used to turn a roman numeral at the END of a
# name (preceded by a space) into digits.
#
# Fix: the original listed r' iii$' twice (the second time mapped to ' 4'),
# so "iii" became 4 and "iv" was never converted.
#
# NOTE(review): the replacements for xi..xiv carry no leading space (e.g.
# 'final fantasy xi' -> 'final fantasy11'). They are kept as-is because the
# Twitch and Metacritic cleaning cells use the same replacements and the merge
# key must agree across datasets -- confirm whether a space was intended.
rom_to_int_dict = {
    r' ii$': ' 2',
    r' iii$': ' 3',
    r' iv$': ' 4',
    r' v$': ' 5',
    r' vi$': ' 6',
    r' vii$': ' 7',
    r' viii$': ' 8',
    r' ix$': ' 9',
    r' x$': ' 10',
    r' xi$': '11',
    r' xii$': '12',
    r' xiii$': '13',
    r' xiv$': '14',
}

In [None]:
# Convert trailing roman numerals in the Steam names to digits by applying
# every pattern -> replacement pair from rom_to_int_dict.
steam_store_raw_df['name'] = steam_store_raw_df['name'].replace(
    to_replace=rom_to_int_dict,
    regex=True,
)

In [None]:
# Substrings that block matching between steam and twitch and are therefore
# stripped from the cleaned names (edition / remaster suffixes and similar).
_remove_candidates = (
    ' hd', ' remastered', ' directors cut edition', ' game of the year enhanced',
    ' aot wings of freedom', ' aot 2', ' remastered', ' expansion', ' hd edition',
    ' board game with online multiplayer', ' gold edition', ' game of the year edition', ' creative survival',
    ' ultimate edition', ' complete edition', ' special edition', ' enhanced edition', ' definitive edition',
    ' windows edition', ' redux', ' classic hd', ' millennium edition', ' deluxe edition', ' steam edition',
    ' directors cut', ' the telltale series', ' a realm reborn', ' night of the raven', ' a telltale game series',
    '00db', ' bfg edition', ' sol', ' guacamelee super turbo championship edition', ' lightspeed edition',
    ' the complete edition', ' a criterion game', ' ultimate hd edition', ' hd remastered',
    ' warlord edition', ' arcade edition', 'beyond the sword',
)

# The entries are joined with '|' into a single regex alternation, which tries
# alternatives in order at each position. With ' hd' listed before
# ' hd edition', a name ending in ' hd edition' only lost ' hd' and kept a
# stray ' edition'. De-duplicate (order-preserving) and sort longest-first so
# the most specific phrase always wins. The sort is stable, hence deterministic.
remove_list = tuple(sorted(dict.fromkeys(_remove_candidates), key=len, reverse=True))

In [None]:
# Remove every phrase in remove_list from the Steam names.
# The phrases are joined with '|' into one regex alternation, so regex=True is
# required: pandas >= 2.0 would otherwise search for the joined text literally
# and remove nothing.
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(
    '|'.join(remove_list), '', regex=True
)

In [None]:
# Pattern -> replacement pairs for names that still block matching between
# steam and twitch after the generic cleaning. Keys are applied as regular
# expressions (the callers pass regex=True) against the already-cleaned names.
#
# Fixes relative to the original:
#   * 'eternal return: black survival' contained ':' -- punctuation is stripped
#     before this mapping is applied, so the key could never match.
#   * 'puyo puyotetris' was listed twice; the duplicate is removed.
#
# NOTE(review): 'x x2 remastered' probably never matches either, because
# ' remastered' is removed from the names before this mapping is applied --
# confirm the intended key.
replace_dict = {
    'catherine classic': 'catherine',
    'creativedestruction': 'creative destruction',
    'eternal return black survival': 'eternal return',
    'heroes of might and magic 3 the shadow of death': 'heroes of might and magic 3',
    'orcs must die unchained': 'orcs must die',
    'puyo puyotetris': 'puyo puyo tetris',
    'life is strange episode 1': 'life is strange',
    'resident evil 4 biohazard 4': 'resident evil 4',
    'resident evil 2 biohazard re2': 'resident evil 2',
    'resident evil 6 biohazard 6': 'resident evil 6',
    'resident evil 7 biohazard 7': 'resident evil 7 biohazard',
    'rollercoaster tycoon 2 triple thrill pack': 'rollercoaster tycoon 2',
    'romance of the three kingdoms 13 13': 'romance of the three kingdoms 13',
    'shadowverse ccg': 'shadowverse',
    'soulworker anime action mmo': 'soulworker',
    'super seducer how to talk': 'super seducer',
    'the bards tale 5 barrows deep': 'the bards tale 5',
    'x x2 remastered': 'xx2 remastered',
    'hitman 2 silent assassin': 'hitman 2',
    'hitman episode 3 marrakesh': 'hitman 3',
    'la noire the': 'la noire',
    'life is strange before the storm episode 1 awake': 'life is strange before the storm',
    'microsoft flight simulator 2004 a century of flight': 'microsoft flight simulator',
    'sid meiers civilization vi rise fall': 'sid meiers civilization vi rise and fall',
}

In [None]:
# Apply the hand-curated replace_dict fixes to the Steam names
# (keys are treated as regular expressions).
steam_store_raw_df['name'] = steam_store_raw_df['name'].replace(
    to_replace=replace_dict,
    regex=True,
)

In [None]:
# Rows whose cleaned 'name' repeats that of an earlier row, ordered by name
# so repeated titles sit next to each other in the preview.
dup_rows_steam = (
    steam_store_raw_df
    .loc[steam_store_raw_df.duplicated(subset=['name'])]
    .sort_values(by=['name'])
)
dup_rows_steam.head()

In [None]:
# Keep only the first row for each cleaned name.
# (Original note: the only duplicates found above are unnamed games.)
steam_store_raw_df = steam_store_raw_df.drop_duplicates(subset='name')
print(steam_store_raw_df.shape)

In [None]:
# Tag every Steam column with 'st_' so its origin stays identifiable after
# the datasets are merged.
# Note: re-running this cell prefixes the columns again.
steam_store_raw_df = steam_store_raw_df.add_prefix(prefix='st_')

# twitch_raw_df Load, transform, clean

In [None]:
# Load the raw Twitch game dataset and report its (rows, columns) shape.
twitch_raw_df = pd.read_csv(filepath_or_buffer='original_datasets/Twitch_game_data.csv')
print(twitch_raw_df.shape)

In [None]:
# First-pass normalisation of the Twitch game titles. The following cell
# repeats these two steps as part of the full cleaning sequence.
twitch_raw_df['Game'] = (
    twitch_raw_df['Game']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters. regex=True is explicit because
    # pandas >= 2.0 would otherwise treat the pattern as literal text.
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
)

In [None]:
# Normalise the Twitch game titles with the same steps used for the Steam
# names so the two can be merged on the cleaned string.
#
# All patterns are regular expressions, so regex=True is passed explicitly:
# pandas >= 2.0 defaults str.replace to literal matching, which would turn
# every replacement below into a silent no-op.
twitch_raw_df['Game'] = (
    twitch_raw_df['Game']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
    # Remove everything that is not a word character or a space
    .str.replace(r'[^\w ]+', '', regex=True)
    # Collapse runs of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading and trailing spaces
    .str.strip(r' ')
)

# Convert a roman numeral at the end of the title into digits.
# NOTE(review): the replacements for xi..xiv carry no leading space (e.g.
# 'final fantasy xi' -> 'final fantasy11'); kept as-is because the Steam and
# Metacritic cleaning use the same replacements -- confirm the intent.
twitch_roman_numerals = (
    (r' ii$', ' 2'),
    (r' iii$', ' 3'),
    (r' iv$', ' 4'),
    (r' v$', ' 5'),
    (r' vi$', ' 6'),
    (r' vii$', ' 7'),
    (r' viii$', ' 8'),
    (r' ix$', ' 9'),
    (r' x$', ' 10'),
    (r' xi$', '11'),
    (r' xii$', '12'),
    (r' xiii$', '13'),
    (r' xiv$', '14'),
)
for numeral_pattern, numeral_digits in twitch_roman_numerals:
    twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(
        numeral_pattern, numeral_digits, regex=True
    )

# Remove all unwanted phrases from remove_list (joined into one alternation)
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(
    '|'.join(remove_list), '', regex=True
)

# Replace all keys from replace_dict with their values within the dataframe
twitch_raw_df['Game'] = twitch_raw_df['Game'].replace(replace_dict, regex=True)

In [None]:
# Tag every Twitch column with 'tw_' so its origin stays identifiable after
# the datasets are merged.
# Note: re-running this cell prefixes the columns again.
twitch_raw_df = twitch_raw_df.add_prefix(prefix='tw_')

# metacritic_raw_df Load, transform, clean

In [None]:
# Load the raw Metacritic game info, then show its (rows, columns) shape and
# the first few records.
metacritic_raw_df = pd.read_csv(filepath_or_buffer='original_datasets/metacritic_game_info.csv')
print(metacritic_raw_df.shape)
metacritic_raw_df.head()

In [None]:
# Only keep PC game review records.
# .copy() makes the filtered frame an independent object: later cells assign
# into its 'Title' column, which on a boolean-mask slice of the original frame
# raises pandas' SettingWithCopyWarning.
is_pc_review = metacritic_raw_df['Platform'] == 'PC'
metacritic_raw_df = metacritic_raw_df[is_pc_review].copy()
print(metacritic_raw_df.shape)

In [None]:
# Normalise the Metacritic titles with the same steps used for the Steam and
# Twitch names so all three can be merged on the cleaned string.
#
# All patterns are regular expressions, so regex=True is passed explicitly:
# pandas >= 2.0 defaults str.replace to literal matching, which would turn
# every replacement below into a silent no-op.
metacritic_raw_df['Title'] = (
    metacritic_raw_df['Title']
    # Make all names lower case
    .str.lower()
    # Remove all non-ASCII characters
    .str.replace(r'[^\x00-\x7f]', '', regex=True)
    # Remove everything that is not a word character or a space
    .str.replace(r'[^\w ]+', '', regex=True)
    # Collapse runs of whitespace into a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Remove leading and trailing spaces
    .str.strip(r' ')
)

# Convert a roman numeral at the end of the title into digits.
# NOTE(review): the replacements for xi..xiv carry no leading space (e.g.
# 'final fantasy xi' -> 'final fantasy11'); kept as-is because the Steam and
# Twitch cleaning use the same replacements -- confirm the intent.
metacritic_roman_numerals = (
    (r' ii$', ' 2'),
    (r' iii$', ' 3'),
    (r' iv$', ' 4'),
    (r' v$', ' 5'),
    (r' vi$', ' 6'),
    (r' vii$', ' 7'),
    (r' viii$', ' 8'),
    (r' ix$', ' 9'),
    (r' x$', ' 10'),
    (r' xi$', '11'),
    (r' xii$', '12'),
    (r' xiii$', '13'),
    (r' xiv$', '14'),
)
for numeral_pattern, numeral_digits in metacritic_roman_numerals:
    metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(
        numeral_pattern, numeral_digits, regex=True
    )

# Remove all unwanted phrases from remove_list (joined into one alternation)
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(
    '|'.join(remove_list), '', regex=True
)

# Replace all keys from replace_dict with their values within the dataframe
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].replace(replace_dict, regex=True)

In [None]:
# Drop the Metacritic columns that are not carried into the merged dataset,
# then preview what is left.
redundant_metacritic_columns = ['Publisher', 'Genre', 'Platform', 'No_Players', 'Unnamed: 0']
metacritic_raw_df = metacritic_raw_df.drop(columns=redundant_metacritic_columns)
metacritic_raw_df.head()

In [None]:
# Tag every Metacritic column with 'meta_' so its origin stays identifiable
# after the datasets are merged.
# Note: re-running this cell prefixes the columns again.
metacritic_raw_df = metacritic_raw_df.add_prefix(prefix='meta_')

# steamspy_tag Load, transform, clean

In [None]:
# Load the steamspy tag dataset and report its (rows, columns) shape.
steamspy_tag_raw_df = pd.read_csv(filepath_or_buffer='original_datasets/steamspy_tag_data.csv')
print(steamspy_tag_raw_df.shape)
# Tag every column with 'st_spy_' so its origin stays identifiable downstream.
steamspy_tag_raw_df = steamspy_tag_raw_df.add_prefix(prefix='st_spy_')

In [None]:
# Give the cleaned-name column the same label, 'Game', in all three frames so
# they can be merged on it.
steam_store_raw_df = steam_store_raw_df.rename(columns={'st_name': 'Game'})
metacritic_raw_df = metacritic_raw_df.rename(columns={'meta_Title': 'Game'})
twitch_raw_df = twitch_raw_df.rename(columns={'tw_Game': 'Game'})

# FINAL MERGE

In [None]:
# Frames to be joined on the cleaned 'Game' string, in merge order.
str_join_dfs = [
    steam_store_raw_df,
    metacritic_raw_df,
    twitch_raw_df,
]

In [None]:
# Fold the frames in str_join_dfs into one by merging them pairwise, left to
# right, on the shared 'Game' column (pandas' default join type).
from functools import reduce


def _merge_on_game(left, right):
    """Merge two frames on their common 'Game' column."""
    return pd.merge(left, right, on='Game')


str_merge_df = reduce(_merge_on_game, str_join_dfs)

In [None]:
# Report the (rows, columns) shape of the frame merged on 'Game'.
print(str_merge_df.shape)

In [None]:
# Attach the steamspy tag columns by matching the Steam app id
# ('st_appid') against the tag dataset's app id ('st_spy_appid').
final_df = str_merge_df.merge(
    steamspy_tag_raw_df,
    left_on='st_appid',
    right_on='st_spy_appid',
)

In [None]:
# Report the (rows, columns) shape of the fully merged frame and preview its
# first records.
print(final_df.shape)
final_df.head()

In [None]:
# List every column name of the merged frame for inspection.
final_column_list = final_df.columns.tolist()
final_column_list

In [None]:
# Identify the columns in which every value equals 0; they are removed below.
is_all_zero_column = final_df.eq(0).all()
zero_only_df = final_df.loc[:, is_all_zero_column]
zero_only_list = zero_only_df.columns.tolist()

In [None]:
# Report the (rows, columns) shape of the all-zero columns selected above;
# the second number is how many columns will be dropped.
print(zero_only_df.shape)

In [None]:
# Drop the all-zero columns collected in zero_only_list, then report the new
# shape and preview the result.
final_df = final_df.drop(columns=zero_only_list)
print(final_df.shape)
final_df.head()

In [None]:
# Write the final merged frame to 'for_ml_model.csv' in the working directory.
# NOTE(review): the row index is written as well (pandas default), so the file
# gains an unnamed leading column when read back -- confirm this is wanted.
final_df.to_csv('for_ml_model.csv')