In [1]:
# Import the requests library
import pandas as pd
import re

# Steam_store_data Load, transform, clean

In [2]:
#### Read in and preview Gamespot Reviews
steam_store_raw_df = pd.read_csv('original_datasets/steam.csv')
print(steam_store_raw_df.shape)
steam_store_raw_df.head()

(27075, 18)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [3]:
# Count number of unique reviews
len(steam_store_raw_df.name.unique())


27033

In [4]:
# Special character in 'categories	' found in column header
steam_store_raw_df.columns=steam_store_raw_df.columns.str.replace('	','')

# Drop the 'categories', 'genres', and 'steamspy_tags' columns. 
# We have a more detailed account of the same information in the 'steamspy_tags' column
steam_store_raw_df = steam_store_raw_df.drop(['categories','genres','steamspy_tags'], axis=1)
steam_store_raw_df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,0,5250,288,624,415,5000000-10000000,3.99


In [5]:
# Make all names lower case
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.lower()
# Remove all special characters
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r'[^\x00-\x7f]', '')
# Remove all non-alpha-numeric
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r'[^\w ]+', '')
# Remove all duplicate whitespace
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r'\s+', ' ')
# Remove leading and trailing whitespace
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.strip(r' ')

steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' ii$', ' 2')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' iii$', ' 3')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' iii$', ' 4')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' v$', ' 5')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' vi$', ' 6')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' vii$', ' 7')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' viii$', ' 8')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' ix$', ' 9')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' x$', ' 10')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' xi$', '11')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' xii$', '12')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' xiii$', '13')
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace(r' xiv$', '14')

In [6]:
# Create a list of values that block matching between steam and twitch for removal
remove_list = (' hd', ' remastered', ' directors cut edition', ' game of the year enhanced', 
               ' aot wings of freedom', ' aot 2', ' remastered', ' expansion', ' hd edition',
              ' board game with online multiplayer', ' gold edition', ' game of the year edition',' creative survival',
              ' ultimate edition',' complete edition',' special edition',' enhanced edition',' definitive edition',
              ' windows edition',' redux',' classic hd',' millennium edition',' deluxe edition',' steam edition',
              ' directors cut', ' the telltale series', ' a realm reborn', ' night of the raven', ' a telltale game series',
              '00db', ' bfg edition', ' sol', ' guacamelee super turbo championship edition', ' lightspeed edition',
              ' the complete edition', ' a criterion game', ' ultimate hd edition', ' hd remastered',
              ' warlord edition', ' arcade edition')

In [7]:
# Remove all unwanted values from the remove_list
steam_store_raw_df['name'] = steam_store_raw_df['name'].str.replace('|'.join(remove_list), '')

In [8]:
# Create a dictionary of values that block matching between steam and twitch for replacement
replace_dict = {'catherine classic': 'catherine', 
                'creativedestruction': 'creative destruction', 
                'eternal return: black survival': 'eternal return',
                'heroes of might and magic 3 the shadow of death': 'heroes of might and magic 3',
                'orcs must die unchained': 'orcs must die',
                'puyo puyotetris': 'puyo puyo tetris',
                'life is strange episode 1': 'life is strange',
                'resident evil 4 biohazard 4': 'resident evil 4',
                'resident evil 2 biohazard re2': 'resident evil 2',
                'resident evil 6 biohazard 6': 'resident evil 6',
                'resident evil 7 biohazard 7': 'resident evil 7 biohazard',
                'rollercoaster tycoon 2 triple thrill pack': 'rollercoaster tycoon 2',
                'romance of the three kingdoms 13 13': 'romance of the three kingdoms 13',
                'shadowverse ccg': 'shadowverse',
                'soulworker anime action mmo': 'soulworker',
                'super seducer how to talk': 'super seducer',
                'the bards tale 5 barrows deep': 'the bards tale 5',
                'puyo puyotetris': 'puyo puyo tetris',
                'x x2 remastered': 'xx2 remastered',
                'hitman 2 silent assassin': 'hitman 2',
                'hitman episode 3 marrakesh': 'hitman 3',
                'la noire the': 'la noire',
                'life is strange before the storm episode 1 awake': 'life is strange before the storm',
               'microsoft flight simulator 2004 a century of flight': 'microsoft flight simulator',
               'sid meiers civilization vi rise fall': 'sid meiers civilization vi rise and fall'}

In [9]:
# Replace all keys from the replace_dict with their values within the dataframe
steam_store_raw_df['name'] = steam_store_raw_df['name'].replace(replace_dict, regex=True)

In [10]:
# If Contains THEN replace with (when i run this it replaces every value in the name column...thoughts on how to correct?
# I want this code to see that 'gothic 2' is contained in a string and, if it is, to replace the entire string with 'gothic 2')

steam_store_raw_df.name = steam_store_raw_df.name.apply(lambda x: 'gothic 2' if 'gothic 2' in x else x)

# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*gothic 2.*$)', 'gothic 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*shoppe keep 2.*$)', 'shoppe keep 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*super seducer 2.*$)', 'super seducer 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*ys 8 lacrimosa of dana.*$)', 'ys 8 lacrimosa of dana')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*age of empires ii.*$)', 'age of empires ii')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*bionic commando.*$)', 'bionic commando')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*blazblue.*$)', 'blazblue')

In [11]:
steam_store_raw_df

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,counterstrike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,team fortress classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,0,3318,633,277,62,5000000-10000000,3.99
2,30,day of defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,0,3416,398,187,34,5000000-10000000,3.99
3,40,deathmatch classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,0,1273,267,258,184,5000000-10000000,3.99
4,50,halflife opposing force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,0,5250,288,624,415,5000000-10000000,3.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,room of pandora,2019-04-24,1,SHEN JIAWEI,SHEN JIAWEI,windows,0,7,3,0,0,0,0-20000,2.09
27071,1065570,cyber gun,2019-04-23,1,Semyon Maximov,BekkerDev Studio,windows,0,0,8,1,0,0,0-20000,1.69
27072,1065650,super star blast,2019-04-24,1,EntwicklerX,EntwicklerX,windows,0,24,0,1,0,0,0-20000,3.99
27073,1066700,new yankee 7 deer hunters,2019-04-17,1,Yustas Game Studio,Alawar Entertainment,windows;mac,0,0,2,0,0,0,0-20000,5.19


In [12]:
# Remove Redundant columns

In [13]:
# Select duplicate rows in the 'name' column
dup_rows_steam = steam_store_raw_df[steam_store_raw_df.duplicated(['name'])]
dup_rows_steam = dup_rows_steam.sort_values(by=['name'])
dup_rows_steam.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
22155,855670,,2018-06-01,1,Wu Zhang,Wu Zhang,windows,0,0,2,2,0,0,0-20000,2.09
21395,832520,,2018-07-30,0,Winturn Games,Winturn Games,windows,0,0,15,5,0,0,0-20000,2.09
25409,970530,,2018-11-27,0,DoubleC Games,DoubleC Games,windows,0,0,11,2,0,0,0-20000,1.69
21123,822470,,2018-09-25,0,Smm,Smm,windows,0,0,13,3,0,0,0-20000,0.79
25475,973560,,2018-11-23,0,且御龙行,且御龙行,windows;mac,0,0,4,4,0,0,0-20000,15.49


In [14]:
# Drop duplicates (only duplicates above are unnamed games)
steam_store_raw_df.drop_duplicates(subset ="name", inplace = True)
print(steam_store_raw_df.shape)

(26859, 15)


In [15]:
my_list = steam_store_raw_df.columns.values.tolist()
my_list

['appid',
 'name',
 'release_date',
 'english',
 'developer',
 'publisher',
 'platforms',
 'required_age',
 'achievements',
 'positive_ratings',
 'negative_ratings',
 'average_playtime',
 'median_playtime',
 'owners',
 'price']

In [16]:
my_list = steam_store_raw_df.columns.values.tolist()
my_list

['appid',
 'name',
 'release_date',
 'english',
 'developer',
 'publisher',
 'platforms',
 'required_age',
 'achievements',
 'positive_ratings',
 'negative_ratings',
 'average_playtime',
 'median_playtime',
 'owners',
 'price']

# twitch_raw_df Load, transform, clean

In [17]:
#### Read in and preview twitch data
twitch_raw_df = pd.read_csv('original_datasets/Twitch_game_data.csv')
print(twitch_raw_df.shape)

(12750, 12)


In [18]:
# Make all names lower case
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.lower()
# Remove all special characters
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r'[^\x00-\x7f]', '')

In [19]:
# Drop duplicates (dropping duplictes. Duplicated games in this set are minimal in number and marginal games)
twitch_raw_df.drop_duplicates(subset ="Game", keep='last', inplace = True)
print(twitch_raw_df.shape)

(1638, 12)


In [20]:
# Get the data types of each column.
twitch_raw_df.dtypes

Rank                  int64
Game                 object
Month                 int64
Year                  int64
Hours_watched         int64
Hours_Streamed       object
Peak_viewers          int64
Peak_channels         int64
Streamers             int64
Avg_viewers           int64
Avg_channels          int64
Avg_viewer_ratio    float64
dtype: object

In [21]:
twitch_raw_df

Unnamed: 0,Rank,Game,Month,Year,Hours_watched,Hours_Streamed,Peak_viewers,Peak_channels,Streamers,Avg_viewers,Avg_channels,Avg_viewer_ratio
50,101,battleblock theater,1,2016,332256,2041 hours,152739,19,1097,447,2,162.79
55,106,donkey kong country 2: diddy's kong quest,1,2016,316101,993 hours,181740,28,218,425,1,318.33
56,107,kirby 64: the crystal shards,1,2016,315368,209 hours,158893,16,58,424,0,1508.94
62,113,halo 4,1,2016,302218,569 hours,135671,13,71,406,0,531.14
63,114,kirby: squeak squad,1,2016,299628,60 hours,156564,16,28,403,0,4993.80
...,...,...,...,...,...,...,...,...,...,...,...,...
12745,196,cuphead,4,2021,795329,31721 hours,90364,98,7795,1106,44,25.07
12746,197,hades,4,2021,794472,52755 hours,36447,152,8663,1104,73,15.06
12747,198,epic seven,4,2021,778750,18362 hours,11670,47,997,1083,25,42.41
12748,199,pok<u+00e9>mon platinum,4,2021,777305,22380 hours,61580,76,2837,1081,31,34.73


In [22]:
# Make all names lower case
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.lower()
# Remove all special characters
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r'[^\x00-\x7f]', '')
# Remove all non-alpha-numeric
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r'[^\w ]+', '')

# Remove all duplicate whitespace
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r'\s+', ' ')
# Remove leading and trailing whitespace
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.strip(r' ')

# # Convert roman numerals end sentence
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' ii$', ' 2')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' iii$', ' 3')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' iv$', ' 4')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' v$', ' 5')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' vi$', ' 6')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' vii$', ' 7')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' viii$', ' 8')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' ix$', ' 9')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' x$', ' 10')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' xi$', '11')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' xii$', '12')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' xiii$', '13')
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace(r' xiv$', '14')


# Remove all unwanted values from the remove_list
twitch_raw_df['Game'] = twitch_raw_df['Game'].str.replace('|'.join(remove_list), '')

# Replace all keys from the replace_dict with their values within the dataframe
twitch_raw_df['Game'] = twitch_raw_df['Game'].replace(replace_dict, regex=True)

In [23]:
# Remove Redundant columns
twitch_raw_df.dtypes

Rank                  int64
Game                 object
Month                 int64
Year                  int64
Hours_watched         int64
Hours_Streamed       object
Peak_viewers          int64
Peak_channels         int64
Streamers             int64
Avg_viewers           int64
Avg_channels          int64
Avg_viewer_ratio    float64
dtype: object

In [24]:
# Something is different about this dataframe than the others. neither the .loc, str.replace, or lambda functions operate
# correctly here whereas they work in all other tables

# twitch_raw_df.Game = twitch_raw_df.Game.apply(lambda x: 'gothic 2' if 'gothic 2' in x else x)

# twitch_raw_df['Game'] = twitch_raw_df.loc[twitch_raw_df['Game'].str.contains('gothic 2'), 'name'] = 'gothic 2'
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*gothic 2.*$)', 'gothic 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*shoppe keep 2.*$)', 'shoppe keep 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*super seducer 2.*$)', 'super seducer 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*ys 8 lacrimosa of dana.*$)', 'ys 8 lacrimosa of dana')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*age of empires ii.*$)', 'age of empires ii')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*bionic commando.*$)', 'bionic commando')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*blazblue.*$)', 'blazblue')

# metacritic_raw_df Load, transform, clean

In [25]:
# Read in and preview metacritic data
metacritic_raw_df = pd.read_csv('original_datasets/metacritic_game_info.csv')
print(metacritic_raw_df.shape)
metacritic_raw_df.head()

(5000, 9)


Unnamed: 0.1,Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player


In [26]:
# Only keep PC game review records
metacritic_raw_df = metacritic_raw_df[metacritic_raw_df.Platform == 'PC']
print(metacritic_raw_df.shape)

(1381, 9)


In [27]:
# Count number of unique reviews
len(metacritic_raw_df.Title.unique())

1381

In [28]:
# Make all names lower case
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.lower()
# Remove all special characters
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r'[^\x00-\x7f]', '')
# Remove all non-alpha-numeric
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r'[^\w ]+', '')

# Remove all duplicate whitespace
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r'\s+', ' ')
# Remove leading and trailing whitespace
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.strip(r' ')

# # Convert roman numerals end sentence
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' ii$', ' 2')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' iii$', ' 3')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' iv$', ' 4')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' v$', ' 5')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' vi$', ' 6')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' vii$', ' 7')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' viii$', ' 8')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' ix$', ' 9')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' x$', ' 10')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' xi$', '11')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' xii$', '12')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' xiii$', '13')
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace(r' xiv$', '14')


# Remove all unwanted values from the remove_list
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].str.replace('|'.join(remove_list), '')

# Replace all keys from the replace_dict with their values within the dataframe
metacritic_raw_df['Title'] = metacritic_raw_df['Title'].replace(replace_dict, regex=True)

In [29]:
# Remove Redundant columns

In [30]:
metacritic_raw_df.Title = metacritic_raw_df.Title.apply(lambda x: 'gothic 2' if 'gothic 2' in x else x)
# metacritic_raw_df['Title'] = metacritic_raw_df.loc[metacritic_raw_df['Title'].str.contains('gothic 2'), 'name'] = 'gothic 2'
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*gothic 2.*$)', 'gothic 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*shoppe keep 2.*$)', 'shoppe keep 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*super seducer 2.*$)', 'super seducer 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*ys 8 lacrimosa of dana.*$)', 'ys 8 lacrimosa of dana')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*age of empires ii.*$)', 'age of empires ii')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*shoppe keep 2.*$)', 'shoppe keep 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*bionic commando.*$)', 'bionic commando')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*shoppe keep 2.*$)', 'shoppe keep 2')
# steam_store_raw_df['name'] = steam_store_raw_df.name.str.replace(r'(^.*blazblue.*$)', 'blazblue')

# steamspy_tag Load, transform, clean

In [31]:
#### Read in and preview steamspy tag info
steamspy_tag_raw_df = pd.read_csv('original_datasets/steamspy_tag_data.csv')
print(steamspy_tag_raw_df.shape)
steamspy_tag_raw_df.head()

(29022, 372)


Unnamed: 0,appid,1980s,1990s,2.5d,2d,2d_fighter,360_video,3d,3d_platformer,3d_vision,...,warhammer_40k,web_publishing,werewolves,western,word_game,world_war_i,world_war_ii,wrestling,zombies,e_sports
0,10,144,564,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,550
1,20,0,71,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,5,122,0,0,0
3,40,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50,0,77,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# FINAL MERGE

In [32]:
# Standardize string merge column to 'Game'
steam_store_raw_df.rename(columns = {'name':'Game'}, inplace=True)
metacritic_raw_df.rename(columns = {'Title':'Game'}, inplace=True)

In [33]:
# Create list of dfs that will join on Game str
str_join_dfs = [steam_store_raw_df, metacritic_raw_df, twitch_raw_df]

In [34]:
# String merge tables join
from functools import reduce
str_merge_df = reduce(lambda left,right: pd.merge(left,right,on='Game'),str_join_dfs)

In [35]:
print(str_merge_df.shape)

(236, 34)


In [36]:
final_df = pd.merge(left=str_merge_df, right=steamspy_tag_raw_df, on='appid')

In [37]:
print(final_df.shape)
final_df.head()

(236, 405)


Unnamed: 0,appid,Game,release_date,english,developer,publisher,platforms,required_age,achievements,positive_ratings,...,warhammer_40k,web_publishing,werewolves,western,word_game,world_war_i,world_war_ii,wrestling,zombies,e_sports
0,220,halflife 2,2004-11-16,1,Valve,Valve,windows;mac;linux,0,33,67902,...,0,0,0,0,0,0,0,0,607,0
1,440,team fortress 2,2007-10-10,1,Valve,Valve,windows;mac;linux,0,520,515879,...,0,0,0,0,0,0,0,0,0,0
2,500,left 4 dead,2008-11-17,1,Valve,Valve,windows;mac,0,73,17951,...,0,0,0,0,0,0,0,0,1064,0
3,550,left 4 dead 2,2009-11-19,1,Valve,Valve,windows;mac;linux,0,70,251789,...,0,0,0,0,0,0,0,0,5334,0
4,570,dota 2,2013-07-09,1,Valve,Valve,windows;mac;linux,0,0,863507,...,0,0,0,0,0,0,0,0,0,4768


In [38]:
final_df.to_csv('for_ml_model.csv')

# Steam_Twitch Merge Testing

In [39]:
# # Output to be used towards blending the mockup database
# steam_twitch = pd.merge(left=steam_store_raw_df, right=twitch_raw_df, left_on='name', right_on='Game')
# steam_twitch.shape
# steam_twitch.to_csv('explore_csvs/steam_twitch_merge.csv')

KeyError: 'name'

In [None]:
steam_twitch.shape

In [None]:
steam_twitch.head()

In [None]:
twitch_steam_unmatched = pd.merge(left=twitch_raw_df, right=steam_store_raw_df, how='left', left_on='Game', right_on='name')

In [None]:
drop_condition_right = twitch_steam_unmatched['Game'].isin(steam_twitch['name'])
twitch_steam_unmatched.drop(twitch_steam_unmatched[drop_condition_right].index, inplace = True)
print(twitch_steam_unmatched.shape)
twitch_steam_unmatched.to_csv('explore_csvs/twitch_steam_unmatched.csv')

In [None]:
# steam_unmatched = pd.merge(left=steam_store_raw_df, right=twitch_raw_df, how='left', left_on='name', right_on='Game')
# steam_unmatched.shape
# steam_unmatched

In [None]:
# drop_condition_left = steam_unmatched['name'].isin(steam_twitch['name'])
# steam_unmatched.drop(steam_unmatched[drop_condition_left].index, inplace = True)
# print(steam_unmatched.shape)
# steam_unmatched.to_csv('explore_csvs/steam_unmatched.csv')

# Meta_Twitch Merge testing

In [None]:
# Output to be used towards blending the mockup database
meta_twitch = pd.merge(left=metacritic_raw_df, right=twitch_raw_df, left_on='Title', right_on='Game')
meta_twitch.shape
meta_twitch.to_csv('explore_csvs/meta_twitch_merge.csv')

In [None]:
meta_twitch.shape

In [None]:
twitch_meta_unmatched = pd.merge(left=twitch_raw_df, right=metacritic_raw_df, how='left', left_on='Game', right_on='Title')

In [None]:
drop_condition_right = twitch_meta_unmatched['Game'].isin(meta_twitch['Title'])
twitch_meta_unmatched.drop(twitch_meta_unmatched[drop_condition_right].index, inplace = True)
print(twitch_meta_unmatched.shape)
twitch_meta_unmatched.to_csv('explore_csvs/twitch_meta_unmatched.csv')

In [None]:
meta_unmatched = pd.merge(left=metacritic_raw_df, right=twitch_raw_df, how='left', left_on='Title', right_on='Game')
meta_unmatched.shape
meta_unmatched

In [None]:
drop_condition_left = meta_unmatched['Title'].isin(meta_twitch['Game'])
meta_unmatched.drop(meta_unmatched[drop_condition_left].index, inplace = True)
print(meta_unmatched.shape)
meta_unmatched.to_csv('explore_csvs/meta_unmatched.csv')