In [1]:
# Import the requests library
import pandas as pd
import re

# Steam_store_data Load, transform, clean

In [2]:
#### Read in and preview Gamespot Reviews
steam_all_df = pd.read_csv('original_datasets/steam_all.csv')
print(steam_all_df.shape)
steam_all_df.head()

(49964, 393)


Unnamed: 0.1,Unnamed: 0,time,appid,name,review,voted_up,release_date,english,developer,publisher,...,year,month,gamename,avg,gain,peak,avg_peak_perc,windows,mac,linux
0,1,2012-08-31,20,Team Fortress Classic,4.0,1.0,1999-04-01,1,Valve,Valve,...,2012,8,Team Fortress Classic,93.03,-5.14,189,0.492222,1,1,1
1,2,2012-09-30,20,Team Fortress Classic,2.0,0.5,1999-04-01,1,Valve,Valve,...,2012,9,Team Fortress Classic,83.83,-9.2,173,0.484566,1,1,1
2,3,2012-10-31,20,Team Fortress Classic,3.0,1.0,1999-04-01,1,Valve,Valve,...,2012,10,Team Fortress Classic,77.51,-6.32,179,0.433017,1,1,1
3,4,2012-11-30,20,Team Fortress Classic,2.0,1.0,1999-04-01,1,Valve,Valve,...,2012,11,Team Fortress Classic,78.58,1.07,169,0.46497,1,1,1
4,5,2012-12-31,20,Team Fortress Classic,6.0,1.0,1999-04-01,1,Valve,Valve,...,2012,12,Team Fortress Classic,87.79,9.21,191,0.459634,1,1,1


In [3]:
# Eliminate special characters found in column headers
steam_all_df.columns=steam_all_df.columns.str.replace('	','')

In [4]:
print(steam_all_df.shape)

(49964, 393)


In [5]:
# Make all names lower case
steam_all_df['name'] = steam_all_df['name'].str.lower()
# Remove all special characters
steam_all_df['name'] = steam_all_df['name'].str.replace(r'[^\x00-\x7f]', '')
# Remove all non-alpha-numeric
steam_all_df['name'] = steam_all_df['name'].str.replace(r'[^\w ]+', '')
# Remove all duplicate whitespace
steam_all_df['name'] = steam_all_df['name'].str.replace(r'\s+', ' ')
# Remove leading and trailing whitespace
steam_all_df['name'] = steam_all_df['name'].str.strip(r' ')

In [6]:
# Roman numeral dict for replacement
rom_to_int_dict = {r' ii$': ' 2',
                  r' iii$': ' 3',
                  r' iii$': ' 4',
                  r' v$': ' 5',
                  r' vi$': ' 6',
                  r' vii$': ' 7',
                  r' viii$': ' 8',
                  r' ix$': ' 9',
                  r' x$': ' 10',
                  r' xi$': '11',
                  r' xii$': '12',
                  r' xiii$': '13',
                  r' xiv$': '14'}

In [7]:
# Roman numeral to integer conversion
steam_all_df['name'] = steam_all_df['name'].replace(rom_to_int_dict, regex=True)

In [8]:
# Create a list of values that block matching between steam and twitch for removal
remove_list = (' hd', ' remastered', ' directors cut edition', ' game of the year enhanced', 
               ' aot wings of freedom', ' aot 2', ' remastered', ' expansion', ' hd edition',
              ' board game with online multiplayer', ' gold edition', ' game of the year edition',' creative survival',
              ' ultimate edition',' complete edition',' special edition',' enhanced edition',' definitive edition',
              ' windows edition',' redux',' classic hd',' millennium edition',' deluxe edition',' steam edition',
              ' directors cut', ' the telltale series', ' a realm reborn', ' night of the raven', ' a telltale game series',
              '00db', ' bfg edition', ' sol', ' guacamelee super turbo championship edition', ' lightspeed edition',
              ' the complete edition', ' a criterion game', ' ultimate hd edition', ' hd remastered',
              ' warlord edition', ' arcade edition')

In [9]:
# Remove all unwanted values from the remove_list
steam_all_df['name'] = steam_all_df['name'].str.replace('|'.join(remove_list), '')

In [10]:
# Create a dictionary of values that block matching between steam and twitch for replacement
replace_dict = {'catherine classic': 'catherine', 
                'creativedestruction': 'creative destruction', 
                'eternal return: black survival': 'eternal return',
                'heroes of might and magic 3 the shadow of death': 'heroes of might and magic 3',
                'orcs must die unchained': 'orcs must die',
                'puyo puyotetris': 'puyo puyo tetris',
                'life is strange episode 1': 'life is strange',
                'resident evil 4 biohazard 4': 'resident evil 4',
                'resident evil 2 biohazard re2': 'resident evil 2',
                'resident evil 6 biohazard 6': 'resident evil 6',
                'resident evil 7 biohazard 7': 'resident evil 7 biohazard',
                'rollercoaster tycoon 2 triple thrill pack': 'rollercoaster tycoon 2',
                'romance of the three kingdoms 13 13': 'romance of the three kingdoms 13',
                'shadowverse ccg': 'shadowverse',
                'soulworker anime action mmo': 'soulworker',
                'super seducer how to talk': 'super seducer',
                'the bards tale 5 barrows deep': 'the bards tale 5',
                'puyo puyotetris': 'puyo puyo tetris',
                'x x2 remastered': 'xx2 remastered',
                'hitman 2 silent assassin': 'hitman 2',
                'hitman episode 3 marrakesh': 'hitman 3',
                'la noire the': 'la noire',
                'life is strange before the storm episode 1 awake': 'life is strange before the storm',
               'microsoft flight simulator 2004 a century of flight': 'microsoft flight simulator',
               'sid meiers civilization vi rise fall': 'sid meiers civilization vi rise and fall'}

In [11]:
# Replace all keys from the replace_dict with their values within the dataframe
steam_all_df['name'] = steam_all_df['name'].replace(replace_dict, regex=True)

In [12]:
for col in steam_all_df.columns: 
    print(col)

Unnamed: 0
time
appid
name
review
voted_up
release_date
english
developer
publisher
achievements
recommended
1980s
1990s
2.5d
2d
2d_fighter
360_video
3d
3d_platformer
3d_vision
4_player_local
4x
6dof
atv
abstract
action
action_rpg
action_adventure
addictive
adventure
agriculture
aliens
alternate_history
america
animation_&_modeling
anime
arcade
arena_shooter
artificial_intelligence
assassin
asynchronous_multiplayer
atmospheric
audio_production
bmx
base_building
baseball
based_on_a_novel
basketball
batman
battle_royale
beat_em_up
beautiful
benchmark
bikes
blood
board_game
bowling
building
bullet_hell
bullet_time
crpg
capitalism
card_game
cartoon
cartoony
casual
cats
character_action_game
character_customization
chess
choices_matter
choose_your_own_adventure
cinematic
city_builder
class_based
classic
clicker
co_op
co_op_campaign
cold_war
colorful
comedy
comic_book
competitive
conspiracy
controller
conversation
crafting
crime
crowdfunded
cult_classic
cute
cyberpunk
cycling
dark
dark_comed

In [13]:
print(steam_all_df.shape)

(49964, 393)


In [14]:
#put all column names to a list for future for loop use to go thru each of the columns in the list
column_names_list = steam_all_df.columns.to_list()

In [15]:
#create an empty list to have the columns added to it (for dropping later)
li = []

#create a function to add in columns that's meeting a certain criteria
def columns_ready_to_drop(col_name):
    #if column value type is int, continue the following, if not, ignore.
    if steam_all_df[col_name].dtypes == "int64":

        #if dataframe col_name has 0 values more than 80% of the number of rows, save in a list, and ready to drop
        non_zero_rows = steam_all_df[col_name].apply(lambda r: True if r > 0 else False)
        numberOfRows = len(non_zero_rows[non_zero_rows == True].index)
        #testing
        #print(numberOfRows)
        #When the percentage below is increasing, it means the column has to have less zero rows to be able to remain in the new database. 
        # 0.2 means if non-zero rows are only 20% or less of the total rows, the column will be add to the drop list.
        if numberOfRows <= 0.20 *len(steam_all_df):
            li.append(col_name)
        else:
            pass
    else:
        pass
    return li

In [16]:
# use the function to run thru all columns
for col in column_names_list:
    columns_ready_to_drop(col)
    
print(li)

['1980s', '1990s', '2.5d', '2d_fighter', '360_video', '3d', '3d_platformer', '3d_vision', '4_player_local', '4x', '6dof', 'atv', 'abstract', 'action_rpg', 'action_adventure', 'addictive', 'agriculture', 'aliens', 'alternate_history', 'america', 'animation_&_modeling', 'anime', 'arcade', 'arena_shooter', 'artificial_intelligence', 'assassin', 'asynchronous_multiplayer', 'audio_production', 'bmx', 'base_building', 'baseball', 'based_on_a_novel', 'basketball', 'batman', 'battle_royale', 'beat_em_up', 'beautiful', 'benchmark', 'bikes', 'blood', 'board_game', 'bowling', 'building', 'bullet_hell', 'bullet_time', 'crpg', 'capitalism', 'card_game', 'cartoon', 'cartoony', 'cats', 'character_action_game', 'character_customization', 'chess', 'choices_matter', 'choose_your_own_adventure', 'cinematic', 'city_builder', 'class_based', 'classic', 'clicker', 'co_op_campaign', 'cold_war', 'colorful', 'comedy', 'comic_book', 'competitive', 'conspiracy', 'controller', 'conversation', 'crafting', 'crime', 

In [17]:
print(steam_all_df.shape)

(49964, 393)


In [18]:
#dropping columns when the dtypes of the columns is int64 and there are more than 80% of values being 0.
# tried 80%, 50%, 45% non-zero rows, but it's dropping too many columns, so chose 20% as the cut off for now, this can be changed.
steam_all_df = steam_all_df.drop(columns=li)
steam_all_df

Unnamed: 0.1,Unnamed: 0,time,appid,name,review,voted_up,release_date,english,developer,publisher,...,year,month,gamename,avg,gain,peak,avg_peak_perc,windows,mac,linux
0,1,2012-08-31,20,team fortress classic,4.0,1.000000,1999-04-01,1,Valve,Valve,...,2012,8,Team Fortress Classic,93.03,-5.14,189,0.492222,1,1,1
1,2,2012-09-30,20,team fortress classic,2.0,0.500000,1999-04-01,1,Valve,Valve,...,2012,9,Team Fortress Classic,83.83,-9.20,173,0.484566,1,1,1
2,3,2012-10-31,20,team fortress classic,3.0,1.000000,1999-04-01,1,Valve,Valve,...,2012,10,Team Fortress Classic,77.51,-6.32,179,0.433017,1,1,1
3,4,2012-11-30,20,team fortress classic,2.0,1.000000,1999-04-01,1,Valve,Valve,...,2012,11,Team Fortress Classic,78.58,1.07,169,0.464970,1,1,1
4,5,2012-12-31,20,team fortress classic,6.0,1.000000,1999-04-01,1,Valve,Valve,...,2012,12,Team Fortress Classic,87.79,9.21,191,0.459634,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49959,50433,2021-01-31,313120,stranded deep,295.0,0.756757,2015-01-23,1,Beam Team Games,Beam Team Pty Ltd,...,2021,1,Stranded Deep,747.84,205.91,1397,0.535319,1,1,1
49960,50434,2021-02-28,313120,stranded deep,187.0,0.802139,2015-01-23,1,Beam Team Games,Beam Team Pty Ltd,...,2021,2,Stranded Deep,542.77,-205.06,993,0.546596,1,1,1
49961,50435,2020-11-30,738060,freddy fazbears pizzeria simulator,1.0,1.000000,2017-12-04,1,Scott Cawthon,Scott Cawthon,...,2020,11,Freddy Fazbear's Pizzeria Simulator,114.97,9.19,227,0.506476,1,0,0
49962,50436,2020-12-31,738060,freddy fazbears pizzeria simulator,1.0,1.000000,2017-12-04,1,Scott Cawthon,Scott Cawthon,...,2020,12,Freddy Fazbear's Pizzeria Simulator,108.30,-6.66,207,0.523188,1,0,0


# all_twitch Load, transform, clean

In [19]:
#### Read in and preview twitch data
twitch_df = pd.read_csv('original_datasets/twitch_new.csv')
print(twitch_df.shape)

(1115596, 13)


In [20]:
# Make all names lower case
twitch_df['Game'] = twitch_df['Game'].str.lower()
# Remove all special characters
twitch_df['Game'] = twitch_df['Game'].str.replace(r'[^\x00-\x7f]', '')
# Remove all non-alpha-numeric
twitch_df['Game'] = twitch_df['Game'].str.replace(r'[^\w ]+', '')

# Remove all duplicate whitespace
twitch_df['Game'] = twitch_df['Game'].str.replace(r'\s+', ' ')
# Remove leading and trailing whitespace
twitch_df['Game'] = twitch_df['Game'].str.strip(r' ')

# Roman numeral to integer conversion
twitch_df['Game'] = twitch_df['Game'].replace(rom_to_int_dict, regex=True)
# Remove all unwanted values from the remove_list
twitch_df['Game'] = twitch_df['Game'].str.replace('|'.join(remove_list), '')
# Replace all keys from the replace_dict with their values within the dataframe
twitch_df['Game'] = twitch_df['Game'].replace(replace_dict, regex=True)

In [21]:
# Add prefix for easy ID downstream
twitch_df = twitch_df.add_prefix('tw_')

In [22]:
print(twitch_df.shape)

(1115596, 13)


# MERGE

In [23]:
# Standardize string merge column to 'Game'
steam_all_df.rename(columns = {'name':'Game'}, inplace=True)
twitch_df.rename(columns = {'tw_Game':'Game'}, inplace=True)

In [24]:
# Merge Twitch and Steam Data
df = pd.merge(left=steam_all_df, right=twitch_df, on='Game')

In [25]:
unique = df.Game.nunique()

In [26]:
unique

787

In [27]:
print(df.shape)

(2852404, 58)
