In [2]:
import pandas as pd
import numpy as np
from janitor import clean_names
from pyprojroot import here

In [195]:
sales_2019 = pd.read_csv(here("./raw_data/sales-2019.csv")).clean_names()
sales_2016 = pd.read_csv(here("./raw_data/sales-2016-with-ratings.csv")).clean_names()
sales_xbox = pd.read_csv(here("./raw_data/xbox-one-game-sales.csv")).clean_names()
sales_ps4 = pd.read_csv(here("./raw_data/ps4-game-sales.csv")).clean_names()

In [12]:
sales_2019.columns

Index(['rank', 'name', 'basename', 'genre', 'esrb_rating', 'platform',
       'publisher', 'developer', 'vgchartz_score', 'critic_score',
       'user_score', 'total_shipped', 'global_sales', 'na_sales', 'pal_sales',
       'jp_sales', 'other_sales', 'year', 'last_update', 'url', 'status',
       'vgchartzscore', 'img_url'],
      dtype='object')

In [13]:
sales_2019.describe()

Unnamed: 0,rank,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,status,vgchartzscore
count,55792.0,0.0,6536.0,335.0,1827.0,19415.0,12964.0,13189.0,7043.0,15522.0,54813.0,55792.0,799.0
mean,27896.5,,7.213709,8.253433,1.887258,0.365503,0.275541,0.155263,0.110402,0.044719,2005.659095,1.0,7.425907
std,16105.907446,,1.454079,1.401489,4.195693,0.833022,0.512809,0.399257,0.184673,0.129554,8.355585,0.0,1.384226
min,1.0,,1.0,2.0,0.03,0.0,0.0,0.0,0.0,0.0,1970.0,1.0,2.6
25%,13948.75,,6.4,7.8,0.2,0.03,0.05,0.01,0.02,0.0,2000.0,1.0,6.8
50%,27896.5,,7.5,8.5,0.59,0.12,0.12,0.04,0.05,0.01,2008.0,1.0,7.8
75%,41844.25,,8.3,9.1,1.8,0.36,0.29,0.14,0.12,0.04,2011.0,1.0,8.5
max,55792.0,,10.0,10.0,82.86,20.32,9.76,9.85,2.69,3.12,2020.0,1.0,9.6


In [14]:
sales_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55792 entries, 0 to 55791
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            55792 non-null  int64  
 1   name            55792 non-null  object 
 2   basename        55792 non-null  object 
 3   genre           55792 non-null  object 
 4   esrb_rating     23623 non-null  object 
 5   platform        55792 non-null  object 
 6   publisher       55792 non-null  object 
 7   developer       55775 non-null  object 
 8   vgchartz_score  0 non-null      float64
 9   critic_score    6536 non-null   float64
 10  user_score      335 non-null    float64
 11  total_shipped   1827 non-null   float64
 12  global_sales    19415 non-null  float64
 13  na_sales        12964 non-null  float64
 14  pal_sales       13189 non-null  float64
 15  jp_sales        7043 non-null   float64
 16  other_sales     15522 non-null  float64
 17  year            54813 non-null 

In [196]:
# drop columns we won't be needing
sales_2019.drop(['basename', 'vgchartz_score', 'critic_score', 'user_score', 'last_update', 'url', 'status', \
                 'vgchartzscore', 'img_url'], axis=1, inplace=True)

In [64]:
sales_2019.isna().sum()

rank                 0
name                 0
genre                0
esrb_rating      32169
platform             0
publisher            0
developer           17
total_shipped    53965
global_sales     36377
na_sales         42828
pal_sales        42603
jp_sales         48749
other_sales      40270
year               979
dtype: int64

In [197]:
# replace NAs in global_sales with corresponding value from total_shipped
sales_2019['global_sales'] = sales_2019['global_sales'].fillna(sales_2019.pop('total_shipped'))

In [17]:
sales_2019.dtypes

rank              int64
name             object
basename         object
genre            object
esrb_rating      object
platform         object
publisher        object
developer        object
critic_score    float64
user_score      float64
global_sales    float64
na_sales        float64
pal_sales       float64
jp_sales        float64
other_sales     float64
year              int64
dtype: object

In [198]:
# replace NAs in year column with 0
sales_2019['year'] = sales_2019['year'].fillna(0)

In [199]:
# convert year column from float to integer
sales_2019["year"] = sales_2019["year"].astype(int)

In [200]:
# drop all rows where global_sales through other_sales are all NaN
sales_2019.dropna(subset=sales_2019.columns[7:11], how='all', inplace=True)

In [36]:
sales_2019.loc[sales_2019["global_sales"] >= 0.1]

Unnamed: 0,rank,name,basename,genre,esrb_rating,platform,publisher,developer,critic_score,user_score,global_sales,na_sales,pal_sales,jp_sales,other_sales,year
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,82.86,,,,,2006
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,10.0,,40.24,,,,,1985
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,37.14,,,,,2008
3,4,PlayerUnknown's Battlegrounds,playerunknowns-battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,36.60,,,,,2017
4,5,Wii Sports Resort,wii-sports-resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,33.09,,,,,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12637,12638,Slime Rancher,slime-rancher,Simulation,E10,XOne,Skybound Games,Monomi Park,,,0.10,0.08,,,0.01,2018
12638,12639,thinkSMART,thinksmart,Misc,E,DS,Conspiracy Entertainment,Conspiracy Entertainment,,,0.10,0.09,,,0.01,2010
12639,12640,Disney Planes Fire & Rescue,disney-planes-fire-amp-rescue,Action,E,3DS,Disney Interactive Studios,Little Orbit,,,0.10,0.01,0.08,,0.01,2014
12640,12641,Lost Dimension,lost-dimension,Role-Playing,T,PSV,Atlus,Lancarse,,,0.10,0.04,0.02,0.02,0.02,2015


In [127]:
sales_2019.platform.unique()

array(['Wii', 'NES', 'PC', 'GB', 'DS', 'X360', 'SNES', 'PS3', 'PS4',
       '3DS', 'PS2', 'GBA', 'NS', 'GEN', 'N64', 'PS', 'XOne', 'WiiU',
       'XB', 'PSP', '2600', 'GC', 'GBC', 'PSN', 'PSV', 'DC', 'SAT', 'SCD',
       'WS', 'XBL', 'Amig', 'VC', 'NG', 'WW', 'PCE', '3DO', 'GG', 'OSX',
       'PCFX', 'Mob'], dtype=object)

In [128]:
sales_2019.groupby(['platform']).size()

platform
2600     133
3DO        4
3DS      646
Amig       1
DC        52
DS      2511
GB        94
GBA      937
GBC        9
GC       564
GEN       31
GG         1
Mob        4
N64      322
NES      103
NG        12
NS       298
OSX        1
PC      2785
PCE        2
PCFX       1
PS      1211
PS2     2247
PS3     1411
PS4      957
PSN       16
PSP     1401
PSV      687
SAT      175
SCD        6
SNES     242
VC         6
WS         7
WW         3
Wii     1437
WiiU     170
X360    1349
XB       850
XBL       10
XOne     546
dtype: int64

In [201]:
# replace NAs in year column with 0
sales_2016['year_of_release'] = sales_2016['year_of_release'].fillna(0)

In [202]:
# convert year_of_release column from float to integer
sales_2016["year_of_release"] = sales_2016["year_of_release"].astype(int)

In [203]:
# drop all rows where critic_score through user_count are all NaN
sales_2016.dropna(subset=sales_2016.columns[10:14], how='all', inplace=True)

In [204]:
# drop columns we won't be needing
sales_2016 = sales_2016.loc[:, ['name', 'platform', 'critic_score', 'user_score']]

In [205]:
# convert user_score from object to float in order to turn all "tbd" into NaN
sales_2016['user_score'] = pd.to_numeric(sales_2016['user_score'],errors = 'coerce')

In [206]:
# drop all rows where both critic_score and user_score are NaN
sales_2016.dropna(subset=sales_2016.columns[2:4], how='all', inplace=True)

In [207]:
# create joined dataset with ratings from sales_2016
sales_joined = sales_2019.merge(sales_2016, how='left', on=['name', 'platform'])

In [164]:
sales_joined.groupby(['platform']).size()

platform
2600     133
3DS      646
DC        52
DS      2511
GB        94
GBA      937
GBC        9
GC       564
GEN       31
N64      322
NES      103
NG        12
NS       298
PC      2786
PS      1211
PS2     2247
PS3     1412
PS4      957
PSN       16
PSP     1401
PSV      687
SAT      175
SNES     242
Wii     1437
WiiU     170
X360    1350
XB       850
XBL       10
XOne     546
dtype: int64

In [208]:
# remove irrelevant platforms
sales_joined = sales_joined.apply(lambda row: row[~sales_joined['platform']
                                                  .isin(['3DO','Amig', 'GG', 'Mob', \
                                                         'OSX', 'PCE', 'PCFX', 'SCD', 'VC', 'WS', 'WW'])])

In [209]:
# rename platforms for clarity
sales_joined["platform"].replace({"2600": "Atari 2600", "3DS": "Nintendo 3DS", "DS": "Nintendo DS", \
                                  "DC": "Sega Dreamcast", "GB": "Game Boy", "GBA": "Game Boy Advance", \
                                  "GBC": "Game Boy Color", "GC": "GameCube", "GEN": "Sega Genesis", \
                                  "N64": "Nintendo 64", "NG": "Neo Geo", "NS": "Nintendo Switch", \
                                  "PS": "PlayStation", "PS2": "PlayStation 2", "PS3": "PlayStation 3", \
                                  "PS4": "PlayStation 4", "PSN": "PlayStation Network", \
                                  "PSP": "PlayStation Portable", "PSV": "PlayStation Vita", \
                                  "SAT": "Sega Saturn", "Wii": "Nintendo Wii", "WiiU": "Nintendo Wii U", \
                                  "X360": "Xbox 360", "XB": "Xbox", "XBL": "Xbox Live", "XOne": "Xbox One"}, \
                                 inplace=True)

In [210]:
sales_joined.groupby(['platform']).size()

platform
Atari 2600               133
Game Boy                  94
Game Boy Advance         937
Game Boy Color             9
GameCube                 564
NES                      103
Neo Geo                   12
Nintendo 3DS             646
Nintendo 64              322
Nintendo DS             2511
Nintendo Switch          298
Nintendo Wii            1437
Nintendo Wii U           170
PC                      2786
PlayStation             1211
PlayStation 2           2247
PlayStation 3           1412
PlayStation 4            957
PlayStation Network       16
PlayStation Portable    1401
PlayStation Vita         687
SNES                     242
Sega Dreamcast            52
Sega Genesis              31
Sega Saturn              175
Xbox                     850
Xbox 360                1350
Xbox Live                 10
Xbox One                 546
dtype: int64

In [212]:
sales_joined

Unnamed: 0,rank,name,genre,esrb_rating,platform,publisher,developer,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,critic_score,user_score
0,1,Wii Sports,Sports,E,Nintendo Wii,Nintendo,Nintendo EAD,82.86,,,,,2006,76.0,8.0
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,40.24,,,,,1985,,
2,3,Mario Kart Wii,Racing,E,Nintendo Wii,Nintendo,Nintendo EAD,37.14,,,,,2008,82.0,8.3
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,36.60,,,,,2017,,
4,5,Wii Sports Resort,Sports,E,Nintendo Wii,Nintendo,Nintendo EAD,33.09,,,,,2009,80.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19860,19858,FirePower for Microsoft Combat Flight Simulator 3,Simulation,T,PC,GMX Media,Shockwave Productions,0.01,,0.0,,0.0,2004,,
19861,19859,Tom Clancy's Splinter Cell,Shooter,T,PC,Ubisoft,Ubisoft,0.01,,0.0,,0.0,2003,,
19862,19860,Ashita no Joe 2: The Anime Super Remix,Fighting,,PlayStation 2,Capcom,Capcom,0.01,,,0.01,,2002,,
19863,19861,Tokyo Yamanote Boys for V: Main Disc,Adventure,,PlayStation Vita,Rejet,Rejet,0.01,,,0.01,,2017,,


In [211]:
# remove games with no recorded sales
sales_joined = sales_joined.loc[sales_joined["global_sales"] > 0]

In [213]:
sales_xbox

Unnamed: 0,pos,game,year,genre,publisher,north_america,europe,japan,rest_of_world,global
0,1,Grand Theft Auto V,2014.0,Action,Rockstar Games,4.70,3.25,0.01,0.76,8.72
1,2,Call of Duty: Black Ops 3,2015.0,Shooter,Activision,4.63,2.04,0.02,0.68,7.37
2,3,Call of Duty: WWII,2017.0,Shooter,Activision,3.75,1.91,0.00,0.57,6.23
3,4,Red Dead Redemption 2,2018.0,Action-Adventure,Rockstar Games,3.76,1.47,0.00,0.54,5.77
4,5,MineCraft,2014.0,Misc,Microsoft Studios,3.23,1.71,0.00,0.49,5.43
...,...,...,...,...,...,...,...,...,...,...
608,609,Biomutant,2018.0,Action,THQ Nordic,0.00,0.00,0.00,0.00,0.00
609,610,Biomutant,2019.0,Action,THQ Nordic,0.00,0.00,0.00,0.00,0.00
610,611,de Blob,2017.0,Platform,THQ Nordic,0.00,0.00,0.00,0.00,0.00
611,612,Outcast: Second Contact,2017.0,Adventure,Bigben Interactive,0.00,0.00,0.00,0.00,0.00
