In [10]:
import pandas as pd
import numpy as np
from janitor import clean_names
from pyprojroot import here

In [53]:
sales_2019 = pd.read_csv(here("./raw_data/sales-2019.csv")).clean_names()
sales_2016 = pd.read_csv(here("./raw_data/sales-2016-with-ratings.csv")).clean_names()
sales_xbox = pd.read_csv(here("./raw_data/xbox-one-game-sales.csv")).clean_names()
sales_ps4 = pd.read_csv(here("./raw_data/ps4-game-sales.csv")).clean_names()

In [54]:
# drop columns we won't be needing
sales_2019.drop(['basename', 'vgchartz_score', 'critic_score', 'user_score', 'last_update', 'url', 'status', \
                 'vgchartzscore', 'img_url'], axis=1, inplace=True)

In [55]:
sales_2019.isna().sum()

rank                 0
name                 0
genre                0
esrb_rating      32169
platform             0
publisher            0
developer           17
total_shipped    53965
global_sales     36377
na_sales         42828
pal_sales        42603
jp_sales         48749
other_sales      40270
year               979
dtype: int64

In [56]:
# replace NAs in global_sales with corresponding value from total_shipped
sales_2019['global_sales'] = sales_2019['global_sales'].fillna(sales_2019.pop('total_shipped'))

In [57]:
sales_2019.dtypes

rank              int64
name             object
genre            object
esrb_rating      object
platform         object
publisher        object
developer        object
global_sales    float64
na_sales        float64
pal_sales       float64
jp_sales        float64
other_sales     float64
year            float64
dtype: object

In [58]:
# replace NAs in year column with 0
sales_2019['year'] = sales_2019['year'].fillna(0)

In [59]:
# convert year column from float to integer
sales_2019["year"] = sales_2019["year"].astype(int)

In [60]:
sales_2019.columns[7:12]

Index(['global_sales', 'na_sales', 'pal_sales', 'jp_sales', 'other_sales'], dtype='object')

In [61]:
# drop all rows where global_sales through other_sales are all NaN
sales_2019.dropna(subset=sales_2019.columns[7:12], how='all', inplace=True)

In [62]:
sales_2019.loc[sales_2019["global_sales"] >= 0.1]

Unnamed: 0,rank,name,genre,esrb_rating,platform,publisher,developer,global_sales,na_sales,pal_sales,jp_sales,other_sales,year
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,82.86,,,,,2006
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,40.24,,,,,1985
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,37.14,,,,,2008
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,36.60,,,,,2017
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,33.09,,,,,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12637,12638,Slime Rancher,Simulation,E10,XOne,Skybound Games,Monomi Park,0.10,0.08,,,0.01,2018
12638,12639,thinkSMART,Misc,E,DS,Conspiracy Entertainment,Conspiracy Entertainment,0.10,0.09,,,0.01,2010
12639,12640,Disney Planes Fire & Rescue,Action,E,3DS,Disney Interactive Studios,Little Orbit,0.10,0.01,0.08,,0.01,2014
12640,12641,Lost Dimension,Role-Playing,T,PSV,Atlus,Lancarse,0.10,0.04,0.02,0.02,0.02,2015


In [63]:
sales_2019.platform.unique()

array(['Wii', 'NES', 'PC', 'GB', 'DS', 'X360', 'SNES', 'PS3', 'PS4',
       '3DS', 'PS2', 'GBA', 'NS', 'GEN', 'N64', 'PS', 'XOne', 'WiiU',
       'XB', 'PSP', '2600', 'GC', 'GBC', 'PSN', 'PSV', 'DC', 'SAT', 'SCD',
       'WS', 'XBL', 'Amig', 'VC', 'NG', 'WW', 'PCE', '3DO', 'GG', 'OSX',
       'PCFX', 'Mob'], dtype=object)

In [64]:
sales_2019.groupby(['platform']).size()

platform
2600     133
3DO        4
3DS      646
Amig       1
DC        52
DS      2511
GB        94
GBA      937
GBC        9
GC       564
GEN       31
GG         1
Mob        4
N64      322
NES      103
NG        12
NS       298
OSX        1
PC      2785
PCE        2
PCFX       1
PS      1211
PS2     2247
PS3     1411
PS4      957
PSN       16
PSP     1401
PSV      687
SAT      175
SCD        6
SNES     242
VC         6
WS         7
WW         3
Wii     1437
WiiU     170
X360    1349
XB       850
XBL       10
XOne     546
dtype: int64

In [65]:
# replace NAs in year column with 0
sales_2016['year_of_release'] = sales_2016['year_of_release'].fillna(0)

In [66]:
# convert year_of_release column from float to integer
sales_2016["year_of_release"] = sales_2016["year_of_release"].astype(int)

In [67]:
sales_2016.columns[10:14]

Index(['critic_score', 'critic_count', 'user_score', 'user_count'], dtype='object')

In [68]:
# drop all rows where critic_score through user_count are all NaN
sales_2016.dropna(subset=sales_2016.columns[10:14], how='all', inplace=True)

In [69]:
# drop columns we won't be needing
sales_2016 = sales_2016.loc[:, ['name', 'platform', 'critic_score', 'user_score']]

In [70]:
sales_2016.columns

Index(['name', 'platform', 'critic_score', 'user_score'], dtype='object')

In [71]:
# convert user_score from object to float in order to turn all "tbd" into NaN
sales_2016['user_score'] = pd.to_numeric(sales_2016['user_score'],errors = 'coerce')

In [72]:
sales_2016.columns[2:4]

Index(['critic_score', 'user_score'], dtype='object')

In [73]:
# drop all rows where both critic_score and user_score are NaN
sales_2016.dropna(subset=sales_2016.columns[2:4], how='all', inplace=True)

In [74]:
# create joined dataset with ratings from sales_2016
sales_joined = sales_2019.merge(sales_2016, how='left', on=['name', 'platform'])

In [75]:
sales_joined.groupby(['platform']).size()

platform
2600     133
3DO        4
3DS      646
Amig       1
DC        52
DS      2511
GB        94
GBA      937
GBC        9
GC       564
GEN       31
GG         1
Mob        4
N64      322
NES      103
NG        12
NS       298
OSX        1
PC      2786
PCE        2
PCFX       1
PS      1211
PS2     2247
PS3     1412
PS4      957
PSN       16
PSP     1401
PSV      687
SAT      175
SCD        6
SNES     242
VC         6
WS         7
WW         3
Wii     1437
WiiU     170
X360    1350
XB       850
XBL       10
XOne     546
dtype: int64

In [76]:
# remove irrelevant platforms
sales_joined = sales_joined.apply(lambda row: row[~sales_joined['platform']
                                                  .isin(['3DO','Amig', 'GG', 'Mob', \
                                                         'OSX', 'PCE', 'PCFX', 'SCD', 'VC', 'WS', 'WW'])])

In [77]:
# rename platforms for clarity
sales_joined["platform"].replace({"2600": "Atari 2600", "3DS": "Nintendo 3DS", "DS": "Nintendo DS", \
                                  "DC": "Sega Dreamcast", "GB": "Game Boy", "GBA": "Game Boy Advance", \
                                  "GBC": "Game Boy Color", "GC": "GameCube", "GEN": "Sega Genesis", \
                                  "N64": "Nintendo 64", "NG": "Neo Geo", "NS": "Nintendo Switch", \
                                  "PS": "PlayStation", "PS2": "PlayStation 2", "PS3": "PlayStation 3", \
                                  "PS4": "PlayStation 4", "PSN": "PlayStation Network", \
                                  "PSP": "PlayStation Portable", "PSV": "PlayStation Vita", \
                                  "SAT": "Sega Saturn", "Wii": "Nintendo Wii", "WiiU": "Nintendo Wii U", \
                                  "X360": "Xbox 360", "XB": "Xbox", "XBL": "Xbox Live", "XOne": "Xbox One"}, \
                                 inplace=True)

In [78]:
sales_joined.groupby(['platform']).size()

platform
Atari 2600               133
Game Boy                  94
Game Boy Advance         937
Game Boy Color             9
GameCube                 564
NES                      103
Neo Geo                   12
Nintendo 3DS             646
Nintendo 64              322
Nintendo DS             2511
Nintendo Switch          298
Nintendo Wii            1437
Nintendo Wii U           170
PC                      2786
PlayStation             1211
PlayStation 2           2247
PlayStation 3           1412
PlayStation 4            957
PlayStation Network       16
PlayStation Portable    1401
PlayStation Vita         687
SNES                     242
Sega Dreamcast            52
Sega Genesis              31
Sega Saturn              175
Xbox                     850
Xbox 360                1350
Xbox Live                 10
Xbox One                 546
dtype: int64

In [79]:
sales_joined

Unnamed: 0,rank,name,genre,esrb_rating,platform,publisher,developer,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,critic_score,user_score
0,1,Wii Sports,Sports,E,Nintendo Wii,Nintendo,Nintendo EAD,82.86,,,,,2006,76.0,8.0
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,40.24,,,,,1985,,
2,3,Mario Kart Wii,Racing,E,Nintendo Wii,Nintendo,Nintendo EAD,37.14,,,,,2008,82.0,8.3
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,36.60,,,,,2017,,
4,5,Wii Sports Resort,Sports,E,Nintendo Wii,Nintendo,Nintendo EAD,33.09,,,,,2009,80.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21240,21238,"Mind, Body & Soul: Blend It!",Puzzle,,Nintendo DS,505 Games,Crush Digital,0.00,,0.0,,,2010,,
21241,21239,Ride! Carnival Tycoon,Strategy,E,PC,ValuSoft,ValuSoft,0.00,,0.0,,,2007,,
21242,21240,Shinobido: Tales of the Ninja,Action,,PlayStation Portable,Sony Computer Entertainment,Acquire,0.00,,0.0,,,2007,,
21243,21241,Tactical Ops: Assault on Terror,Shooter,,PC,Atari,Kamehan Studios,0.00,,0.0,,,2002,,


In [80]:
# remove games with no recorded sales
sales_joined = sales_joined.loc[sales_joined["global_sales"] > 0]

In [81]:
# create new variable for plotting (jp: joined/plotting)
sales_jp = sales_joined.copy()

In [82]:
# remove certain games that were bundled with consoles
sales_jp = sales_jp.loc[sales_jp["name"] != "Wii Sports"]
sales_jp = sales_jp.loc[sales_jp["name"] != "Super Mario Bros."]
sales_jp = sales_jp.loc[sales_jp["name"] != "Duck Hunt"]
sales_jp = sales_jp.loc[sales_jp["name"] != "Kinect Adventures!"]

In [83]:
# remove three rows where year is 0
sales_jp = sales_jp.loc[sales_jp["year"] != 0]

In [84]:
# restrict year range
sales_jp = sales_jp.loc[sales_jp["year"] <= 2018]
sales_jp = sales_jp.loc[sales_jp["year"] >= 1985]

In [85]:
# remove games with no recorded sales
sales_xbox = sales_xbox.loc[sales_xbox["global"] > 0]

In [86]:
sales_xbox.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 423 entries, 0 to 422
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pos            423 non-null    int64  
 1   game           423 non-null    object 
 2   year           422 non-null    float64
 3   genre          423 non-null    object 
 4   publisher      422 non-null    object 
 5   north_america  423 non-null    float64
 6   europe         423 non-null    float64
 7   japan          423 non-null    float64
 8   rest_of_world  423 non-null    float64
 9   global         423 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 36.4+ KB


In [87]:
sales_xbox.dtypes

pos                int64
game              object
year             float64
genre             object
publisher         object
north_america    float64
europe           float64
japan            float64
rest_of_world    float64
global           float64
dtype: object

In [88]:
sales_xbox

Unnamed: 0,pos,game,year,genre,publisher,north_america,europe,japan,rest_of_world,global
0,1,Grand Theft Auto V,2014.0,Action,Rockstar Games,4.70,3.25,0.01,0.76,8.72
1,2,Call of Duty: Black Ops 3,2015.0,Shooter,Activision,4.63,2.04,0.02,0.68,7.37
2,3,Call of Duty: WWII,2017.0,Shooter,Activision,3.75,1.91,0.00,0.57,6.23
3,4,Red Dead Redemption 2,2018.0,Action-Adventure,Rockstar Games,3.76,1.47,0.00,0.54,5.77
4,5,MineCraft,2014.0,Misc,Microsoft Studios,3.23,1.71,0.00,0.49,5.43
...,...,...,...,...,...,...,...,...,...,...
418,419,ScreamRide,2015.0,Simulation,Microsoft Studios,0.00,0.01,0.00,0.00,0.01
419,420,Torment: Tides of Numenera,2017.0,Role-Playing,Techland,0.01,0.00,0.00,0.00,0.01
420,421,Ziggurat,2016.0,Shooter,Soedesco,0.00,0.01,0.00,0.00,0.01
421,422,ATV Renegades,2017.0,Racing,Nighthawk Interactive,0.00,0.00,0.00,0.00,0.01


In [89]:
sales_ps4

Unnamed: 0,game,year,genre,publisher,north_america,europe,japan,rest_of_world,global
0,Grand Theft Auto V,2014.0,Action,Rockstar Games,6.06,9.71,0.60,3.02,19.39
1,Call of Duty: Black Ops 3,2015.0,Shooter,Activision,6.18,6.05,0.41,2.44,15.09
2,Red Dead Redemption 2,2018.0,Action-Adventure,Rockstar Games,5.26,6.21,0.21,2.26,13.94
3,Call of Duty: WWII,2017.0,Shooter,Activision,4.67,6.21,0.40,2.12,13.40
4,FIFA 18,2017.0,Sports,EA Sports,1.27,8.64,0.15,1.73,11.80
...,...,...,...,...,...,...,...,...,...
1029,Fallen Legion: Flames of Rebellion,,Role-Playing,,0.00,0.00,0.00,0.00,0.00
1030,Radial G Racing Revolved,2017.0,Racing,Tammeka Games,0.00,0.00,0.00,0.00,0.00
1031,The Mummy Demastered,,Action,,0.00,0.00,0.00,0.00,0.00
1032,Project Nimbus: Code Mirai,,Action,,0.00,0.00,0.00,0.00,0.00


In [90]:
# remove games with no recorded sales
sales_ps4 = sales_ps4.loc[sales_ps4["global"] > 0]

In [91]:
sales_ps4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 711
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   game           712 non-null    object 
 1   year           709 non-null    float64
 2   genre          712 non-null    object 
 3   publisher      709 non-null    object 
 4   north_america  712 non-null    float64
 5   europe         712 non-null    float64
 6   japan          712 non-null    float64
 7   rest_of_world  712 non-null    float64
 8   global         712 non-null    float64
dtypes: float64(6), object(3)
memory usage: 55.6+ KB


In [92]:
# write new csv files
sales_joined.to_csv(here("./clean_data/sales-joined.csv"), index=False)
sales_jp.to_csv(here("./clean_data/sales-jp.csv"), index=False)