In [2]:
import pandas as pd
import numpy as np
from janitor import clean_names
from pyprojroot import here

In [70]:
sales_2019 = pd.read_csv(here("./raw_data/sales-2019.csv")).clean_names()
sales_2016 = pd.read_csv(here("./raw_data/sales-2016-with-ratings.csv")).clean_names()
sales_xbox = pd.read_csv(here("./raw_data/xbox-one-game-sales.csv")).clean_names()
sales_ps4 = pd.read_csv(here("./raw_data/ps4-game-sales.csv")).clean_names()

In [12]:
sales_2019.columns

Index(['rank', 'name', 'basename', 'genre', 'esrb_rating', 'platform',
       'publisher', 'developer', 'vgchartz_score', 'critic_score',
       'user_score', 'total_shipped', 'global_sales', 'na_sales', 'pal_sales',
       'jp_sales', 'other_sales', 'year', 'last_update', 'url', 'status',
       'vgchartzscore', 'img_url'],
      dtype='object')

In [13]:
sales_2019.describe()

Unnamed: 0,rank,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,status,vgchartzscore
count,55792.0,0.0,6536.0,335.0,1827.0,19415.0,12964.0,13189.0,7043.0,15522.0,54813.0,55792.0,799.0
mean,27896.5,,7.213709,8.253433,1.887258,0.365503,0.275541,0.155263,0.110402,0.044719,2005.659095,1.0,7.425907
std,16105.907446,,1.454079,1.401489,4.195693,0.833022,0.512809,0.399257,0.184673,0.129554,8.355585,0.0,1.384226
min,1.0,,1.0,2.0,0.03,0.0,0.0,0.0,0.0,0.0,1970.0,1.0,2.6
25%,13948.75,,6.4,7.8,0.2,0.03,0.05,0.01,0.02,0.0,2000.0,1.0,6.8
50%,27896.5,,7.5,8.5,0.59,0.12,0.12,0.04,0.05,0.01,2008.0,1.0,7.8
75%,41844.25,,8.3,9.1,1.8,0.36,0.29,0.14,0.12,0.04,2011.0,1.0,8.5
max,55792.0,,10.0,10.0,82.86,20.32,9.76,9.85,2.69,3.12,2020.0,1.0,9.6


In [14]:
sales_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55792 entries, 0 to 55791
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            55792 non-null  int64  
 1   name            55792 non-null  object 
 2   basename        55792 non-null  object 
 3   genre           55792 non-null  object 
 4   esrb_rating     23623 non-null  object 
 5   platform        55792 non-null  object 
 6   publisher       55792 non-null  object 
 7   developer       55775 non-null  object 
 8   vgchartz_score  0 non-null      float64
 9   critic_score    6536 non-null   float64
 10  user_score      335 non-null    float64
 11  total_shipped   1827 non-null   float64
 12  global_sales    19415 non-null  float64
 13  na_sales        12964 non-null  float64
 14  pal_sales       13189 non-null  float64
 15  jp_sales        7043 non-null   float64
 16  other_sales     15522 non-null  float64
 17  year            54813 non-null 

In [71]:
# drop columns we won't be needing
sales_2019.drop(['basename', 'vgchartz_score', 'critic_score', 'user_score', 'last_update', 'url', 'status', \
                 'vgchartzscore', 'img_url'], axis=1, inplace=True)

In [64]:
sales_2019.isna().sum()

rank                 0
name                 0
genre                0
esrb_rating      32169
platform             0
publisher            0
developer           17
total_shipped    53965
global_sales     36377
na_sales         42828
pal_sales        42603
jp_sales         48749
other_sales      40270
year               979
dtype: int64

In [72]:
# replace NAs in global_sales with corresponding value from total_shipped
sales_2019['global_sales'] = sales_2019['global_sales'].fillna(sales_2019.pop('total_shipped'))

In [17]:
sales_2019.dtypes

rank              int64
name             object
basename         object
genre            object
esrb_rating      object
platform         object
publisher        object
developer        object
critic_score    float64
user_score      float64
global_sales    float64
na_sales        float64
pal_sales       float64
jp_sales        float64
other_sales     float64
year              int64
dtype: object

In [73]:
# replace NAs in year column with 0
sales_2019['year'] = sales_2019['year'].fillna(0)

In [74]:
# convert year column from float to integer
sales_2019["year"] = sales_2019["year"].astype(int)

In [78]:
# drop all rows where global_sales through other_sales are all NaN
sales_2019.dropna(subset=sales_2019.columns[7:11], how='all', inplace=True)

In [36]:
sales_2019.loc[sales_2019["global_sales"] >= 0.1]

Unnamed: 0,rank,name,basename,genre,esrb_rating,platform,publisher,developer,critic_score,user_score,global_sales,na_sales,pal_sales,jp_sales,other_sales,year
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,82.86,,,,,2006
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,10.0,,40.24,,,,,1985
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,37.14,,,,,2008
3,4,PlayerUnknown's Battlegrounds,playerunknowns-battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,36.60,,,,,2017
4,5,Wii Sports Resort,wii-sports-resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,33.09,,,,,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12637,12638,Slime Rancher,slime-rancher,Simulation,E10,XOne,Skybound Games,Monomi Park,,,0.10,0.08,,,0.01,2018
12638,12639,thinkSMART,thinksmart,Misc,E,DS,Conspiracy Entertainment,Conspiracy Entertainment,,,0.10,0.09,,,0.01,2010
12639,12640,Disney Planes Fire & Rescue,disney-planes-fire-amp-rescue,Action,E,3DS,Disney Interactive Studios,Little Orbit,,,0.10,0.01,0.08,,0.01,2014
12640,12641,Lost Dimension,lost-dimension,Role-Playing,T,PSV,Atlus,Lancarse,,,0.10,0.04,0.02,0.02,0.02,2015


In [37]:
sales_2019.platform.unique()

array(['Wii', 'NES', 'PC', 'GB', 'DS', 'X360', 'SNES', 'PS3', 'PS4',
       '3DS', 'PS2', 'GBA', 'NS', 'GEN', 'N64', 'PS', 'XOne', 'WiiU',
       'XB', 'PSP', '2600', 'GC', 'GBC', 'PSN', 'PSV', 'DC', 'SAT', 'SCD',
       'WS', 'XBL', 'Amig', 'VC', 'NG', 'WW', 'PCE', '3DO', 'GG', 'OSX',
       'PCFX', 'Mob', 'DSiW', 'AJ', 'NGage', 'Linux', 'iOS'], dtype=object)

In [108]:
sales_2016 = pd.read_csv(here("./raw_data/sales-2016-with-ratings.csv")).clean_names()

In [109]:
# replace NAs in year column with 0
sales_2016['year_of_release'] = sales_2016['year_of_release'].fillna(0)

In [110]:
# convert year_of_release column from float to integer
sales_2016["year_of_release"] = sales_2016["year_of_release"].astype(int)

In [111]:
# drop all rows where critic_score through user_count are all NaN
sales_2016.dropna(subset=sales_2016.columns[10:14], how='all', inplace=True)

In [113]:
# drop columns we won't be needing
sales_2016 = sales_2016.loc[:, ['name', 'platform', 'critic_score', 'user_score']]

In [114]:
# convert user_score from object to float in order to turn all "tbd" into NaN
sales_2016['user_score'] = pd.to_numeric(sales_2016['user_score'],errors = 'coerce')

In [117]:
# drop all rows where both critic_score and user_score are NaN
sales_2016.dropna(subset=sales_2016.columns[2:4], how='all', inplace=True)

In [118]:
sales_2016

Unnamed: 0,name,platform,critic_score,user_score
0,Wii Sports,Wii,76.0,8.0
2,Mario Kart Wii,Wii,82.0,8.3
3,Wii Sports Resort,Wii,80.0,8.0
6,New Super Mario Bros.,DS,89.0,8.5
7,Wii Play,Wii,58.0,6.6
...,...,...,...,...
16700,Breach,PC,61.0,5.8
16701,Bust-A-Move 3000,GC,53.0,
16702,Mega Brain Boost,DS,48.0,
16706,STORM: Frontline Nation,PC,60.0,7.2


In [106]:
sales_2019

Unnamed: 0,rank,name,genre,esrb_rating,platform,publisher,developer,global_sales,na_sales,pal_sales,jp_sales,other_sales,year
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,82.86,,,,,2006
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,40.24,,,,,1985
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,37.14,,,,,2008
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,36.60,,,,,2017
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,33.09,,,,,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21237,21238,"Mind, Body & Soul: Blend It!",Puzzle,,DS,505 Games,Crush Digital,0.00,,0.0,,,2010
21238,21239,Ride! Carnival Tycoon,Strategy,E,PC,ValuSoft,ValuSoft,0.00,,0.0,,,2007
21239,21240,Shinobido: Tales of the Ninja,Action,,PSP,Sony Computer Entertainment,Acquire,0.00,,0.0,,,2007
21240,21241,Tactical Ops: Assault on Terror,Shooter,,PC,Atari,Kamehan Studios,0.00,,0.0,,,2002


In [119]:
sales_2019.merge(sales_2016, how='left', on='name')

Unnamed: 0,rank,name,genre,esrb_rating,platform_x,publisher,developer,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,platform_y,critic_score,user_score
0,1,Wii Sports,Sports,E,Wii,Nintendo,Nintendo EAD,82.86,,,,,2006,Wii,76.0,8.0
1,2,Super Mario Bros.,Platform,,NES,Nintendo,Nintendo EAD,40.24,,,,,1985,,,
2,3,Mario Kart Wii,Racing,E,Wii,Nintendo,Nintendo EAD,37.14,,,,,2008,Wii,82.0,8.3
3,4,PlayerUnknown's Battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,36.60,,,,,2017,,,
4,5,Wii Sports Resort,Sports,E,Wii,Nintendo,Nintendo EAD,33.09,,,,,2009,Wii,80.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34330,21238,"Mind, Body & Soul: Blend It!",Puzzle,,DS,505 Games,Crush Digital,0.00,,0.0,,,2010,,,
34331,21239,Ride! Carnival Tycoon,Strategy,E,PC,ValuSoft,ValuSoft,0.00,,0.0,,,2007,,,
34332,21240,Shinobido: Tales of the Ninja,Action,,PSP,Sony Computer Entertainment,Acquire,0.00,,0.0,,,2007,,,
34333,21241,Tactical Ops: Assault on Terror,Shooter,,PC,Atari,Kamehan Studios,0.00,,0.0,,,2002,,,


In [121]:
sales_2016['name'].is_unique

False