In [2]:
import pandas as pd
import numpy as np
from janitor import clean_names
from pyprojroot import here

In [3]:
# Read each raw CSV (path resolved from the project root by `here`) and
# normalise its column names with pyjanitor's functional `clean_names`.
sales_2019 = clean_names(pd.read_csv(here("./raw_data/sales-2019.csv")))
sales_2016 = clean_names(pd.read_csv(here("./raw_data/sales-2016-with-ratings.csv")))
sales_xbox = clean_names(pd.read_csv(here("./raw_data/xbox-one-game-sales.csv")))
sales_ps4 = clean_names(pd.read_csv(here("./raw_data/ps4-game-sales.csv")))

In [12]:
# list the column names of the 2019 dataset after clean_names() was applied
sales_2019.columns

Index(['rank', 'name', 'basename', 'genre', 'esrb_rating', 'platform',
       'publisher', 'developer', 'vgchartz_score', 'critic_score',
       'user_score', 'total_shipped', 'global_sales', 'na_sales', 'pal_sales',
       'jp_sales', 'other_sales', 'year', 'last_update', 'url', 'status',
       'vgchartzscore', 'img_url'],
      dtype='object')

In [13]:
# summary statistics (count, mean, std, quartiles) for the numeric columns
sales_2019.describe()

Unnamed: 0,rank,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,jp_sales,other_sales,year,status,vgchartzscore
count,55792.0,0.0,6536.0,335.0,1827.0,19415.0,12964.0,13189.0,7043.0,15522.0,54813.0,55792.0,799.0
mean,27896.5,,7.213709,8.253433,1.887258,0.365503,0.275541,0.155263,0.110402,0.044719,2005.659095,1.0,7.425907
std,16105.907446,,1.454079,1.401489,4.195693,0.833022,0.512809,0.399257,0.184673,0.129554,8.355585,0.0,1.384226
min,1.0,,1.0,2.0,0.03,0.0,0.0,0.0,0.0,0.0,1970.0,1.0,2.6
25%,13948.75,,6.4,7.8,0.2,0.03,0.05,0.01,0.02,0.0,2000.0,1.0,6.8
50%,27896.5,,7.5,8.5,0.59,0.12,0.12,0.04,0.05,0.01,2008.0,1.0,7.8
75%,41844.25,,8.3,9.1,1.8,0.36,0.29,0.14,0.12,0.04,2011.0,1.0,8.5
max,55792.0,,10.0,10.0,82.86,20.32,9.76,9.85,2.69,3.12,2020.0,1.0,9.6


In [14]:
# per-column non-null counts and dtypes, to see how sparse each column is
sales_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55792 entries, 0 to 55791
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rank            55792 non-null  int64  
 1   name            55792 non-null  object 
 2   basename        55792 non-null  object 
 3   genre           55792 non-null  object 
 4   esrb_rating     23623 non-null  object 
 5   platform        55792 non-null  object 
 6   publisher       55792 non-null  object 
 7   developer       55775 non-null  object 
 8   vgchartz_score  0 non-null      float64
 9   critic_score    6536 non-null   float64
 10  user_score      335 non-null    float64
 11  total_shipped   1827 non-null   float64
 12  global_sales    19415 non-null  float64
 13  na_sales        12964 non-null  float64
 14  pal_sales       13189 non-null  float64
 15  jp_sales        7043 non-null   float64
 16  other_sales     15522 non-null  float64
 17  year            54813 non-null 

In [5]:
# Drop columns we won't be needing.
# `errors='ignore'` keeps this cell re-runnable: once the columns are gone,
# running it again is a no-op instead of raising KeyError.
unused_columns = ['vgchartz_score', 'last_update', 'url', 'status', 'vgchartzscore', 'img_url']
sales_2019 = sales_2019.drop(columns=unused_columns, errors='ignore')

In [38]:
# missing values per column = total rows minus the non-null count of each column
len(sales_2019) - sales_2019.count()

rank                0
name                0
basename            0
genre               0
esrb_rating      6908
platform            0
publisher           0
developer           4
critic_score    16462
user_score      22663
global_sales     1756
na_sales        10034
pal_sales        9809
jp_sales        15955
other_sales      7476
year                0
dtype: int64

In [7]:
# Replace NAs in global_sales with the corresponding value from total_shipped,
# then discard total_shipped (pop removes the column from the frame).
# The guard makes the cell safe to re-run: after the first run total_shipped
# no longer exists, and an unguarded pop would raise KeyError.
if 'total_shipped' in sales_2019.columns:
    total_shipped = sales_2019.pop('total_shipped')
    sales_2019['global_sales'] = sales_2019['global_sales'].fillna(total_shipped)

In [17]:
# check the dtype of each remaining column
sales_2019.dtypes

rank              int64
name             object
basename         object
genre            object
esrb_rating      object
platform         object
publisher        object
developer        object
critic_score    float64
user_score      float64
global_sales    float64
na_sales        float64
pal_sales       float64
jp_sales        float64
other_sales     float64
year              int64
dtype: object

In [14]:
# Missing release years become 0; the per-column dict restricts the fill
# to the year column and leaves every other column's NaNs untouched.
sales_2019 = sales_2019.fillna({'year': 0})

In [15]:
# Cast year from float to integer; only the year column is listed in the
# mapping, so all other dtypes stay as they are.
sales_2019 = sales_2019.astype({"year": int})

In [33]:
# Drop all rows where critic_score through other_sales are all NaN.
# The columns are named explicitly rather than sliced by position: the old
# `columns[8:14]` slice stopped at jp_sales (excluding other_sales) and would
# silently point at different columns if the frame's layout changed.
score_and_sales_columns = [
    'critic_score',
    'user_score',
    'global_sales',
    'na_sales',
    'pal_sales',
    'jp_sales',
    'other_sales',
]
sales_2019 = sales_2019.dropna(subset=score_and_sales_columns, how='all')

In [36]:
# show the rows whose global sales are at least 0.1 (rows with NaN sales are excluded)
sales_2019[sales_2019["global_sales"].ge(0.1)]

Unnamed: 0,rank,name,basename,genre,esrb_rating,platform,publisher,developer,critic_score,user_score,global_sales,na_sales,pal_sales,jp_sales,other_sales,year
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,7.7,,82.86,,,,,2006
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,10.0,,40.24,,,,,1985
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,8.2,9.1,37.14,,,,,2008
3,4,PlayerUnknown's Battlegrounds,playerunknowns-battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,36.60,,,,,2017
4,5,Wii Sports Resort,wii-sports-resort,Sports,E,Wii,Nintendo,Nintendo EAD,8.0,8.8,33.09,,,,,2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12637,12638,Slime Rancher,slime-rancher,Simulation,E10,XOne,Skybound Games,Monomi Park,,,0.10,0.08,,,0.01,2018
12638,12639,thinkSMART,thinksmart,Misc,E,DS,Conspiracy Entertainment,Conspiracy Entertainment,,,0.10,0.09,,,0.01,2010
12639,12640,Disney Planes Fire & Rescue,disney-planes-fire-amp-rescue,Action,E,3DS,Disney Interactive Studios,Little Orbit,,,0.10,0.01,0.08,,0.01,2014
12640,12641,Lost Dimension,lost-dimension,Role-Playing,T,PSV,Atlus,Lancarse,,,0.10,0.04,0.02,0.02,0.02,2015


In [37]:
# distinct platform codes present in the 2019 dataset
sales_2019["platform"].unique()

array(['Wii', 'NES', 'PC', 'GB', 'DS', 'X360', 'SNES', 'PS3', 'PS4',
       '3DS', 'PS2', 'GBA', 'NS', 'GEN', 'N64', 'PS', 'XOne', 'WiiU',
       'XB', 'PSP', '2600', 'GC', 'GBC', 'PSN', 'PSV', 'DC', 'SAT', 'SCD',
       'WS', 'XBL', 'Amig', 'VC', 'NG', 'WW', 'PCE', '3DO', 'GG', 'OSX',
       'PCFX', 'Mob', 'DSiW', 'AJ', 'NGage', 'Linux', 'iOS'], dtype=object)

In [39]:
# preview the 2016 dataset (the rendered table is truncated to the first and last rows)
sales_2016

Unnamed: 0,name,platform,year_of_release,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales,critic_score,critic_count,user_score,user_count,developer,rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,
