## Predict the Global Sales in Video Games using Sklearn

Import all the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import random

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

Read the data from the csv file

In [2]:
data = pd.read_csv("../vgsales.csv")

Get the info of the data

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


#### Check if there are nan values

In [4]:
data.isnull().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

Year feature has 271 nan values and Publisher feature has 58 nan values

In [5]:
year = data.Year.unique() #Get array of unique values of year
year_list=[] 
for i in year:
    if not np.isnan(i):
        year_list.append(i) #If the value in the array is not nan append it to year_list

To replace nan values with random year

In [6]:
year_length=len(data['Year'])        #Get the no. of instances 
for i in range(year_length):
    if str(data['Year'][i]) == ('nan' or ' nan'):
        a = random.choice(year_list) #get a random value from the year list 
        data['Year'][i] = a          #Replace it with random values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [7]:
publisher=(data.Publisher.unique()) #Get the array of unique values of publisher
publisher_list=[] 
for i in publisher:  
    if not pd.isnull(i):
        publisher_list.append(i) #Append non nan values to publisher_list

To replace nan values with random publisher

In [8]:
publisher_length=len(data['Publisher'])  #Get the no. of instances 
for i in range(publisher_length):
    if str(data['Publisher'][i]) == ('nan' or ' nan'):
        a = random.choice(publisher_list) #get a random value from the publisher list 
        data['Publisher'][i] = a          #Replace it with random values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [9]:
data.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16598 non-null float64
Genre           16598 non-null object
Publisher       16598 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


##### By visuallizing, the features year,genre,platform and publisher largely influences the Global Sales ,hence these features are considered

Converting the string columns into integer values using get_dummies

In [11]:
x=pd.get_dummies(data['Genre']) #convert string values to int for genre -> generates 12 col
x.head()

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
y=pd.get_dummies(data['Publisher']) #convert string values to int for publisher -> generates 576 col
y.head()

Unnamed: 0,10TACLE Studios,1C Company,20th Century Fox Video Games,2D Boy,3DO,49Games,505 Games,5pb,7G//AMES,989 Sports,...,Zushi Games,bitComposer Games,dramatic create,fonfun,iWin,id Software,imageepoch Inc.,inXile Entertainment,"mixi, Inc",responDESIGN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
z=pd.get_dummies(data['Platform']) #convert string values to int for platform -> generates 31 col
z.head()

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,SAT,SCD,SNES,TG16,WS,Wii,WiiU,X360,XB,XOne
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Convert the additional columns into a single column using PCA.

PCA helps in converting a large dataset into a smaller one, that still contains the same amount of information of the large dataset.This technique helps us reduce the larger dimensions of the data into a small one. 

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)  #reduce to 1 column

features1 = pca.fit_transform(x) #Fit the model with x(Genre) and apply the dimensionality reduction on x.
features1=pd.DataFrame(features1,columns=['genre']) #Create a genre column

features2 = pca.fit_transform(y) #Fit the model with y(publisher) and apply the dimensionality reduction on y.
features2=pd.DataFrame(features2,columns=['publisher']) #Create a publisher column

features3 = pca.fit_transform(z)#Fit the model with z(platform) and apply the dimensionality reduction on z.
features3=pd.DataFrame(features3,columns=['platform']) #Create a platform column

In [15]:
data.Year = data.Year.astype(int) #Convert year to int from float
df=data.drop(['Genre','Publisher','Platform'],axis=1)  #Drop the existing Genre ,publisher,platform columns 
dataset=pd.concat([df,features1,features2,features3],axis=1) #Concate the 3 new columns with the data
dataset.head()

Unnamed: 0,Rank,Name,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,genre,publisher,platform
0,1,Wii Sports,2006,41.49,29.02,3.77,8.46,82.74,-0.425423,-0.0953,-0.000845
1,2,Super Mario Bros.,1985,29.08,3.58,6.81,0.77,40.24,-0.129749,-0.0953,-0.000342
2,3,Mario Kart Wii,2008,15.85,12.88,3.79,3.31,35.82,-0.156853,-0.0953,-0.000845
3,4,Wii Sports Resort,2009,15.75,11.01,3.28,2.96,33.0,-0.425423,-0.0953,-0.000845
4,5,Pokemon Red/Pokemon Blue,1996,11.27,8.89,10.22,1.0,31.37,-0.181867,-0.0953,-0.000342


### Converting the required features and the target into numpy array

In [30]:
x = np.array(dataset[['genre','publisher','platform','Year']])
y = np.array(dataset['Global_Sales'])

### Convert y into 2D array

In [31]:
y=y.reshape(-1,1)

### Normalize the array

In [32]:
scaler=MinMaxScaler()

In [33]:
X=scaler.fit_transform(x)
Y=scaler.fit_transform(y)

### Split the data into train and test values

In [34]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.3,random_state=42)

### Using LinearRegression model of SKlearn

In [35]:
regressor = LinearRegression()

### Train the model

In [36]:
regressor.fit(xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

### Predict the output for the test values

In [37]:
y_pred =regressor.predict(xtest)

In [38]:
y_pred

array([[0.00461187],
       [0.00628227],
       [0.00683467],
       ...,
       [0.00530601],
       [0.00714737],
       [0.00479774]])

In [39]:
np.sqrt(mean_squared_error(ytest,y_pred))

0.021636872824418896

### RMSE value is 0.0216