## Predict the Global Sales in Video Games using Linear Regression

Import all the required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import random

Read the data from the csv file

In [2]:
data = pd.read_csv("../vgsales.csv")

Get the info of the data

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
Rank            16598 non-null int64
Name            16598 non-null object
Platform        16598 non-null object
Year            16327 non-null float64
Genre           16598 non-null object
Publisher       16540 non-null object
NA_Sales        16598 non-null float64
EU_Sales        16598 non-null float64
JP_Sales        16598 non-null float64
Other_Sales     16598 non-null float64
Global_Sales    16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


#### Check if there are nan values

In [4]:
data.isnull().sum()

Rank              0
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

Year feature has 271 nan values and Publisher feature has 58 nan values

In [5]:
year = data.Year.unique() #Get array of unique values of year
year_list=[] 
for i in year:
    if not np.isnan(i):
        year_list.append(i) #If the value in the array is not nan append it to year_list

To replace nan values with random year

In [6]:
year_length=len(data['Year'])        #Get the no. of instances 
for i in range(year_length):
    if str(data['Year'][i]) == ('nan' or ' nan'):
        a = random.choice(year_list) #get a random value from the year list 
        data['Year'][i] = a          #Replace it with random values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [7]:
publisher = data.Publisher.unique() #Get array of unique values of publisher
publisher_list=[] 
for i in publisher:   
    if not pd.isnull(i):
        publisher_list.append(i) #Append non nan values to publisher_list

To eliminate nan values with random publisher value

In [8]:
publisher_length=len(data['Publisher'])  #Get the no. of instances 
for i in range(publisher_length):
    if str(data['Publisher'][i]) == ('nan' or ' nan'):
        a = random.choice(publisher_list) #get a random value from the publisher list 
        data['Publisher'][i] = a          #Replace it with random values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [9]:
data.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

There is no nan values present in the year as well as publisher

In [10]:
data.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


##### By visuallizing, the features year,genre,platform and publisher largely influences the Global Sales ,hence these features are considered

Converting the string columns into integer values using get_dummies

In [11]:
x=pd.get_dummies(data['Genre']) #convert string values to int for genre -> generates 12 col
x.head()

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0


In [12]:
y=pd.get_dummies(data['Publisher']) #convert string values to int for publisher -> generates 578 col
y.head()

Unnamed: 0,10TACLE Studios,1C Company,20th Century Fox Video Games,2D Boy,3DO,49Games,505 Games,5pb,7G//AMES,989 Sports,...,Zushi Games,bitComposer Games,dramatic create,fonfun,iWin,id Software,imageepoch Inc.,inXile Entertainment,"mixi, Inc",responDESIGN
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
z=pd.get_dummies(data['Platform']) #convert string values to int for platform -> generates 31 col
z.head()

Unnamed: 0,2600,3DO,3DS,DC,DS,GB,GBA,GC,GEN,GG,...,SAT,SCD,SNES,TG16,WS,Wii,WiiU,X360,XB,XOne
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


##### Convert the additional columns into a single column using PCA.

PCA helps in converting a large dataset into a smaller one, that still contains the same amount of information of the large dataset.This technique helps us reduce the larger dimensions of the data into a small one. 

In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1)  #reduce to 1 column

features1 = pca.fit_transform(x) #Fit the model with x(Genre) and apply the dimensionality reduction on x.
features1=pd.DataFrame(features1,columns=['genre']) #Create a genre column

features2 = pca.fit_transform(y) #Fit the model with y(publisher) and apply the dimensionality reduction on y.
features2=pd.DataFrame(features2,columns=['publisher']) #Create a publisher column

features3 = pca.fit_transform(z) #Fit the model with z(platform) and apply the dimensionality reduction on z.
features3=pd.DataFrame(features3,columns=['platform']) #Create a platform column

In [15]:
data.Year = data.Year.astype(int) #Convert year to int from float
df=data.drop(['Genre','Publisher','Platform'],axis=1)  #Drop the existing Genre ,publisher,platform columns 
dataset=pd.concat([df,features1,features2,features3],axis=1) #Concate the 3 new columns with the data
dataset.head()

Unnamed: 0,Rank,Name,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,genre,publisher,platform
0,1,Wii Sports,2006,41.49,29.02,3.77,8.46,82.74,-0.425423,-0.095332,-0.000845
1,2,Super Mario Bros.,1985,29.08,3.58,6.81,0.77,40.24,-0.129749,-0.095332,-0.000343
2,3,Mario Kart Wii,2008,15.85,12.88,3.79,3.31,35.82,-0.156853,-0.095332,-0.000845
3,4,Wii Sports Resort,2009,15.75,11.01,3.28,2.96,33.0,-0.425423,-0.095332,-0.000845
4,5,Pokemon Red/Pokemon Blue,1996,11.27,8.89,10.22,1.0,31.37,-0.181867,-0.095332,-0.000343


### Considering the required features 

In [25]:
x = dataset[['genre','publisher','platform','Year']]
y = dataset['Global_Sales']

### Normalizing the array

In [26]:
x=((x-x.min())/(x.max()-x.min()))
y=((y-y.min())/(y.max()-y.min()))

### Taking the required features as numpy array

In [27]:
x1= np.array(x['genre'])
x2=np.array(x['publisher'])
x3=np.array(x['platform'])
x4=np.array(x['Year'])

y1= np.array(y)

### Split the data into train and test values

In [28]:
x1train,x1test,x2train,x2test,x3train,x3test,x4train,x4test,ytrain,ytest= train_test_split(x1,x2,x3,x4,y1,test_size=0.3)

### Defining the equation for our model

In [29]:
def hypothesis(a,x1,b,x2,c,x3,d,x4,e):
    return a * x1 +b * x2 + c * x3 + d * x4 + e #returns hypothesis eq for our model

### Defining the loss function or error equation for our model

In [30]:
def error(a,x1,b,x2,c,x3,d,x4,e,y):
    err=0     #holds the error value
    m=len(x1) #how many data_pts are present is stored
    
    for i in range(m):
        err += np.power((hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i]),2)
        #err=summation(h-y)2
        
    return (1/(2*m)) *err #loss fn

### Defining step-gradient in order to reach the minima

In [31]:
def step_gradient(a,x1,b,x2,c,x3,d,x4,e,y,learning_rate):
    grad_a=0
    grad_b=0
    grad_c=0
    grad_d=0
    grad_e=0
    
    m= len(x1)
    
    for i in range(m):
        grad_a += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x1[i] #differentaition wrt a
        grad_b += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x2[i] #diff wrt b
        grad_c += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x3[i] #diff wrt c
        grad_d += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])*x4[i] #diff wrt d
        grad_e += (1/m) * (hypothesis(a,x1[i],b,x2[i],c,x3[i],d,x4[i],e)-y[i])       #diff wrt e
    
    
    a=a-grad_a*learning_rate
    b=b-grad_b*learning_rate
    c=c-grad_c*learning_rate
    d=d-grad_d*learning_rate
    e=e-grad_e*learning_rate

    return a,b,c,d,e

In [32]:
def descend(init_a,x1,init_b,x2,init_c,x3,init_d,x4,init_e,y,learning_rate,iteration):
    a=init_a
    b=init_b
    c=init_c
    d=init_d
    e=init_e
    
    for i in range(iterations): #iterations--> hw many times we step
        err=error(a,x1,b,x2,c,x3,d,x4,e,y)
        
        if i% 1000 == 0:
            print(f"Error: {np.sqrt(err)}, a: {a}, b:{b}, c:{c},d:{d},e:{e}")
            #print error at every 1000 step
            #sqrt of mean error
            
        a,b,c,d,e =step_gradient(a,x1,b,x2,c,x3,d,x4,e,y,learning_rate)
        
    return a,b,c,d,e

In [33]:
a=0
b=0
c=0
d=0
e=0
learning_rate = 0.01
iterations = 10000

In [34]:
final_a, final_b,final_c,final_d,final_e = descend(a,x1train,b,x2train,c,x3train,d,x4train,e,ytrain,learning_rate,iterations)

Error: 0.013774397638902834, a: 0, b:0, c:0,d:0,e:0
Error: 0.013052415811936361, a: 0.00031889386328975375, b:0.001975750539265329, c:0.0006295882865820252,d:0.0011500524472546941,e:0.004617384972082868
Error: 0.013041664252427842, a: 5.931598468866135e-05, b:0.002632193796048584, c:1.4646427026073895e-05,d:0.00017100985942783913,e:0.00557808951221241
Error: 0.013035758121830585, a: -7.777878365041927e-06, b:0.0029283614461508498, c:-0.00032918378838163806,d:-0.0007059669442143553,e:0.006318597994858807
Error: 0.013031912564006395, a: -1.7559682063084542e-05, b:0.003056366379164188, c:-0.0005186872642723467,d:-0.001475217849557977,e:0.00691362700918801
Error: 0.013029222503246242, a: -1.1793802148546561e-05, b:0.003105925230064707, c:-0.0006220121886864739,d:-0.002143114274871257,e:0.007403361062462238
Error: 0.013027287357264012, a: -2.8486446020692898e-06, b:0.003119501190146379, c:-0.00067769561755985,d:-0.0027198293713107894,e:0.007812163547106965
Error: 0.013025880316331272, a: 5.

In [35]:
final_a,final_b,final_c,final_d,final_e

(2.632841659452694e-05,
 0.0030880315051240873,
 -0.000732894425610479,
 -0.004321991456298786,
 0.008905944185403207)

3 test values of target

In [36]:
ytest[:3]

array([0.10165599, 0.00072525, 0.        ])

Comparing the test values with the obtained values

In [37]:
hypothesis(final_a,x1test[:3],final_b,x2test[:3],final_c,x3test[:3],final_d,x4test[:3],final_e)

array([0.00585723, 0.00697629, 0.00575041])

In [38]:
np.sqrt(error(final_a,x1train,final_b,x2train,final_c,x3train,final_d,x4train,final_e,ytrain))

0.013023552311514875

In [39]:
np.sqrt(error(final_a,x1test,final_b,x2test,final_c,x3test,final_d,x4train,final_e,ytest))

0.013813121794282178

### RMSE value for train data is 0.0130
### RMSE value for test data is 0.0138