## Introduction #

In this project, we try to predict video game sales from different features such as genre, publishers and critic score.
By trying different training model, we try to find the best model for sale prediction



In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error, median_absolute_error
from sklearn.svm import SVR
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import os
import statistics

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def median_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = statistics.median(y_true), statistics.median(y_pred)
    return abs((y_true - y_pred) / y_true) * 100

# Loading data
csv_path = "data\Video_Games_Sales_as_at_22_Dec_2016.csv"

# Read csv
df = pd.read_csv(csv_path)
# print(df.head())

# using dummy coding to expand the features
df2 = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher', 'Developer'])
# df2 = df
# print(df2.head())

# Drop row which has no critic score (NaN)
df3 = df2[(df2.Critic_Score.notnull())]
# print(df3.head())


# shuffle and dropped unused features
df3 = shuffle(df3)
drop_column = ['Name', 'Year_of_Release', 'User_Score', 'User_Count']
df4 = df3.drop(drop_column, 1)
# print(df4.head())
# print(df4.shape)

# standardization of values
scaler = StandardScaler()
df4[['Critic_Score', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']] = scaler.fit_transform(df4[['Critic_Score', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']])

# print(df4.head())

# Predict the global sales
y = df4['Global_Sales']
X = df4.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Rating'], 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

# Model 1: LinearRegression
regr = LinearRegression()
regr.fit(X_train, y_train)
# print(regr.score(X_test, y_test))
y_predict = regr.predict(X_test)
print("r2_score :" + str(r2_score(y_test, y_predict)))
print("mean_absolute_error :" + str(mean_absolute_error(y_test, y_predict)))
print("median_absolute_error :" + str(median_absolute_error(y_test, y_predict)))
print("mean_absolute_percentage_error :" + str(mean_absolute_percentage_error(y_test, y_predict)))
print("median_absolute_percentage_error :" + str(median_absolute_percentage_error(y_test, y_predict)))
# Model 2: Decision tree (Much better)
# Testing max_depth, around 6-8 will be the best

'''
for i in range(3, 100):
    regr_1 = DecisionTreeRegressor(max_depth=i)
    regr_1.fit(X_train, y_train)
    y_predict_1 = regr_1.predict(X_test)
    print("Decision tree with depth" + str(i))
    print(r2_score(y_test, y_predict_1))
    print(mean_absolute_error(y_test, y_predict_1))
    print(mean_absolute_percentage_error(y_test, y_predict_1))
'''

regr_1 = DecisionTreeRegressor(max_depth=6)
regr_1.fit(X_train, y_train)
y_predict_1 = regr_1.predict(X_test)
print("Decision tree with depth 6")
print("r2_score :" + str(r2_score(y_test, y_predict_1)))
print("mean_absolute_error :" + str(mean_absolute_error(y_test, y_predict_1)))
print("median_absolute_error :" + str(median_absolute_error(y_test, y_predict_1)))
print("mean_absolute_error :" + str(mean_absolute_percentage_error(y_test, y_predict_1)))
print("median_absolute_percentage_error :" + str(median_absolute_percentage_error(y_test, y_predict_1)))
'''
# Model 3: Adaboost (Time-consuming)
rng = np.random.RandomState(1)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=50, random_state=rng)
regr_2.fit(X_train, y_train)
y_predict_2 = regr_2.predict(X_test)
print("Decision tree with depth 4")
print(r2_score(y_test, y_predict_2))
print(mean_absolute_error(y_test, y_predict_2))
print(mean_absolute_percentage_error(y_test, y_predict_2))
'''

# Model 4: SVM (Time-consuming) (Best: Poly)
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_poly = SVR(kernel='poly', C=1e3, degree=2)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_test)
y_poly = svr_poly.fit(X_train, y_train).predict(X_test)
print("RBF Kernel")
print("r2_score :" + str(r2_score(y_test, y_rbf)))
print("mean mean_absolute_error :" + str(mean_absolute_error(y_test, y_rbf)))
print("median_absolute_error :" + str(median_absolute_error(y_test, y_rbf)))
print("mean_absolute_percentage_error :" + str(mean_absolute_percentage_error(y_test, y_rbf)))
print("median_absolute_percentage_error :" + str(median_absolute_percentage_error(y_test, y_rbf)))
print("Poly Kernel")
print("r2_score :" + str(r2_score(y_test, y_poly)))
print("mean_absolute_error :" + str(mean_absolute_error(y_test, y_poly)))
print("median_absolute_error :" + str(median_absolute_error(y_test, y_poly)))
print("mean_absolute_error :" + str(mean_absolute_percentage_error(y_test, y_poly)))
print("median_absolute_percentage_error :" + str(median_absolute_percentage_error(y_test, y_poly)))

# Model 5: SGDRegressor
clf = SGDRegressor()
clf.fit(X_train, y_train)
y_predict_sgd = clf.predict(X_test)
print("SGDRegressor")
print("r2_score :" + str(r2_score(y_test, y_predict_sgd)))
print("mean_absolute_error :" + str(mean_absolute_error(y_test, y_predict_sgd)))
print("median_absolute_error :" + str(median_absolute_error(y_test, y_predict_sgd)))
print("mean_absolute_error :" + str(mean_absolute_percentage_error(y_test, y_predict_sgd)))
print("median_absolute_percentage_error :" + str(median_absolute_percentage_error(y_test, y_predict_sgd)))


r2_score :-3.86165165838e+15
mean_absolute_error :5465554.63252
median_absolute_error :0.242153632785
mean_absolute_percentage_error :2565835351.01
median_absolute_percentage_error :77.0958015872
Decision tree with depth 6
r2_score :0.180266015023
mean_absolute_error :0.319192947168
median_absolute_error :0.164293610422
mean_absolute_error :216.50025767
median_absolute_percentage_error :26.9806495169
RBF Kernel
r2_score :0.0838527977828
mean mean_absolute_error :0.395314410669
median_absolute_error :0.210548375567
mean_absolute_percentage_error :415.707459601
median_absolute_percentage_error :37.8029388689
Poly Kernel
r2_score :0.214718042346
mean_absolute_error :0.258928042061
median_absolute_error :0.104047361415
mean_absolute_error :205.655972632
median_absolute_percentage_error :16.9266284649




SGDRegressor
r2_score :-1.44472385596e+19
mean_absolute_error :3365163380.89
median_absolute_error :2669242213.01
mean_absolute_error :3.5798067476e+12
median_absolute_percentage_error :550427378388.0


## Result #

After comparing different training model, the SVM with poly kernel performs the best (with a median_absolute_percentage_error around 17%)


In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.svm import SVR
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import os

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def median_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = statistics.median(y_true), statistics.median(y_pred)
    return abs((y_true - y_pred) / y_true) * 100

# Loading data
csv_path = "data\Video_Games_Sales_as_at_22_Dec_2016.csv"

# Read csv
df = pd.read_csv(csv_path)

# print(df.head())
print("\n\n\n")
# Use LabelEncoder to change category into number
# Then use OneHotEncoder(For non-string)/get_dummies(For string) to add column to the dataframe
# Test using platform column
df2 = pd.get_dummies(df, columns=['Platform', 'Genre', 'Publisher', 'Developer'])
# df2 = df
# print(df2.head())

# Drop row which has no critic score (NaN)
df3 = df2[(df2.Critic_Score.notnull())]
# print(df3.head())

df3 = shuffle(df3)
drop_column = ['Name', 'Year_of_Release', 'User_Score', 'User_Count']
df4 = df3.drop(drop_column, 1)
# print(df4.head())
# print(df4.shape)
feature_scaler = StandardScaler()
df4[['Critic_Score']] = feature_scaler.fit_transform(df4[['Critic_Score']])


# print(df4.head())

# Predict the global sales


region_list = ['NA_Sales','EU_Sales','JP_Sales','Other_Sales']

for region in region_list:

    y_scaler = StandardScaler()
    y_scaler.fit(df4[[region]])
    df4[[region]] = y_scaler.transform(df4[[region]])

    y = df4[region]
    X = df4.drop(['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Rating'], 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

    svr_poly = SVR(kernel='poly', C=1e3, degree=2)
    svr_poly.fit(X_train, y_train)
    predict = svr_poly.predict((X_test.values[0]).reshape(1,-1))

    actual = y_scaler.inverse_transform(y_test.values[0].reshape(1,-1))[0][0]

    predict = y_scaler.inverse_transform(predict)[0]


    print("Prediction of "+region+ " : " + str(predict))
    print("Actual of "+region+ " : " + str(actual))
    # print("error :" + str(np.abs((predict - actual)) * 100 / actual))






Prediction of NA_Sales : 0.0511291483581
Actual of NA_Sales : 0.02
Prediction of EU_Sales : 0.120475348514
Actual of EU_Sales : 0.03
Prediction of JP_Sales : 0.0197422274917
Actual of JP_Sales : 0.0
Prediction of Other_Sales : 0.0301439347881
Actual of Other_Sales : 0.0
