In [6]:
# Importing the required libraries
import pandas as pd
import numpy as np

# Load the raw sales table and discard the columns this analysis does not use.
# Later cells pick columns by position, so changing this list shifts those
# positions.
# TODO(review): the original note here read "Add year_of_release" — presumably
# a reminder to keep that feature; confirm before changing the list.
unused_features = ['Year_of_Release', 'Developer', 'Publisher', 'Platform']
dataset = pd.read_csv('Video_Games_Sales_as_at_22_Dec_2016.csv').drop(columns=unused_features)

# Report, per remaining column, how many entries are missing.
missing_per_column = dataset.isna().sum()
print('Feature name || Total missing values')
print(missing_per_column)

Feature name || Total missing values
Name               2
Genre              2
NA_Sales           0
EU_Sales           0
JP_Sales           0
Other_Sales        0
Global_Sales       0
Critic_Score    8582
Critic_Count    8582
User_Score      9129
User_Count      9129
Rating          6769
dtype: int64


In [7]:
# Separate the model inputs from the regression target.  The target is
# selected by name instead of by a hard-coded position, so the split stays
# correct even if the set of columns kept in `dataset` changes.
target_column = 'Global_Sales'
X = dataset.drop(columns=[target_column]).values
y = dataset[[target_column]].values  # double brackets keep y 2-D: (n_rows, 1)

# NOTE(review): the regional sales columns (NA/EU/JP/Other) stay in X.
# Presumably they add up to Global_Sales, which would leak the target into
# the features — confirm whether that is intended.

# Splitting the dataset into Train and Test sets (70/30, fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Column 0 of the feature arrays is the game's name: keep it aside for
# reporting and remove it from the model inputs.
games_in_training_set, X_train = X_train[:, 0], X_train[:, 1:]
games_in_test_set, X_test = X_test[:, 0], X_test[:, 1:]

In [17]:
# Display the test feature matrix.
# NOTE(review): this cell's execution count (17) is higher than that of the
# imputation and encoding cells that follow it, and the saved output appears
# to show imputed/encoded values — presumably it was run after those cells.
# Re-run the notebook top to bottom to confirm.
X_test

array([[0.0, 0.0, 0.0, ..., 50.0, 9.1, 21.0],
       [1.0, 0.0, 0.0, ..., 17.0, 5.7, 18.0],
       [0.0, 0.0, 0.0, ..., 44.0, 5.9, 27.0],
       ...,
       [0.0, 0.0, 0.0, ..., 26.525275494140285, 7.3, 4.0],
       [0.0, 0.0, 0.0, ..., 19.0, 8.0, 50.0],
       [0.0, 0.0, 1.0, ..., 25.0, 7.5, 66.0]], dtype=object)

In [18]:
# Display the training feature matrix.
# NOTE(review): this cell's execution count (18) is higher than that of the
# imputation and encoding cells that follow it, and the saved output appears
# to show imputed/encoded values — presumably it was run after those cells.
# Re-run the notebook top to bottom to confirm.
X_train

array([[1.0, 0.0, 0.0, ..., 26.525275494140285, 7.127238525206922,
        160.46444695259595],
       [0.0, 0.0, 0.0, ..., 88.0, 8.5, 1184.0],
       [0.0, 0.0, 0.0, ..., 26.525275494140285, 7.127238525206922,
        160.46444695259595],
       ...,
       [0.0, 0.0, 0.0, ..., 18.0, 8.6, 236.0],
       [0.0, 0.0, 0.0, ..., 30.0, 7.7, 43.0],
       [0.0, 0.0, 0.0, ..., 26.525275494140285, 7.127238525206922,
        160.46444695259595]], dtype=object)

In [9]:
from sklearn.impute import SimpleImputer

# Positions 5-8 of the feature arrays are Critic_Score, Critic_Count,
# User_Score and User_Count (the numeric columns that reported missing values
# in the summary above, after the target and the name column were removed).
score_columns = [5, 6, 7, 8]

# Replace NaNs with the column mean.  The means are learned from the training
# rows only and then reused for the test rows.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train[:, score_columns])
X_train[:, score_columns] = imputer.transform(X_train[:, score_columns])
X_test[:, score_columns] = imputer.transform(X_test[:, score_columns])

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns at positions 0 and 9 (Genre and
# Rating, going by the column listing printed earlier) and pass every other
# column through unchanged.
#
# handle_unknown='ignore': a category that occurs only in the test split is
# encoded as all zeros instead of making transform() raise a ValueError, so
# the pipeline no longer depends on a lucky train/test split.
#
# This cell overwrites X_train / X_test, so it must run exactly once per
# kernel session; running it again would re-encode already encoded arrays.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0, 9])],
    remainder='passthrough')
X_train = ct.fit_transform(X_train)  # categories are learned from training data only
X_test = ct.transform(X_test)


In [13]:
from xgboost import XGBRegressor

# Gradient-boosted regressor with 200 boosting rounds and a learning rate of
# 0.08; every other hyper-parameter is left at the library default.
# fit() hands back the estimator itself, so construction and training chain.
model = XGBRegressor(
    learning_rate=0.08,
    n_estimators=200,
).fit(X_train, y_train)

# Show the fitted estimator's configuration as the cell output.
model

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.08, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [14]:
# Predicting test set results
y_pred = model.predict(X_test)

# Keep both arrays as column vectors, as before, because later cells read
# `y_pred` and `games_in_test_set` in this shape.  reshape(-1, 1) is
# idempotent, so re-running the cell is safe.
games_in_test_set = games_in_test_set.reshape(-1, 1)
y_pred = y_pred.reshape(-1, 1)

# Build the comparison table column by column.  Concatenating the (object)
# name array with the float arrays would turn everything into `object` dtype;
# separate columns keep the two sales columns numeric so they can be sorted,
# described and used in arithmetic.
predictions = pd.DataFrame({
    'Name': games_in_test_set.ravel(),
    'Predicted_Global_Sales': y_pred.ravel(),
    'Actual_Global_Sales': np.asarray(y_test, dtype=float).ravel(),
})

In [15]:
# Display predicted vs. actual global sales for each game in the test set.
predictions

Unnamed: 0,Name,Predicted_Global_Sales,Actual_Global_Sales
0,R-Type Final,0.186028,0.19
1,The Terminator: Dawn of Fate,0.41634,0.41
2,Dead to Rights: Retribution,0.293324,0.28
3,Skylanders SWAP Force,2.169221,2.15
4,DiRT,1.161763,1.05
...,...,...,...
5011,Dynasty Warriors,0.457753,0.51
5012,Transformers: Dark of the Moon,0.171426,0.17
5013,Brunswick Pro Bowling,0.204492,0.21
5014,Valentino Rossi: The Game,0.067381,0.08


In [16]:
from sklearn.metrics import r2_score, mean_squared_error
import math

# Store the metric under its own name.  Assigning the result back to
# `r2_score` would replace the imported function with a float, and re-running
# this cell would then fail with "'float' object is not callable".
r2 = r2_score(y_test, y_pred)

# RMSE is the square root of the mean squared error, in the same units as
# the target.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))

print(f"r2 score of the model : {r2:.3f}")
print(f"Root Mean Squared Error of the model : {rmse:.3f}")


r2 score of the model : 0.732
Root Mean Squared Error of the model : 0.743
