## Regression to multiple outputs 

Dataset: https://www.kaggle.com/datasets/gregorut/videogamesales

In [1]:
import pandas as pd
import tensorflow as tf 
import sklearn

In [2]:
pd.__version__, tf.__version__, sklearn.__version__

('2.2.2', '2.17.0', '1.5.1')

In [3]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
data = pd.read_csv('../data/games/games.csv')
data

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


In [5]:
data = data.drop('Other_Sales', axis=1)
data = data.drop('Global_Sales', axis=1)
data = data.drop('Developer', axis=1)

In [6]:
data.shape

(16719, 13)

In [7]:
data.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Rating             6769
dtype: int64

In [8]:
# we will delete registers (rows) with missing values (we could use other techniques that are ideal)

data = data.dropna(axis = 0) 
data.shape

(6825, 13)

In [9]:
data.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Rating             0
dtype: int64

In [10]:
data['Name'].value_counts()

Name
Need for Speed: Most Wanted                  8
Madden NFL 07                                8
LEGO Star Wars II: The Original Trilogy      8
The Sims 2                                   7
Terraria                                     7
                                            ..
Castlevania: Portrait of Ruin                1
Suzuki TT Superbikes                         1
Rumble Roses                                 1
Sherlock Holmes: The Mystery of the Mummy    1
STORM: Frontline Nation                      1
Name: count, Length: 4377, dtype: int64

In [11]:
# since name of the games are their identifier, they must be unique. It does not make sense that they are multiple

data = data.drop('Name', axis=1)

In [12]:
data.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [13]:
X = data.iloc[:, [0, 1,2 ,3, 7, 8, 9, 10, 11]].values
X

array([['Wii', 2006.0, 'Sports', ..., '8', 322.0, 'E'],
       ['Wii', 2008.0, 'Racing', ..., '8.3', 709.0, 'E'],
       ['Wii', 2009.0, 'Sports', ..., '8', 192.0, 'E'],
       ...,
       ['PC', 2014.0, 'Action', ..., '7.6', 412.0, 'M'],
       ['PC', 2011.0, 'Shooter', ..., '5.8', 43.0, 'T'],
       ['PC', 2011.0, 'Strategy', ..., '7.2', 13.0, 'E10+']], dtype=object)

In [14]:
y_na = data.iloc[:,4].values
y_eu = data.iloc[:,5].values
y_jp = data.iloc[:,6].values


In [15]:
y_na

array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
       0.000e+00])

In [16]:
y_eu

array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
       1.000e-02])

In [17]:
y_jp

array([3.77, 3.79, 3.28, ..., 0.  , 0.  , 0.  ])

In [18]:
data['Platform'].value_counts()

Platform
PS2     1140
X360     858
PS3      769
PC       651
XB       565
Wii      479
DS       464
PSP      390
GC       348
PS4      239
GBA      237
XOne     159
3DS      155
PS       150
PSV      118
WiiU      89
DC        14
Name: count, dtype: int64

In [19]:
data.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [20]:
onehotencoder = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(),[0, 2, 3, 8])], remainder='passthrough') 
X = onehotencoder.fit_transform(X)
X.shape

(6825, 303)

In [21]:
input_layer = Input(shape = (303,))
hidden_layer1 = Dense(units=153, activation='relu')(input_layer)
hidden_layer2 = Dense(units=153, activation='relu')(hidden_layer1)
output_layer1 = Dense(units=1, activation='linear')(hidden_layer2)
output_layer2 = Dense(units=1, activation='linear')(hidden_layer2)
output_layer3 = Dense(units=1, activation='linear')(hidden_layer2)

In [22]:
regressor = Model(inputs=input_layer, outputs=[output_layer1, output_layer2, output_layer3])

In [23]:
regressor.compile(optimizer='adam', loss='mse')

In [24]:
regressor.summary()

In [25]:
regressor.fit(X, [y_na, y_eu, y_jp], epochs=500, batch_size=100)

Epoch 1/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - dense_2_loss: 194.5048 - dense_3_loss: 5536.1221 - dense_4_loss: 4628.2891 - loss: 10359.9561
Epoch 2/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 2.4333 - dense_3_loss: 7.0953 - dense_4_loss: 6.3736 - loss: 15.9047
Epoch 3/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_2_loss: 2.9654 - dense_3_loss: 2.2543 - dense_4_loss: 1.6982 - loss: 6.9192
Epoch 4/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 8.0822 - dense_3_loss: 11.9110 - dense_4_loss: 6.8476 - loss: 26.8463
Epoch 5/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 1.8006 - dense_3_loss: 1.1009 - dense_4_loss: 0.8674 - loss: 3.7687
Epoch 6/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - dense_2_loss: 2.3047 - dense_3_loss: 2.6825 - dense_4

<keras.src.callbacks.history.History at 0x21cd4fd7d10>

In [26]:
predict_na, predict_eu, predict_jp = regressor.predict(X)

[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [27]:
predict_na

array([[ 1.3496561 ],
       [ 2.222465  ],
       [ 1.3055909 ],
       ...,
       [-0.02393872],
       [-0.3098631 ],
       [-0.56400853]], dtype=float32)

In [28]:
y_na

array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
       0.000e+00])

In [29]:
predict_na.mean(), y_na.mean()

(0.34872016, 0.3944835164835165)

In [30]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_na, predict_na)

0.2957715604467706

In [31]:
predict_eu.mean(), y_eu.mean()

(0.22174335, 0.23608937728937732)

In [32]:
mean_absolute_error(y_eu, predict_eu)

0.21413124408940254

In [34]:
predict_jp.mean(), y_jp.mean()

(0.038614474, 0.06415824175824175)

In [35]:
mean_absolute_error(y_jp, predict_jp)

0.09245893549582897