In [54]:
import pandas as pd
import tensorflow as tf
import sklearn

In [55]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [56]:
base = pd.read_csv('games.csv')
base

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


In [57]:
base = base.drop('Other_Sales', axis = 1)
base = base.drop('Global_Sales', axis = 1)
base = base.drop('Developer', axis = 1)
base.shape

(16719, 13)

## pré-processamento

In [58]:
base.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Rating             6769
dtype: int64

In [59]:
base = base.dropna(axis=0) # obviamente não é o processo ideal

In [60]:
base.shape

(6825, 13)

In [61]:
base.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Rating             0
dtype: int64

In [62]:
base['Name'].value_counts()

Name
Need for Speed: Most Wanted                  8
Madden NFL 07                                8
LEGO Star Wars II: The Original Trilogy      8
The Sims 2                                   7
Terraria                                     7
                                            ..
Castlevania: Portrait of Ruin                1
Suzuki TT Superbikes                         1
Rumble Roses                                 1
Sherlock Holmes: The Mystery of the Mummy    1
STORM: Frontline Nation                      1
Name: count, Length: 4377, dtype: int64

In [63]:
base = base.drop('Name', axis = 1)
base.shape

(6825, 12)

In [64]:
base.columns

Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Critic_Score', 'Critic_Count', 'User_Score',
       'User_Count', 'Rating'],
      dtype='object')

In [65]:
x = base.iloc[:, [0, 1, 2, 3, 7, 8, 9, 10, 11]].values # pula a 4, 5 e 6 (sales)
x # atributos previsores

array([['Wii', 2006.0, 'Sports', ..., '8', 322.0, 'E'],
       ['Wii', 2008.0, 'Racing', ..., '8.3', 709.0, 'E'],
       ['Wii', 2009.0, 'Sports', ..., '8', 192.0, 'E'],
       ...,
       ['PC', 2014.0, 'Action', ..., '7.6', 412.0, 'M'],
       ['PC', 2011.0, 'Shooter', ..., '5.8', 43.0, 'T'],
       ['PC', 2011.0, 'Strategy', ..., '7.2', 13.0, 'E10+']], dtype=object)

In [66]:
y_na = base.iloc[:, 4].values
y_eu = base.iloc[:, 5].values
y_jp = base.iloc[:, 6].values

In [67]:
y_na

array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
       0.000e+00])

In [68]:
y_eu

array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
       1.000e-02])

In [69]:
y_jp

array([3.77, 3.79, 3.28, ..., 0.  , 0.  , 0.  ])

In [70]:
# onehotencoder para categorizar os atributos em números
base['Platform'].value_counts() # serão criadas 17 novas colunas
# ps2: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# x360: 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Platform
PS2     1140
X360     858
PS3      769
PC       651
XB       565
Wii      479
DS       464
PSP      390
GC       348
PS4      239
GBA      237
XOne     159
3DS      155
PS       150
PSV      118
WiiU      89
DC        14
Name: count, dtype: int64

In [71]:
onehotencoder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [0, 2, 3, 8])], remainder='passthrough')
x = onehotencoder.fit_transform(x).toarray()

In [72]:
x.shape # antes eram 12 colunas, agora são 303

(6825, 303)

In [73]:
x[0] # várias colunas representando os atributos que já existiam + os do onehotencoder

array([0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
       0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 

## rede neural

In [74]:
# (303 + 3) / 2 = 153
camada_entrada = Input(shape=(303,))
camada_oculta1 = Dense(units = 153, activation = 'relu')(camada_entrada) # fica conectada assim após pq não ta no sequential
camada_oculta2 = Dense(units = 153, activation = 'relu')(camada_oculta1)
# 3 saídas, mas todas conectadas com a camada oculta 2
camada_saida1 = Dense(units = 1, activation = 'linear')(camada_oculta2) # linear para a regressão
camada_saida2 = Dense(units = 1, activation = 'linear')(camada_oculta2)
camada_saida3 = Dense(units = 1, activation = 'linear')(camada_oculta2)

In [75]:
regressor = Model(
    inputs = camada_entrada,
    outputs = [camada_saida1, camada_saida2, camada_saida3]
)

In [76]:
regressor.compile(optimizer='adam', loss='mse') # mean squared error

In [77]:
regressor.fit(x, [y_na, y_eu, y_jp], epochs = 500, batch_size = 100)

Epoch 1/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - dense_7_loss: 2485.1919 - dense_8_loss: 547.6075 - dense_9_loss: 10.5287 - loss: 3043.6433 
Epoch 2/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_7_loss: 3.3206 - dense_8_loss: 4.0762 - dense_9_loss: 5.1498 - loss: 12.5491
Epoch 3/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_7_loss: 1.2230 - dense_8_loss: 1.3384 - dense_9_loss: 3.6957 - loss: 6.2576
Epoch 4/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_7_loss: 2.2738 - dense_8_loss: 3.5361 - dense_9_loss: 9.3910 - loss: 15.2044
Epoch 5/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_7_loss: 4.0083 - dense_8_loss: 8.2323 - dense_9_loss: 2.9763 - loss: 15.2235
Epoch 6/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - dense_7_loss: 18.0539 - dense_8_loss: 32.4770 - dense_9

<keras.src.callbacks.history.History at 0x1acfa55da00>

In [78]:
previsao_na, previsao_eu, previsao_jp = regressor.predict(x)

[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 960us/step


In [79]:
previsao_na, previsao_na.mean()

(array([[ 3.8203223 ],
        [ 4.779098  ],
        [ 2.9247887 ],
        ...,
        [-0.85312545],
        [ 0.06988311],
        [-0.05569345]], dtype=float32),
 0.47248262)

In [80]:
y_na, y_na.mean()

(array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
        0.000e+00]),
 0.3944835164835165)

In [81]:
from sklearn.metrics import mean_absolute_error

In [83]:
mean_absolute_error(y_na, previsao_na) # quando em na, pode retornar 0.33 para baixo ou para cima

0.33591056508755945

In [84]:
previsao_eu, previsao_eu.mean()

(array([[ 2.3008380e+00],
        [ 3.0241508e+00],
        [ 1.7545130e+00],
        ...,
        [-4.3024844e-01],
        [ 1.9413084e-03],
        [-7.8478411e-02]], dtype=float32),
 0.24857773)

In [85]:
mean_absolute_error(y_eu, previsao_eu)

0.22043360937091455

In [86]:
previsao_jp, previsao_jp.mean()

(array([[ 0.6592042 ],
        [ 0.7338054 ],
        [ 0.5159141 ],
        ...,
        [-0.1620634 ],
        [ 0.01128264],
        [-0.01581056]], dtype=float32),
 0.06069285)

In [87]:
mean_absolute_error(y_jp, previsao_jp)

0.09942758151799332