In [1]:
import pandas as pd
import tensorflow as tf
import sklearn

In [7]:
from tensorflow.keras.layers import Dense, Dropout, Activation, Input
from tensorflow.keras.models import Model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [13]:
bd = pd.read_csv('games.csv')
bd

Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.36,28.96,3.77,8.45,82.53,76.0,51.0,8,322.0,Nintendo,E
1,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24,,,,,,
2,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.68,12.76,3.79,3.29,35.52,82.0,73.0,8.3,709.0,Nintendo,E
3,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.61,10.93,3.28,2.95,32.77,80.0,73.0,8,192.0,Nintendo,E
4,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16714,Samurai Warriors: Sanada Maru,PS3,2016.0,Action,Tecmo Koei,0.00,0.00,0.01,0.00,0.01,,,,,,
16715,LMA Manager 2007,X360,2006.0,Sports,Codemasters,0.00,0.01,0.00,0.00,0.01,,,,,,
16716,Haitaka no Psychedelica,PSV,2016.0,Adventure,Idea Factory,0.00,0.00,0.01,0.00,0.01,,,,,,
16717,Spirits & Spells,GBA,2003.0,Platform,Wanadoo,0.01,0.00,0.00,0.00,0.01,,,,,,


Tratamento de dados, excluindo colunas que não nos interessa.

In [15]:
bd = bd.drop('Other_Sales', axis = 1)
bd = bd.drop('Global_Sales', axis = 1)
bd = bd.drop('Developer', axis = 1)

In [17]:
bd.shape

(16719, 13)

Quantidade de valores nulos em cada coluna.

In [22]:
bd.isnull().sum()

Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Rating             6769
dtype: int64

In [29]:
bd = bd.dropna(axis = 0)
bd.shape

(6825, 13)

In [31]:
bd.isnull().sum()

Name               0
Platform           0
Year_of_Release    0
Genre              0
Publisher          0
NA_Sales           0
EU_Sales           0
JP_Sales           0
Critic_Score       0
Critic_Count       0
User_Score         0
User_Count         0
Rating             0
dtype: int64

Mostrando que como nome é um valor único ele é dispensavel para a avalição da rede

In [35]:
bd['Name'].value_counts()

Name
Need for Speed: Most Wanted                  8
Madden NFL 07                                8
LEGO Star Wars II: The Original Trilogy      8
The Sims 2                                   7
Terraria                                     7
                                            ..
Castlevania: Portrait of Ruin                1
Suzuki TT Superbikes                         1
Rumble Roses                                 1
Sherlock Holmes: The Mystery of the Mummy    1
STORM: Frontline Nation                      1
Name: count, Length: 4377, dtype: int64

In [40]:
bd = bd.drop('Name', axis = 1)

Criação da base de dados para podermos treinar a rede

In [50]:
X = bd.iloc[:, [0, 1, 2, 3, 7, 8, 9, 10, 11]].values
print(X)

[['Wii' 2006.0 'Sports' ... '8' 322.0 'E']
 ['Wii' 2008.0 'Racing' ... '8.3' 709.0 'E']
 ['Wii' 2009.0 'Sports' ... '8' 192.0 'E']
 ...
 ['PC' 2014.0 'Action' ... '7.6' 412.0 'M']
 ['PC' 2011.0 'Shooter' ... '5.8' 43.0 'T']
 ['PC' 2011.0 'Strategy' ... '7.2' 13.0 'E10+']]


Os valores que queremos que a rede neural chegue perto

In [54]:
y_na = bd.iloc[:, 4].values
y_eu = bd.iloc[:, 5].values
y_jp = bd.iloc[:, 6].values
print(y_na, y_eu, y_jp)

[4.136e+01 1.568e+01 1.561e+01 ... 0.000e+00 1.000e-02 0.000e+00] [2.896e+01 1.276e+01 1.093e+01 ... 1.000e-02 0.000e+00 1.000e-02] [3.77 3.79 3.28 ... 0.   0.   0.  ]


Vai ser criado novas 17 colunas para podermos codificar qual é a plataforma

In [62]:
bd['Platform'].value_counts()

Platform
PS2     1140
X360     858
PS3      769
PC       651
XB       565
Wii      479
DS       464
PSP      390
GC       348
PS4      239
GBA      237
XOne     159
3DS      155
PS       150
PSV      118
WiiU      89
DC        14
Name: count, dtype: int64

Aqui ele vai codificar as colunas (0, 2, 3, 8) colunas que não tem valores númericos.

In [68]:
onehotencoder = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [0,2,3,8])], remainder='passthrough')
X = onehotencoder.fit_transform(X).toarray()

In [72]:
X.shape

(6825, 303)

Criação das camadas de entrada, ocultas e a de saída.

In [76]:
camada_entrada = Input(shape = (303,))
camada_oculta1 = Dense(units = 153, activation = 'relu')(camada_entrada)
camada_oculta2 = Dense(units = 153, activation = 'relu')(camada_oculta1)
camada_saida1 = Dense(units = 1, activation = 'linear')(camada_oculta2)
camada_saida2 = Dense(units = 1, activation = 'linear')(camada_oculta2)
camada_saida3 = Dense(units = 1, activation = 'linear')(camada_oculta2)

Criação da rede neural, compilação e treinamento.

In [80]:
regressor = Model(inputs = camada_entrada, outputs = [camada_saida1, camada_saida2, camada_saida3])

In [82]:
regressor.compile(optimizer = 'adam', loss = 'mse')

In [96]:
regressor.fit(X, [y_na, y_eu, y_jp], epochs = 500, batch_size = 100)

Epoch 1/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - dense_4_loss: 1.0316 - dense_5_loss: 0.5492 - dense_6_loss: 0.3121 - loss: 1.8926
Epoch 2/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - dense_4_loss: 1.1996 - dense_5_loss: 0.5867 - dense_6_loss: 0.1470 - loss: 1.9337
Epoch 3/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - dense_4_loss: 0.6666 - dense_5_loss: 0.3722 - dense_6_loss: 0.1328 - loss: 1.1719
Epoch 4/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - dense_4_loss: 0.5888 - dense_5_loss: 0.2913 - dense_6_loss: 0.1709 - loss: 1.0511
Epoch 5/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - dense_4_loss: 0.9799 - dense_5_loss: 0.5283 - dense_6_loss: 0.2145 - loss: 1.7231
Epoch 6/500
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - dense_4_loss: 0.6120 - dense_5_loss: 0.3483 - dense_6_loss: 0.1221 -

<keras.src.callbacks.history.History at 0x156262c0b90>

Retorno das camadas de saída.

In [104]:
previsao_na, previsao_eu, previsao_jp = regressor.predict(X)

[1m214/214[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


Comparação das previsões da rede neural com a realidade das vendas na america do norte.

In [106]:
previsao_na, previsao_na.mean()

(array([[25.392494  ],
        [27.000113  ],
        [15.943474  ],
        ...,
        [ 0.3167705 ],
        [-0.12784988],
        [-0.18779796]], dtype=float32),
 0.5309658)

In [108]:
y_na, y_na.mean()

(array([4.136e+01, 1.568e+01, 1.561e+01, ..., 0.000e+00, 1.000e-02,
        0.000e+00]),
 0.3944835164835165)

In [110]:
from sklearn.metrics import mean_absolute_error

Quer dizer que a previsão pode retornar 0,35 para mais ou para menos do valor correto

In [114]:
mean_absolute_error(y_na, previsao_na)

0.3504015702647604

Agora o mesmo comparativo para as outras variaveis, europa e japao.

In [118]:
previsao_eu, previsao_eu.mean()

(array([[18.1199    ],
        [19.224445  ],
        [11.581117  ],
        ...,
        [ 0.22224909],
        [-0.0728009 ],
        [-0.10590321]], dtype=float32),
 0.3278687)

In [120]:
y_eu, y_eu.mean()

(array([2.896e+01, 1.276e+01, 1.093e+01, ..., 1.000e-02, 0.000e+00,
        1.000e-02]),
 0.23608937728937732)

In [122]:
mean_absolute_error(y_eu, previsao_eu)

0.24266700737685942

In [124]:
previsao_jp, previsao_jp.mean()

(array([[ 3.2526405 ],
        [ 4.018067  ],
        [ 3.3534102 ],
        ...,
        [ 0.05653413],
        [-0.02208682],
        [-0.03231622]], dtype=float32),
 0.09611185)

In [126]:
y_jp, y_jp.mean()

(array([3.77, 3.79, 3.28, ..., 0.  , 0.  , 0.  ]), 0.06415824175824175)

In [128]:
mean_absolute_error(y_jp, previsao_jp)

0.10673783673299099