In [1]:
import numpy as np
import pandas as pd
from re import sub
from time import time
import pickle

from comet_ml import Experiment

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras_tqdm import TQDMNotebookCallback as ktqdm
from keras.utils import normalize
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import TensorBoard
from keras.optimizers import Adam, SGD
from keras.regularizers import l1, l2
from keras.initializers import RandomUniform, RandomNormal
from keras.layers.advanced_activations import LeakyReLU

from tensorflow.nn import relu, softmax

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, r2_score

import scipy.stats as st

import seaborn as sns

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# experiment = Experiment(api_key="CndJ3YmXyZcxmsV8EccJEuu9C",
#                         project_name="NN_Thesis", workspace="paologalligit")

In [3]:
df = pd.read_csv('fifa19.csv')

In [4]:
df.drop(columns=['Unnamed: 0', 'ID', 'Photo', 'Flag', 'Club Logo', 'Real Face', 'Preferred Foot',
                 'Body Type', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until'],inplace=True)

## Obiettivo: predire valore dei giocatori

Pre-processing: convertire value, wage e release clause da string a float

In [5]:
curs=["Release Clause", "Value", "Wage"]

for cur in curs:
    def curr2val(x):
        x = str(x).replace('€', '')
        if 'K' in x: x = float(str(x).replace('K', '')) * 1000
        else: x = float(str(x).replace('M', '')) * 1000000
        return x
    df[cur] = df[cur].apply(curr2val)
    

Individuare eventuali outlier nella colonna value

In [6]:
def detect_outlier(data, threshold = 3):
    outliers=[]
    mean = np.mean(data)
    std = np.std(data)
    
    for y in data:
        score= (y - mean) / std 
        if np.abs(score) > threshold:
            outliers.append(y)
    return outliers

In [7]:
min_out = min(detect_outlier(df['Value'], threshold = 2))

df = df[df['Value'] < min_out] 
df = df[df['Value'] > 0]

Conversione in interi per le altre label

In [8]:
cols=["LS", "ST", "RS", "LW", "LF", "CF", "RF", "RW","LAM", "CAM", "RAM", "LM", "LCM", "CM", "RCM", "RM", "LWB", "LDM","CDM", "RDM", "RWB", "LB", "LCB", "CB", "RCB", "RB"]
for col in cols:
    df[col] = df[col].str[:-2]
    df[col] = df[col].astype(float)

In [9]:
df['Height'] = df['Height'].str.replace("'",'.')
df['Height'] = df['Height'].astype(float)

df['Weight'] = df['Weight'].str[:-3]
df['Weight'] = df['Weight'].astype(float)

Calcolo correlazione tra i valori per scegliere colonne significative

In [10]:
df_corr = df.corr()

# fig = plt.figure(figsize=(50,20))
# ax = fig.add_subplot(111)
# cax = ax.matshow(df_corr,cmap='coolwarm', vmin=-1, vmax=1)
# fig.colorbar(cax)

# ticks = np.arange(0,len(df_corr.columns),1)
# ax.set_xticks(ticks)
# ax.set_xticklabels(df_corr.columns)
# plt.xticks(rotation=90)
# ax.set_yticks(ticks)
# ax.set_yticklabels(df_corr.columns)

# plt.show()

In [11]:
labels = []
for label in df_corr:
#     if df_corr['Value'][label] < 0 or df_corr['Value'][label] > 0.5: labels.append(label)
    if df_corr['Value'][label] > 0.55: labels.append(label)
        
df_flt = df[labels]        
df_flt.head()      

Unnamed: 0,Overall,Potential,Value,Wage,LCM,CM,RCM,Reactions,Release Clause
41,88,88,4000000.0,77000.0,,,,79.0,7400000.0
102,85,85,9000000.0,38000.0,70.0,70.0,70.0,85.0,15300000.0
108,85,85,9000000.0,57000.0,63.0,63.0,63.0,83.0,17100000.0
152,84,84,4200000.0,95000.0,63.0,63.0,63.0,80.0,6900000.0
201,83,83,13000000.0,70000.0,,,,78.0,24700000.0


Mescolo le righe del dataset

In [12]:
df_flt = df_flt.sample(frac=1)

train_slice = int(len(df_flt) * 0.8)

train = df_flt[:train_slice]
test = df_flt[train_slice:]

In [13]:
y_train = train.loc[:, ['Value']]
X_train = train.drop(columns='Value')

y_test = test.loc[:, ['Value']]
X_test = test.drop(columns='Value')

Sostiuisco eventuali valori nan con la media della colonna

In [14]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X_train)
X_full = imputer.transform(X_train)

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(y_train)
y_full = imputer.transform(y_train)

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X_test)
X_test_full = imputer.transform(X_test)

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(y_test)
y_test_full = imputer.transform(y_test)

Scalo i valori, sia per i caratteri che per il target

In [15]:
# scaler = RobustScaler()
# scaler = scaler.fit(X_train)
# X_train = scaler.transform(X_train)

# X_train_scaled = preprocessing.scale(X_train)
scaler = StandardScaler().fit(X_full)
X_scaled = scaler.transform(X_full)

scaler_train = StandardScaler().fit(y_full)
y_scaled = scaler_train.transform(y_full) 
# X_train_scaled, X_test_scaled

scaler = StandardScaler().fit(X_test_full)
X_test_scaled = scaler.transform(X_test_full)

### Salvo i dati di training e testing

In [16]:
with open('fifa_training_X', 'wb') as file:
    pickle.dump(X_scaled, file)
    
with open('fifa_training_y', 'wb') as file:
    pickle.dump(y_scaled, file)    

In [17]:
def coeff_determination(y_test, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square( y_test-y_pred ))
    SS_tot = K.sum(K.square( y_test - K.mean(y_test) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()))

In [18]:
def build_nn(neurons):
    model = Sequential()
    # Adding the input layer
    model.add(Dense(256, input_dim = neurons, activation='relu'))#, kernel_regularizer=l2(0.001)))#, kernel_initializer=RandomNormal(mean=0.0, stddev=0.05, seed=None)))
#     model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.1))
    
    # Adding the first hidden layer
    model.add(Dense(128, activation='relu'))#, kernel_regularizer=l2(0.001)))#, kernel_initializer=RandomNormal(mean=0.0, stddev=0.05, seed=None)))
#     model.add(LeakyReLU(alpha=0.01))
    model.add(Dropout(0.1))
    
    # Adding the second hidden layer
    model.add(Dense(64, activation='relu'))#, kernel_regularizer=l2(0.001)))#, kernel_initializer=RandomNormal(mean=0.0, stddev=0.05, seed=None)))
#     model.add(LeakyReLU(alpha=0.01))
    
    # Adding the output layer
    model.add(Dense(1, activation = 'linear'))
    
    return model

In [19]:
ts_board = TensorBoard(log_dir='value_predictions_v3/{}'.format('kfold_' + str(time())))

In [20]:
def mean_abs_error(prediction, target):
    res = 0
    tot = 0
    for i in range(len(target)):
        if target[i][0] != 0 and not np.isnan(prediction[i][0]):
#             print(prediction[i][0], target[i][0])
            res += abs(prediction[i][0] - target[i][0])
            tot += 1
    print(tot, len(target), len(prediction))
    return round(res / tot, 2)

# Cross-validation
### K-Fold

In [21]:
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
kfold = KFold(n_splits=3, random_state=42, shuffle=False)

In [22]:
scores = []
abs_errors = []

nn = build_nn(X_scaled.shape[1])

opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# opt = SGD(lr=0.01, momentum=0.9)
nn.compile(optimizer = opt, loss = 'mean_absolute_error', metrics = [coeff_determination])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [23]:
%%time

stage = 0

for train, test in kfold.split(X_scaled):
    
    stage += 1
    
    print('#'*60)
    print('\nSTAGE --> {}\n'.format(stage))
    print('#'*60)
    
    X_train_round, X_test_round, y_train_round, y_test_round = X_scaled[train], X_scaled[test], y_scaled[train], y_scaled[test]
        
    nn.fit(X_train_round, y_train_round, batch_size=125, epochs=50, shuffle=True)
    
    # predict
    res = nn.predict(X_test_round)
    print('R2 of round: ', round(r2_score(y_test_round, res), 4))
    
    # evaluate
    loss, acc = nn.evaluate(X_train_round, y_train_round, verbose=1)
    
    reversed_res = scaler_train.inverse_transform(res)
    reversed_label = scaler_train.inverse_transform(y_test_round)
    abs_errors.append(
        {'ACC': acc * 100, 'MAE': mean_abs_error(reversed_res, reversed_label), 'MAX': max(reversed_label), 'MEAN': round(np.mean(reversed_label), 4)}
    )
    
    print("loss: %.4f, acc: %.4f%%" % (loss, acc*100))
    if acc < 0:
        print('X_test round:\n')
        print(X_test_round)
        print('#'*60)
        print('y_test_round:\n')
        print(y_test_round)
        print('#'*60)
    
    scores.append(acc * 100)

############################################################

STAGE --> 1

############################################################
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
R2 of round:  0.967
4643 4643 4643
loss: 0.0690, acc: 97.0962%
############################################################

STAGE --> 2

############################################################
Epoch 1/50
Epoch 2/50
Epoc

Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
R2 of round:  0.9656
4642 4642 4642
loss: 0.0794, acc: 96.7702%
CPU times: user 1min 7s, sys: 4.05 s, total: 1min 11s
Wall time: 38.1 s


In [24]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(scores), np.std(scores) * 100))
abs_errors

97.33% (+/- 57.58%)


[{'ACC': 97.09622139049976,
  'MAE': 179247.28,
  'MAX': array([13500000.]),
  'MEAN': 1666383.8036},
 {'ACC': 98.12168336004484,
  'MAE': 161641.2,
  'MAX': array([13500000.]),
  'MEAN': 1693589.2742},
 {'ACC': 96.77023689679588,
  'MAE': 207811.4,
  'MAX': array([13500000.]),
  'MEAN': 1679520.6807}]

In [25]:
res = nn.predict(X_test_scaled)

# y_test_wrong = y_scaled[:3483]

scaler_test = StandardScaler().fit(y_test.values)
res = scaler_test.inverse_transform(res)

mean_abs_error(res, y_test.values), max(y_test.values), round(np.mean(y_test.values), 2), round(r2_score(y_test.values, res), 4)

# scaler_test = StandardScaler().fit(y_test)
# res = scaler_test.inverse_transform(res)
# y_test_wrong = scaler_test.inverse_transform(y_test_wrong)
# print(res)

# mean_abs_error(res, y_test_wrong), max(y_test_wrong), round(np.mean(y_test_wrong), 2), round(r2_score(y_test_wrong, res), 4)

3483 3483 3483


(219715.97, array([13500000.]), 1716724.09, 0.9615)

In [27]:
nn.save_weights('nn_weights.h5')

In [28]:
from __future__ import print_function
import h5py

class WeightsUtils:

    def __init__(self, source):
        self.source = source

    def extract_weights(self, file, debug):
        f = h5py.File(file)
        d = {}
        try:
            layer_count = 1
            for layer, g in f.items():
                for p_name in g.keys():
                    param = g[p_name]
                    for k_name in param.keys():
                        ls = []
                        for i in param.get(k_name):
                            ls.append(i)
                        if debug: print("      {}/{}: {} x {}".format(p_name, k_name, len(ls), len(ls[0]) if k_name.startswith('kernel') else 0))
                        label = k_name.split(':')[0]
                        d['layer_{}_{}'.format(label, layer_count)] = ls
                    layer_count += 1
        finally:
            f.close()

        return d
    
    def get_weights(self, debug=False):
        if isinstance(self.source, str):
            weights_dict = self.extract_weights(self.source, debug)

            w, b = [], []
            for k, v in weights_dict.items():
                if k.startswith('layer_kernel'): w.append(v)
                else: b.append(v)
                
            return [i for i in zip(w, b)]
        
        else:
            w = self.source.get_weights()
            return [i for i in zip(*[iter(w)] * 2)]

In [29]:
W = WeightsUtils('nn_weights.h5')

w = W.get_weights(True)

      dense_1/bias:0: 256 x 0
      dense_1/kernel:0: 8 x 256
      dense_2/bias:0: 128 x 0
      dense_2/kernel:0: 256 x 128
      dense_3/bias:0: 64 x 0
      dense_3/kernel:0: 128 x 64
      dense_4/bias:0: 1 x 0
      dense_4/kernel:0: 64 x 1


In [24]:
from __future__ import print_function

import h5py

In [41]:
def print_structure(weight_file_path):
    """
    Prints out the structure of HDF5 file.

    Args:
      weight_file_path (str) : Path to the file to analyze
    """
    f = h5py.File(weight_file_path)
    try:
        if len(f.attrs.items()):
            print("{} contains: ".format(weight_file_path))
            print("Root attributes:")
        for key, value in f.attrs.items():
            print("  {}: {}".format(key, value))

        if len(f.items())==0:
            return 

        for layer, g in f.items():
            print("  {}".format(layer))
            print("    Attributes:")
            for key, value in g.attrs.items():
                print("      {}: {}".format(key, value))

            print("    Dataset:")
            for p_name in g.keys():
                param = g[p_name]
                for k_name in param.keys():
                    ls = []
                    for i in param.get(k_name):
                        ls.append(i)
                    print("      {}/{}: {} x {}".format(p_name, k_name, len(ls), len(ls[0]) if k_name.startswith('kernel') else 0))
    finally:
        f.close()

In [42]:
print_structure('nn_weights.h5')

nn_weights.h5 contains: 
Root attributes:
  backend: b'tensorflow'
  keras_version: b'2.2.4'
  layer_names: [b'dense_1' b'dropout_1' b'dense_2' b'dropout_2' b'dense_3' b'dense_4']
  dense_1
    Attributes:
      weight_names: [b'dense_1/kernel:0' b'dense_1/bias:0']
    Dataset:
      dense_1/bias:0: 256 x 0
      dense_1/kernel:0: 8 x 256
  dense_2
    Attributes:
      weight_names: [b'dense_2/kernel:0' b'dense_2/bias:0']
    Dataset:
      dense_2/bias:0: 128 x 0
      dense_2/kernel:0: 256 x 128
  dense_3
    Attributes:
      weight_names: [b'dense_3/kernel:0' b'dense_3/bias:0']
    Dataset:
      dense_3/bias:0: 64 x 0
      dense_3/kernel:0: 128 x 64
  dense_4
    Attributes:
      weight_names: [b'dense_4/kernel:0' b'dense_4/bias:0']
    Dataset:
      dense_4/bias:0: 1 x 0
      dense_4/kernel:0: 64 x 1
  dropout_1
    Attributes:
      weight_names: []
    Dataset:
  dropout_2
    Attributes:
      weight_names: []
    Dataset:


In [49]:
def extract_weights(file):
    f = h5py.File(file)
    d = {}
    try:
        layer_count = 1
        for layer, g in f.items():
            for p_name in g.keys():
                param = g[p_name]
                for k_name in param.keys():
                    ls = []
                    for i in param.get(k_name):
                        ls.append(i)
                    print("      {}/{}: {} x {}".format(p_name, k_name, len(ls), len(ls[0]) if k_name.startswith('kernel') else 0))
                    label = k_name.split(':')[0]
                    d['layer_{}_{}'.format(label, layer_count)] = ls
                layer_count += 1
    finally:
        f.close()
        
    return d

In [53]:
weights_dict = extract_weights('nn_weights.h5')

weights_dict

      dense_1/bias:0: 256 x 0
      dense_1/kernel:0: 8 x 256
      dense_2/bias:0: 128 x 0
      dense_2/kernel:0: 256 x 128
      dense_3/bias:0: 64 x 0
      dense_3/kernel:0: 128 x 64
      dense_4/bias:0: 1 x 0
      dense_4/kernel:0: 64 x 1


{'layer_bias_1': [0.8185977,
  -1.0483234,
  -0.66221845,
  -1.047942,
  0.37697122,
  -0.60827893,
  -0.8177658,
  -0.622535,
  -0.23320833,
  -0.84988487,
  -0.37960055,
  0.39535055,
  -0.29293737,
  -0.3604512,
  -0.76890594,
  -0.5453999,
  -0.81250376,
  -1.018774,
  -1.0519116,
  -0.3908316,
  -1.0365212,
  -0.50719064,
  -1.093273,
  -0.13633199,
  -0.53808224,
  -0.09369279,
  0.21824329,
  -0.6178887,
  -0.015273052,
  -0.64580154,
  -0.23372254,
  -0.9242156,
  -0.38784048,
  -0.6950166,
  -0.45955375,
  -0.135111,
  -0.32414165,
  -0.12304451,
  0.7119665,
  -0.5480601,
  -0.857072,
  -0.22178294,
  -0.34486148,
  -0.297679,
  -1.0472095,
  -0.2291948,
  -0.64002764,
  -0.03291506,
  -0.52582645,
  -0.64863825,
  -0.46412027,
  -0.17174217,
  -0.034172915,
  -1.2466195,
  -0.043980066,
  -1.282097,
  -0.7681374,
  -0.23089725,
  -0.32606235,
  -0.18743114,
  -0.3463108,
  -0.17533253,
  -0.47850454,
  -0.66753745,
  -0.3857635,
  -0.70345306,
  -0.46729112,
  -0.25426722,
 

In [54]:
with open('nn_weights', 'wb') as file:
    pickle.dump(weights_dict, file)

In [63]:
with open('nn_weights', 'rb') as file:
        d = pickle.load(file)
        print(d)

<class 'numpy.ndarray'>


In [1]:
def set_pruned_layers(pruning, weights):
    layers = weights
    num_layers = len(layers)
    mask = []
    v = []
    epoch = 0

    for i in range(num_layers):
        W=layers[i][0]
        m = np.abs(W) > np.percentile(np.abs(W), pruning)
        mask.append(m)	  
        W_pruned = W * m
        layers[i][0] = W_pruned
        v.append([0, 0])
    return layers, mask, v, epoch

def mask_update_layers(deltasUpd, momentumUpdate):
    for i in range(self.nHidden + 1):
        self.layers[i][0] += (deltasUpd[i][0] + momentumUpdate * self.v[i][0]) * self.mask[i]
        self.layers[i][1] += deltasUpd[i][1] + momentumUpdate * self.v[i][1]