In [1]:
import datetime as dt
import math
from re import sub
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import special

from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras_tqdm import TQDMNotebookCallback as ktqdm
from keras.utils import normalize
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import TensorBoard
from keras.optimizers import Adam

from tensorflow.nn import relu, softmax

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import datasets
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.


In [2]:
# Load 'fifa19.csv' into a DataFrame; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('fifa19.csv')

In [3]:
# Remove a fixed list of columns before preprocessing.
# The frame is reassigned (no inplace=True) and errors='ignore' is passed so the
# cell can be re-run: a second execution is a no-op instead of a KeyError on
# columns that are already gone.
df = df.drop(
    columns=['Unnamed: 0', 'ID', 'Photo', 'Flag', 'Club Logo', 'Real Face', 'Preferred Foot',
             'Body Type', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until'],
    errors='ignore',
)

## Obiettivo: predire valore e/o salario dei giocatori

Pre-processing: convertire Value, Wage e Release Clause da stringa a float

In [6]:
curs = ["Release Clause", "Value", "Wage"]


def curr2val(x):
    """Convert a currency string such as '€110.5M' or '€565K' to a float amount.

    Args:
        x: A currency string, or a value that is already numeric (including NaN).

    Returns:
        The amount as a float. A trailing 'K' multiplies by 1,000 and a trailing
        'M' by 1,000,000; a string with no suffix (e.g. '€0') is taken at face
        value. Numeric input is returned unchanged, so applying the conversion
        twice does not rescale the column.

    Raises:
        ValueError: If the remaining text cannot be parsed as a number.
    """
    # Already numeric (NaN for missing entries, or a column converted earlier).
    if isinstance(x, (int, float, np.number)):
        return float(x)

    text = str(x).replace('€', '').strip()
    if text.endswith('K'):
        return float(text[:-1]) * 1000
    if text.endswith('M'):
        return float(text[:-1]) * 1000000
    # No magnitude suffix: the number is already expressed in plain units.
    return float(text)


# The converter is defined once, outside the loop, and applied to every money column.
for cur in curs:
    df[cur] = df[cur].apply(curr2val)

Scelgo caratteristiche che penso siano sensate per predire il valore di mercato e stipendio del giocatore

In [7]:
cols = [
    "LS", "ST", "RS", "LW", "LF", "CF", "RF", "RW",
    "LAM", "CAM", "RAM", "LM", "LCM", "CM", "RCM", "RM",
    "LWB", "LDM", "CDM", "RDM", "RWB", "LB", "LCB", "CB", "RCB", "RB",
]

# Strip the last two characters of every position rating and store the
# remainder as a float, one column at a time.
for col in cols:
    df[col] = df[col].str[:-2].astype(float)

In [8]:
# Height is written as <feet>'<inches> (the original code replaced the apostrophe
# with a dot). Reading that as a decimal number is wrong: 5'10 would become 5.1,
# which sorts below 5'9 = 5.9. Convert to total inches instead.
# Rows that do not match the pattern (including missing values) become NaN.
height_parts = df['Height'].str.extract(r"(\d+)'(\d+)").astype(float)
df['Height'] = height_parts[0] * 12 + height_parts[1]

# Weight carries a three-character unit suffix; drop it and keep the number.
df['Weight'] = df['Weight'].str[:-3].astype(float)

In [9]:
# Pairwise correlation of the numeric columns.
# The numeric/bool columns are selected explicitly because recent pandas releases
# no longer drop non-numeric columns silently inside corr() and raise instead.
df_corr = df.select_dtypes(include=['number', 'bool']).corr()

# fig = plt.figure(figsize=(50,20))
# ax = fig.add_subplot(111)
# cax = ax.matshow(df_corr,cmap='coolwarm', vmin=-1, vmax=1)
# fig.colorbar(cax)

# ticks = np.arange(0,len(df_corr.columns),1)
# ax.set_xticks(ticks)
# ax.set_xticklabels(df_corr.columns)
# plt.xticks(rotation=90)
# ax.set_yticks(ticks)
# ax.set_yticklabels(df_corr.columns)

# plt.show()

In [10]:
# Keep only the columns whose correlation with 'Value' is above 0.5
# (this includes 'Value' itself, which is split off as the target later).
labels = [name for name in df_corr if df_corr.loc[name, 'Value'] > 0.5]

df_flt = df[labels]
df_flt.head()

Unnamed: 0,Overall,Potential,Value,Wage,International Reputation,Reactions,Release Clause
0,94,94,110500000.0,565000.0,5.0,95.0,226500000.0
1,94,94,77000000.0,405000.0,5.0,96.0,127100000.0
2,92,93,118500000.0,290000.0,5.0,94.0,228100000.0
3,91,93,72000000.0,260000.0,4.0,90.0,138600000.0
4,91,92,102000000.0,355000.0,4.0,91.0,196400000.0


In [11]:
train_perc = 0.75

# Shuffle the frame that is actually split. The original shuffled `df`, which has
# no effect on `df_flt`; since the rows are ordered by rating (see the head()
# output above), the unshuffled split put the top-rated players in train and
# the rest in test. A fixed random_state makes the split reproducible.
df_shuffled = df_flt.sample(frac=1, random_state=42)
train_slice = int(len(df_shuffled) * train_perc)

train = df_shuffled.iloc[:train_slice, :]
test = df_shuffled.iloc[train_slice:, :]

len(train), len(test)

(13655, 4552)

In [12]:
# Separate the regression target ('Value', kept as a one-column frame) from the
# predictor columns, for both splits.
X_train, y_train = train.drop(columns='Value'), train[['Value']]
X_test, y_test = test.drop(columns='Value'), test[['Value']]

In [13]:
# Fill missing predictor values with the per-column mean learned on the training
# split only, then apply the same fitted imputer to the test split. The original
# never imputed X_test, so any NaN there flowed straight into the scaled test set.
feature_imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X_train)
X_train = feature_imputer.transform(X_train)
X_test = feature_imputer.transform(X_test)

# Fill missing training targets with the mean training target.
# y_test is intentionally left untouched so that evaluation is not run against
# fabricated targets.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(y_train)
y_train = imputer.transform(y_train)

In [14]:
# SCALE THE VALUES
# Earlier experiment with RobustScaler, kept for reference:
# scaler = RobustScaler()
# scaler = scaler.fit(X_train)
# X_train = scaler.transform(X_train)

# Earlier experiment with preprocessing.scale, kept for reference:
# X_train_scaled = preprocessing.scale(X_train)

# Fit the scaler on the training predictors only ...
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# ... and reuse the fitted scaler for the test predictors.
X_test_scaled = scaler.transform(X_test) 
# X_train_scaled, X_test_scaled

  # This is added back by InteractiveShellApp.init_path()


In [15]:
# Hold out 20% of the scaled training data as a validation set; random_state
# fixes the shuffle so the split is repeatable.
# NOTE(review): this rebinds X_train and y_train. Re-running only this cell would
# pair the already-reduced y_train with the full X_train_scaled; re-run the cells
# above first.
X_train, X_val, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size = 0.20, random_state = 42)

In [16]:
def coeff_determination(y_test, y_pred):
    """Coefficient of determination (R^2) computed with Keras backend ops.

    Args:
        y_test: Reference values.
        y_pred: Predicted values.

    Returns:
        1 - SS_res / (SS_tot + epsilon), where SS_res is the sum of squared
        residuals and SS_tot the sum of squared deviations of ``y_test`` from
        its mean. The backend epsilon keeps the division defined when
        ``y_test`` is constant.
    """
    from keras import backend as K

    residual_ss = K.sum(K.square(y_test - y_pred))
    total_ss = K.sum(K.square(y_test - K.mean(y_test)))
    unexplained = residual_ss / (total_ss + K.epsilon())
    return 1 - unexplained

In [17]:
# TensorBoard callback; each execution of this cell gets its own log directory,
# named after the current time() value.
ts_board = TensorBoard(log_dir='value_predictions/{}'.format(time()))

In [18]:
def mean_abs_error(prediction, target):
    """Mean absolute error over the rows that have a usable target and prediction.

    Only the first column of each input is considered. A row is counted when its
    target is strictly positive and its prediction is not NaN.

    Args:
        prediction: Array-like of predictions, 1-D or 2-D (first column is used).
        target: Array-like of reference values, 1-D or 2-D (first column is used).

    Returns:
        The mean absolute difference over the counted rows, rounded to two
        decimals, or NaN when no row qualifies (the original raised
        ZeroDivisionError in that case).

    Raises:
        ValueError: If ``prediction`` has fewer rows than ``target``.
    """
    target_values = np.asarray(target, dtype=float)
    predicted_values = np.asarray(prediction, dtype=float)

    if target_values.size == 0:
        return float('nan')
    if len(predicted_values) < len(target_values):
        raise ValueError('prediction has fewer rows than target')

    # Reduce both inputs to their first column; extra predictions are ignored,
    # exactly as the original index loop over range(len(target)) did.
    target_values = target_values.reshape(len(target_values), -1)[:, 0]
    predicted_values = predicted_values.reshape(len(predicted_values), -1)[:len(target_values), 0]

    usable = (target_values > 0) & ~np.isnan(predicted_values)
    if not usable.any():
        return float('nan')

    errors = np.abs(target_values[usable] - predicted_values[usable])
    return round(float(errors.mean()), 2)

In [25]:
def train_neural_network(trainX, trainY,
                         hidden_layers,
                         num_epochs=10,
                         weights_learning_rate=0.5,
                         bn_learning_rate=0.5,
                         train_batch_size=32,
                         momentum_rate=0.9,
                         dropout_rate=0.2,
                         ini_weights=None,
                         ini_biases=None,
                         ini_momentums=None,
                         ini_gamma=None,
                         ini_beta=None,
                         task_type="regression"):
    """Train a feed-forward network with batch normalisation, dropout and momentum.

    The heavy lifting is delegated to ``initialize``, ``generate_batches``,
    ``train_forward_pass``, ``error_backpropagation``, ``test_forward_pass`` and
    ``loss_reg``. None of them is defined in the visible part of this notebook, so
    they must be provided before this function is called.

    Args:
        trainX: Training inputs; ``trainX.shape[1]`` is passed to ``initialize``.
        trainY: Training targets; reshaped to ``(len(trainY), -1)``.
        hidden_layers: List of hidden-layer sizes; the output size
            (``trainY.shape[1]`` after reshaping) is appended to it.
        num_epochs: Number of passes over the batches.
        weights_learning_rate: Learning rate forwarded to ``error_backpropagation``;
            halved whenever the loss has risen for two consecutive epochs.
        bn_learning_rate: Learning rate forwarded for the batch-norm parameters.
        train_batch_size: Batch size passed to ``generate_batches``.
        momentum_rate: Momentum coefficient forwarded to ``error_backpropagation``.
        dropout_rate: Dropout rate used in the training forward pass and for the
            final weight rescaling.
        ini_weights, ini_biases, ini_momentums, ini_gamma, ini_beta: Optional
            starting parameters. They are used together, and only when
            ``ini_weights`` is not None; otherwise ``initialize`` is called.
        task_type: Value forwarded to the helpers as their ``type`` argument.
            The original code forwarded the Python builtin ``type`` here, which
            cannot have been intended.
            NOTE(review): "regression" is chosen because the loss is computed with
            ``loss_reg``; confirm the exact values the helpers accept.

    Returns:
        Tuple ``(weights, biases, momentums, gamma, beta, exp_mean_linear_inp,
        exp_var_linear_inp)``, with weights and biases rescaled by
        ``scale_weights_dropout``.
    """
    trainY = np.array(trainY).reshape(len(trainY), -1)

    layers = hidden_layers + [trainY.shape[1]]

    if ini_weights is None:
        weights, biases, momentums, gamma, beta = initialize(layers, trainX.shape[1])
    else:
        weights, biases, momentums, gamma, beta = ini_weights, ini_biases, ini_momentums, ini_gamma, ini_beta

    trainX_batches, trainY_batches = generate_batches(trainX, trainY, train_batch_size)
    num_batches = len(trainX_batches)

    losses = []

    # Per-layer statistics of the linear inputs, accumulated over the batches of
    # an epoch and then averaged for use by the test-time forward pass.
    expected_mean_linear_inp, expected_var_linear_inp = dict(), dict()
    exp_mean_linear_inp, exp_var_linear_inp = dict(), dict()

    # Factor m / (m - 1) applied to the averaged batch variances (skipped for m == 1
    # to avoid dividing by zero).
    m = train_batch_size
    var_correction = float(m) / (m - 1) if m > 1 else 1.0

    for epoch in range(num_epochs):

        # Reset the accumulators at the start of every epoch.
        for layer in range(len(layers)):
            expected_mean_linear_inp[layer] = np.zeros(weights[layer].shape[1])
            expected_var_linear_inp[layer] = np.zeros(weights[layer].shape[1])

        for batch in range(num_batches):
            trainX_batch = trainX_batches[batch]
            trainY_batch = trainY_batches[batch]

            fwd_pass_data = train_forward_pass(trainX_batch, weights, biases, gamma, beta, dropout_rate, task_type)

            outputs, linear_inp, scaled_linear_inp, mean_linear_inp, var_linear_inp = fwd_pass_data

            for layer in range(len(layers)):
                expected_mean_linear_inp[layer] += mean_linear_inp[layer]
                expected_var_linear_inp[layer] += var_linear_inp[layer]

            backprop = error_backpropagation(trainX_batch, trainY_batch,
                                             outputs=outputs,
                                             linear_inp=linear_inp,
                                             scaled_linear_inp=scaled_linear_inp,
                                             mean_linear_inp=mean_linear_inp,
                                             var_linear_inp=var_linear_inp,
                                             weights=weights,
                                             biases=biases,
                                             momentums=momentums,
                                             gamma=gamma,
                                             beta=beta,
                                             bn_learning_rate=bn_learning_rate,
                                             weights_learning_rate=weights_learning_rate,
                                             momentum_rate=momentum_rate,
                                             type=task_type
                                            )

            weights, biases, momentums, gamma, beta = backprop

        for layer in range(len(layers)):
            exp_mean_linear_inp[layer] = expected_mean_linear_inp[layer] / num_batches
            exp_var_linear_inp[layer] = var_correction * expected_var_linear_inp[layer] / num_batches

        # Evaluate the epoch with dropout-rescaled copies; the trained parameters
        # themselves are only rescaled once, after the last epoch.
        dummy_weights, dummy_biases = scale_weights_dropout(weights, biases, dropout_rate)

        outputs = test_forward_pass(trainX,
                                    weights=dummy_weights,
                                    biases=dummy_biases,
                                    gamma=gamma,
                                    beta=beta,
                                    mean_linear_inp=exp_mean_linear_inp,
                                    var_linear_inp=exp_var_linear_inp,
                                    type=task_type
                                   )

        curr_loss = loss_reg(outputs, trainY)

        # Halve the learning rate when the loss went up in two consecutive epochs.
        if len(losses) > 1 and curr_loss > losses[-1] > losses[-2]:
            weights_learning_rate /= 2.0

        losses.append(curr_loss)

    weights, biases = scale_weights_dropout(weights, biases, dropout_rate)

    model = (weights, biases, momentums, gamma, beta, exp_mean_linear_inp, exp_var_linear_inp)

    return model

In [None]:
def standardize_mean_var(mydata, mean=None, var=None):
    """Standardise data column-wise with given or freshly computed statistics.

    Args:
        mydata: Array of samples; statistics are taken along axis 0.
        mean: Column means to subtract. Computed from ``mydata`` when None.
        var: Column variances to divide by (after adding 1e-5 and taking the
            square root). Computed from ``mydata`` when None.

    Returns:
        Tuple ``(standardised data, mean used, var used)``.
    """
    mean = np.mean(mydata, axis=0) if mean is None else mean
    var = np.var(mydata, axis=0) if var is None else var

    # The small constant keeps the scaling finite for zero-variance columns.
    inverse_std = (var + 1e-5) ** -0.5

    return (mydata - mean) * inverse_std, mean, var

In [None]:
def scale_weights_dropout(weights, biases, dropout_rate):
    """Return copies of the parameters multiplied by the keep probability.

    Args:
        weights: Mapping from layer key to that layer's weights.
        biases: Mapping from layer key to that layer's biases; must contain
            every key present in ``weights``.
        dropout_rate: Dropout rate; each value is multiplied by
            ``1 - dropout_rate``.

    Returns:
        Tuple ``(scaled_weights, scaled_biases)`` of new dicts keyed like
        ``weights``. The inputs are not modified.
    """
    keep_prob = 1 - dropout_rate

    scaled_weights = {layer: weights[layer] * keep_prob for layer in weights}
    scaled_biases = {layer: biases[layer] * keep_prob for layer in weights}

    return scaled_weights, scaled_biases

In [22]:
# Number of columns of X_train at the time this cell runs; read by
# Neural_Network.__init__ as the input size.
N_FEATURES = X_train.shape[1]
# Read by Neural_Network.__init__ as the output-layer size.
N_CLASSES = 1
# Passed to np.random.seed in Neural_Network.__init__.
RANDOM_SEED = 42

In [23]:
class Neural_Network:
    """Network with one sigmoid hidden layer and a sigmoid output layer.

    Weights are updated sample by sample with plain backpropagation and a
    learning coefficient that decays with the epoch number. Progress is reported
    as classification accuracy, so ``accu`` and ``train`` need integer class
    labels in ``[0, output_size)`` (or one-hot target rows). Because every output
    passes through a sigmoid, outputs always lie in (0, 1).

    Stop functions:
        0 -- stop after ``stop_parameter`` epochs.
        1 -- stop after ``stop_parameter`` epochs without a new best train accuracy.
        2 -- stop as soon as train accuracy improves by ``stop_parameter`` or less.
    """

    def __init__(self, neurons, batchsize, stop_function, stop_parameter):
        """Store the configuration and draw the initial weights.

        Args:
            neurons: Number of hidden units.
            batchsize: When in (0, 1], the fraction of the training/testing rows
                that ``train`` uses; any other value means "use all rows".
            stop_function: 0, 1 or 2 (see the class docstring).
            stop_parameter: Threshold interpreted by the chosen stop function.
        """
        self.input_size = N_FEATURES   # number of input features per sample
        self.output_size = N_CLASSES   # number of output units
        self.neurons = neurons
        self.batchsize = batchsize
        self.stop_f = stop_function
        self.stop_p = stop_parameter
        self.best = 0.
        self.same = 0
        self.iteration = 0

        # Seeded uniform weights, scaled down by the layer width. The extra
        # column in each matrix holds the bias weight.
        np.random.seed(RANDOM_SEED)
        hidden_layer = np.random.rand(self.neurons, self.input_size + 1) / self.neurons
        output_layer = np.random.rand(self.output_size, self.neurons + 1) / self.output_size
        self.layers = [hidden_layer, output_layer]

    def _target_matrix(self, y):
        """Return ``y`` as a float matrix with one target row per sample."""
        y = np.asarray(y)
        if self.output_size == 1 or (y.ndim == 2 and y.shape[1] == self.output_size):
            return y.astype(float).reshape(len(y), -1)
        # Integer class labels: build one-hot rows.
        labels = y.astype(int).ravel()
        targets = np.zeros((labels.size, self.output_size))
        targets[np.arange(labels.size), labels] = 1
        return targets

    def _class_labels(self, y):
        """Return ``y`` as a 1-D array of validated integer class indices."""
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] > 1:
            labels = np.argmax(y, axis=1)   # one-hot rows -> class index
        else:
            labels = y.astype(int).ravel()
        if labels.size and (labels.min() < 0 or labels.max() >= self.output_size):
            raise ValueError(
                'class labels must lie in [0, %d); got values in [%d, %d]'
                % (self.output_size, labels.min(), labels.max()))
        return labels

    def train(self, X_training, y_training, X_testing, y_testing, verbose=False):
        """Run backpropagation epochs until the configured stop function fires.

        Args:
            X_training: 2-D array-like of training inputs, one row per sample.
            y_training: Integer class labels, or a target matrix with
                ``output_size`` columns.
            X_testing: 2-D array-like of inputs used for the reported test accuracy.
            y_testing: Labels for ``X_testing`` (same conventions as ``y_training``).
            verbose: When True, print every input/target pair before it is used.

        Raises:
            ValueError: If the labels fall outside ``[0, output_size)`` or the
                stop function is unknown.
        """
        X_training = np.asarray(X_training, dtype=float)
        X_testing = np.asarray(X_testing, dtype=float)

        # Batch setting: a batchsize in (0, 1] selects that fraction of the rows.
        len_batch_train = X_training.shape[0]
        len_batch_test = X_testing.shape[0]
        if 0 < self.batchsize <= 1:
            len_batch_train = int(np.ceil(len_batch_train * self.batchsize))
            len_batch_test = int(np.ceil(len_batch_test * self.batchsize))

        # Slice the data once; labels are validated here so bad input fails
        # before any epoch is run.
        inputs = X_training[:len_batch_train]
        targets = self._target_matrix(y_training)[:len_batch_train]
        train_batch = (inputs, self._class_labels(y_training)[:len_batch_train])
        test_batch = (X_testing[:len_batch_test], self._class_labels(y_testing)[:len_batch_test])

        # Start prints
        self.start_time = dt.datetime.now()
        print('-- Training Session Start (%s) --' % (self.start_time))
        typeTrainingPrint = "Stop Function: "
        if self.stop_f == 0:
            typeTrainingPrint += str(self.stop_p)+" epochs"
        elif self.stop_f == 1:
            typeTrainingPrint += str(self.stop_p)+" epoch(s) w/o improvements"
        elif self.stop_f == 2:
            typeTrainingPrint += "improvements below "+str(self.stop_p)+"%"
        print('\nNeurons: %d\nBatch Train: %d\nBatch Test: %d\n%s\n' % (self.neurons,len_batch_train,len_batch_test,typeTrainingPrint))

        # Defined up front so the final report works even if no epoch is run.
        accu_train = np.zeros(3)
        accu_test = np.zeros(3)

        # Perform epochs
        while not self.is_stop_function_enabled(accu_train[1]):

            self.iteration += 1

            for input_vector, target_vector in zip(inputs, targets):
                if verbose:
                    print('input_vec --> {}\ntarget_vect  --> {}\n'.format(input_vector, target_vector))
                self.backpropagate(input_vector, target_vector)

            # Accuracy on both batches; the train value drives the stop function.
            accu_test = self.accu(test_batch)
            accu_train = self.accu(train_batch)

            # Messages
            if self.iteration == 1 or self.iteration % 10 == 0:
                self.print_message_iter(self.iteration, accu_test, accu_train, self.ETAepoch(self.start_time))

        # Print the last epoch unless the loop already reported it.
        if self.iteration % 10 != 0 and self.iteration != 1:
            self.print_message_iter(self.iteration, accu_test, accu_train, self.ETAepoch(self.start_time))

        # Final message
        print('\n-- Training Session End (%s) --' % (dt.datetime.now()))

    def feed_forward(self, input_vector):
        """Return the list of layer activations ``[hidden, output]`` for one sample."""
        outputs = []
        for layer in self.layers:
            input_with_bias = np.append(input_vector, 1)   # constant 1 feeds the bias weight
            output = special.expit(np.inner(layer, input_with_bias))   # expit is the sigmoid
            outputs.append(output)
            # The output of this layer is the input of the next one.
            input_vector = output
        return outputs

    def backpropagate(self, input_vector, target):
        """Apply one gradient step for a single (input, target) pair."""
        # Learning coefficient decays with the epoch number; max() guards the
        # division when the method is called before the first epoch.
        c = 10**(-4) + 10**(-1)/math.sqrt(max(self.iteration, 1))
        hidden_outputs, outputs = self.feed_forward(input_vector)

        # Partial derivatives for the output layer, then the update.
        output_deltas = outputs * (1 - outputs) * (outputs - target)
        self.layers[-1] -= c*np.outer(output_deltas, np.append(hidden_outputs, 1))

        # Partial derivatives for the hidden layer, then the update. As in the
        # original, the already-updated output weights (bias column dropped) are
        # used to propagate the error back.
        output_weights = self.layers[-1][:, :self.neurons]
        hidden_deltas = hidden_outputs * (1 - hidden_outputs) * np.dot(output_weights.T, output_deltas)
        self.layers[0] -= c*np.outer(hidden_deltas, np.append(input_vector, 1))

    def predict(self, input_vector):
        """Return the output-layer activations for one sample."""
        return self.feed_forward(input_vector)[-1]

    def predict_one(self, input_vector):
        """Return the index of the strongest output unit for one sample."""
        return np.argmax(self.feed_forward(input_vector)[-1])

    def accu(self, testing_batch):
        """Compute per-class and overall accuracy on a batch.

        Args:
            testing_batch: Pair ``(inputs, labels)``; labels are integer class
                indices (or one-hot rows) in ``[0, output_size)``.

        Returns:
            Array ``[worst class accuracy %, overall accuracy %, worst class
            index]`` rounded to two decimals. Classes without samples are
            ignored when looking for the worst class; an empty batch yields
            zeros.

        Raises:
            ValueError: If a label falls outside ``[0, output_size)``.
        """
        samples = testing_batch[0]
        labels = self._class_labels(testing_batch[1])

        # Column 0: correct predictions per class, column 1: samples per class.
        res = np.zeros((self.output_size, 2))
        for sample, label in zip(samples, labels):
            res[label, 1] += 1
            if self.predict_one(sample) == label:
                res[label, 0] += 1

        total = np.sum(res, axis=0)
        if total[1] == 0:
            return np.round([0., 0., 0.], 2)

        seen = res[:, 1] > 0
        each = np.full(len(res), np.inf)   # unseen classes can never be the minimum
        each[seen] = res[seen, 0] / res[seen, 1]
        min_c = int(np.argmin(each))
        return np.round([each[min_c]*100, total[0]/total[1]*100, min_c], 2)

    def is_stop_function_enabled(self, accuracy):
        """Return True when training should stop, updating the best-so-far state.

        Raises:
            ValueError: If the configured stop function is not 0, 1 or 2 (the
                original returned None, which made ``train`` loop forever).
        """
        if self.stop_f == 0:
            return self.iteration >= self.stop_p

        if self.stop_f == 1:
            if accuracy > self.best or self.iteration == 0:
                self.same = 0
                self.best = accuracy
                return False
            self.same += 1
            return self.same >= self.stop_p

        if self.stop_f == 2:
            if accuracy > self.best + self.stop_p or self.iteration == 0:
                self.best = accuracy
                return False
            return True

        raise ValueError('Unknown stop function: %r' % (self.stop_f,))

    def print_message_iter(self, iteration, accu_test, accu_train, eta):
        """Print a one-line progress report for the given epoch."""
        eta = ("(" + eta + ")").ljust(8)   # pad so the columns line up
        message = 'Epoch '+str(iteration).zfill(3) + " "+eta+" "
        message += 'Accuracy TRAIN: '+str(accu_train[1]).zfill(4)+'%\t'
        message += 'Accuracy TEST: '+str(accu_test[1]).zfill(4)+'%\t'
        message += 'Min: '+ str(accu_test[0]).zfill(4)+ '% ('+str(int(accu_test[2]))+')'
        print(message)

    def ETAepoch(self, start_time):
        """Return the time elapsed since ``start_time`` as e.g. '3m12s' or '45s'."""
        diff = dt.datetime.now() - start_time
        minutes, seconds = divmod(diff.days * 86400 + diff.seconds, 60)
        prefix = str(minutes)+"m" if minutes != 0 else ""
        return prefix + str(seconds)+"s"

    def getWeights(self):
        """Return the list ``[hidden_layer, output_layer]`` of weight matrices."""
        return self.layers

In [24]:
# Build the network with 300 hidden units and stop function 2 with a 0.01 threshold.
# NOTE(review): batchsize=250 lies outside the (0, 1] range that
# Neural_Network.train checks, so that method keeps the full row counts.
nn = Neural_Network(neurons=300, batchsize=250, stop_function=2, stop_parameter=0.01)
# Training call left disabled; train is defined with four data arguments.
# nn.train()