In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import load_model
#from google.colab import drive
import matplotlib.pyplot as plt


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

In [None]:
# Read the labeled dataset from a CSV located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='labeled_data.csv')

In [None]:
def split_test_train(df, doped = 0):
    '''
    SUMMARY
    Builds one feature sequence per rider and splits the sequences into
    a train and a test set.

    Only rows whose race_class is one of the classes listed in race_classes
    are kept. Each rider's rows are ordered by date and reduced to the eight
    columns in feature_cols, with missing values replaced by 0. Every
    sequence receives the same label (doped).
    PARAMETERS
    pd.DataFrame: (df) Dataframe to split; must contain the columns
        race_class, rider_name, date and the eight feature columns
    int: (doped) default = 0, 1 if doped; used as the label of every sequence
    OUTPUT
    x_train, x_test, y_train, y_test as returned by train_test_split:
        x_* hold one 2-D array (races x 8 features) per rider,
        y_* hold the matching labels
    '''
    race_classes = ['(2.UWT)', '(WT)', '(WC)', '(1.UWT)', '(1.Pro)', '(2.Pro)']
    feature_cols = ['finish_pos', 'gc_pos', 'rider_age',
                    'uci_points', 'points', 'race_ranking', 'vertical_meters',
                    'startlist_quality_score']

    df = df[df['race_class'].isin(race_classes)]

    # prepare lists
    x = []
    y = []
    # split data into one sequence per rider; sort=False keeps the riders in
    # order of first appearance, and rows without a rider_name are skipped
    for _, rider_df in df.groupby('rider_name', sort=False):
        perf = rider_df.sort_values(by="date")[feature_cols].fillna(0)
        x.append(perf.to_numpy())
        y.append(doped)

    return train_test_split(x, y, random_state= 1)

###########################################################################################

def load_data(df):
    '''
    SUMMARY
    Builds the train, test and validation sets from the labeled dataframe.

    Rows with doped == 0 and rows with doped == 1 are split separately with
    split_test_train and the two results are merged. The merged held-out
    part is then split once more into a test and a validation set.
    PARAMETERS
    pd.DataFrame: (df) Labeled dataframe; must contain a 'doped' column plus
        everything split_test_train needs
    OUTPUT
    x_train, x_test, y_train, y_test, x_val, y_val (in this order):
        x_* are collections of per-rider 2-D arrays, y_* hold the labels
    '''
    clean = df[df['doped'] == 0]
    doped = df[df['doped'] == 1]

    # get train and test sets
    x_train, x_test, y_train, y_test = split_test_train(clean)
    x_train_doped, x_test_doped, y_train_doped, y_test_doped = split_test_train(doped, 1)

    # The sequences have different lengths, so they are kept in plain lists:
    # np.append would flatten its inputs and cannot handle ragged arrays.
    x_train = list(x_train) + list(x_train_doped)
    x_test = list(x_test) + list(x_test_doped)
    y_train = np.asarray(list(y_train) + list(y_train_doped))
    y_test = np.asarray(list(y_test) + list(y_test_doped))

    # NOTE(review): this second split is not stratified, so a small held-out
    # set may end up with very few doped riders in one of the halves.
    x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, random_state = 1)

    return x_train, x_test, y_train, y_test, x_val, y_val

In [None]:
%%time
model = keras.models.Sequential()
model.add(LSTM((128), batch_input_shape =(None, None, 8), return_sequences = False ))
model.add(Dense(1))

model.compile(
    loss = keras.losses.BinaryCrossentropy(from_logits = True),
    optimizer = keras.optimizers.Adam(learning_rate=0.001),
    metrics = ['accuracy', keras.metrics.AUC(from_logits = True)])

x_train, x_test, y_train, y_test, x_val, y_val = load_data(df)


model.summary()

In [None]:
# Pad every split to one shared length. The model contains no masking layer,
# so the LSTM reads the zero padding as ordinary timesteps; padding each
# split to its own maximum would give the same sequence a different zero
# prefix depending on which split it landed in.
max_len = max(len(seq) for split in (x_train, x_test, x_val) for seq in split)

x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen = max_len, dtype = 'float32')
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen = max_len, dtype = 'float32')
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen = max_len, dtype = 'float32')

In [None]:
# 'balanced' weights each class inversely to its frequency in y_train.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight = 'balanced', classes = classes, y = y_train)

# Key the weights by the classes that were actually passed in, rather than a
# hard-coded [0, 1], so label and weight can never get out of step.
labels = [int(c) for c in classes]
class_weights = dict(zip(labels, (float(w) for w in weights)))

In [None]:
# NOTE(review): patience (200) is larger than the number of epochs (75), so
# this callback can never stop training early, and it watches the training
# loss rather than val_loss. Confirm whether that is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=200, restore_best_weights=True)

# The inputs are in-memory tensors, so no generator-only options
# (workers / use_multiprocessing) are passed to fit().
# x_test / y_test serve as the per-epoch validation data here.
history = model.fit(tf.convert_to_tensor(x_train), tf.convert_to_tensor(y_train), epochs=75, batch_size=50, verbose=1,
                    class_weight=class_weights,
                    validation_data=(tf.convert_to_tensor(x_test), tf.convert_to_tensor(y_test)), validation_freq=1,
                    callbacks=[callback])
model.save('model_128_BCE_val_rs.h5')

In [None]:
# Training curves: loss and accuracy on the training data and on the data
# that was passed to fit() as validation_data.
f, ax = plt.subplots()
for key in ('loss', 'val_loss', 'accuracy', 'val_accuracy'):
    ax.plot(history.history[key], label=key)
ax.set_xlabel('epoch')
ax.set_title('Training history')
ax.legend()

# Save before show(): depending on the backend the figure may be released
# once it has been displayed.
f.savefig("training_128_BCE_val_rs.png", bbox_inches='tight', dpi=600)
plt.show()

In [None]:
# Score the model on the x_val / y_val split; with return_dict the cell
# output is a dict of metric name -> value.
model.evaluate(x = x_val, y = y_val, return_dict = True, verbose = 1)

In [None]:
# The model outputs one raw logit per sequence; a logit above 0 is counted
# as a positive (class 1) prediction.
pred = model.predict(x_val)
y_pred = pd.Series((np.ravel(pred) > 0).astype(int), name = 'Predicted')
y_act = pd.Series(np.asarray(y_val), name = 'Actual')

# Rows are predicted classes, columns are actual classes. Reindexing forces
# a full 2x2 table even when a class is never predicted or never present,
# so the lookups below cannot fail on a missing row or column.
conf_matrix = pd.crosstab(y_pred, y_act).reindex(index = [0, 1], columns = [0, 1], fill_value = 0)
print(conf_matrix)

true_pos = conf_matrix.loc[1, 1]
false_pos = conf_matrix.loc[1, 0]
false_neg = conf_matrix.loc[0, 1]

# A score is NaN when its denominator is zero, i.e. when it is undefined.
precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else float('nan')
recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else float('nan')
f1 = 2*(precision*recall)/(precision + recall) if (precision + recall) else float('nan')
print("Precision:", precision)
print("Recall:", recall)