In [72]:
# tools
import pandas as pd
import numpy as np
from time import time

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# prepare data 
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

# metrics
from sklearn.metrics import f1_score, accuracy_score

## Preparing data

In [64]:
# Read the prepared training table from disk and display it as the cell output.
train_csv_path = '../data/training/train_final.csv'
full = pd.read_csv(train_csv_path)
full

Unnamed: 0,home_result,0_0,1_0,2_0,3_0,4_0,5_0,6_0,7_0,8_0,...,14_26,15_26,16_26,17_26,18_26,19_26,20_26,21_26,22_26,23_26
0,lose,-0.833564,11.894702,9.797198,0.690392,-0.399289,-0.196324,-0.271997,1.064701,-1.178944,...,1.037913,0.891078,0.420644,0.077849,-0.069981,-0.705365,0.952756,1.854779,-1.037659,-0.282706
1,win,-1.341095,8.727017,7.268857,0.476498,-0.217216,-0.160226,-0.007719,0.360472,-0.856369,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000
2,win,-0.645374,10.338912,8.351297,0.264416,0.377598,-0.239645,-0.172684,2.751132,0.671696,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000
3,win,-0.455534,15.332285,12.749199,0.857845,-0.496018,-0.299477,-0.234882,0.315938,-0.390156,...,-0.111448,0.046474,0.630581,-1.904343,-0.748645,0.929946,1.764384,0.233507,-0.252928,1.420810
4,win,-0.728257,12.335970,9.748855,0.095955,-0.098054,-0.104447,-1.154838,0.704880,-1.150701,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1135,win,-0.942070,15.881339,14.056281,1.229575,-1.038763,-1.182373,-1.298392,-1.673618,-1.044084,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000
1136,win,-1.327757,8.190598,6.862505,0.450774,-0.281004,-0.101381,0.100049,0.008824,-0.021895,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000
1137,lose,-1.187595,13.179885,11.210599,0.516861,-0.516988,-0.648351,-0.633078,-1.161408,-0.969994,...,-1.850484,0.279975,1.544454,1.299444,-0.837068,-1.799868,0.270484,2.016364,0.713818,0.394452
1138,win,-0.360106,15.206365,11.933520,-0.441674,0.409827,1.008120,-0.185278,-1.363143,4.520670,...,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000,-100.000000


### Exploratory Data Analysis

### Splitting the dataset into train/dev/test

In [66]:
# Select the label and the features by column name instead of by position, so
# the split does not depend on the column order of the CSV.
# NOTE(review): the rendered table lists an 'Unnamed: 0' header ahead of
# 'home_result'. If the saved row index is read back as a real column it would
# sit at position 0 -- confirm. It is dropped here when present;
# errors='ignore' makes that a no-op when the column does not exist.
y = full['home_result']
X = full.drop(columns=['home_result', 'Unnamed: 0'], errors='ignore')

In [67]:
# Encode the outcome labels as integers for the models (lose=0, draw=1, win=2).
# The explicit cast keeps y an integer Series even if `replace` returns an
# object-dtype result, and raises if an unexpected label is still present.
# Re-running this cell is safe: integers are left untouched by the mapping.
y = y.replace({'lose': 0, 'draw': 1, 'win': 2}).astype(int)

In [68]:
# First carve off 10% of the rows as the test set ...
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=.1, random_state=42)
# ... then carve 10% of what remains off as the development set.
X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=.1, random_state=42)

In [71]:
# Report how many rows ended up in each split (same order as before:
# training, test, development).
for split_name, split_features in (("training", X_train),
                                   ("test", X_test),
                                   ("development", X_dev)):
    print("Number of {} examples: ".format(split_name), split_features.shape[0])

Number of training examples:  923
Numer of test examples:  114
Number of development examples:  103


## Training the models

In [73]:
def train_classifier(clf, X_train, y_train):
    ''' Fits `clf` on the given data and prints how long the fit took.

    clf     -- object exposing a `fit(X, y)` method; it is fitted in place
    X_train -- training features, passed straight to `clf.fit`
    y_train -- training targets, passed straight to `clf.fit`

    Returns nothing; the elapsed wall-clock time is printed.
    '''

    started_at = time()
    clf.fit(X_train, y_train)
    elapsed = time() - started_at

    # Report the fit duration
    print("Trained model in {:.4f} seconds".format(elapsed))

    
def predict_labels(clf, features, target):
    ''' Predicts with an already fitted classifier and scores the result.

    clf      -- fitted object exposing a `predict(features)` method
    features -- inputs to predict on
    target   -- true labels the predictions are compared against

    Returns a tuple (weighted F1 score, accuracy score).
    '''

    predictions = clf.predict(features)

    weighted_f1 = f1_score(target, predictions, average='weighted')
    accuracy = accuracy_score(target, predictions)
    return weighted_f1, accuracy


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Fits a classifier, then prints its scores on both supplied sets.

    The classifier is fitted on (X_train, y_train) via `train_classifier`;
    weighted F1 and accuracy from `predict_labels` are then printed, first
    for the training data and then for (X_test, y_test). Returns nothing.
    '''

    # Announce which model is being fitted and on how many rows
    model_name = clf.__class__.__name__
    n_rows = len(X_train)
    print("Training a {} using a training set size of {}. . .".format(model_name, n_rows))

    # Fit (prints the fit duration)
    train_classifier(clf, X_train, y_train)

    # Score on the data the model was fitted on ...
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(
        *predict_labels(clf, X_train, y_train)))

    # ... and on the second, held-out set
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(
        *predict_labels(clf, X_test, y_test)))

In [74]:
# Baseline candidates, each constructed with its own fixed seed
clf_A = LogisticRegression(random_state=42)
clf_B = SVC(kernel='rbf', random_state=912)
clf_C = xgb.XGBClassifier(seed=82)

# NOTE(review): scores below are computed on X_test/y_test, while X_dev/y_dev
# are split off earlier and are not used in this part of the notebook.
# Consider comparing candidates on the dev split so the test set stays held
# out for the final results -- confirm intent.
for candidate in (clf_A, clf_B, clf_C):
    train_predict(candidate, X_train, y_train, X_test, y_test)
    print('')

Training a LogisticRegression using a training set size of 923. . .
Trained model in 0.4310 seconds
F1 score and accuracy score for training set: 0.5760 , 0.6024.
F1 score and accuracy score for test set: 0.4702 , 0.5000.

Training a SVC using a training set size of 923. . .
Trained model in 0.4514 seconds
F1 score and accuracy score for training set: 0.4294 , 0.4897.
F1 score and accuracy score for test set: 0.4226 , 0.4825.

Training a XGBClassifier using a training set size of 923. . .
Trained model in 6.3742 seconds
F1 score and accuracy score for training set: 1.0000 , 1.0000.
F1 score and accuracy score for test set: 0.4444 , 0.4561.



### Hyperparameters tuning

### Results on the test set

## Error analysis