In [17]:
import pickle
import pandas as pd

# Load the prepared match DataFrame from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted step of this project.
# The context manager guarantees the file handle is closed after loading.
with open("prepped_match_df.pck", "rb") as match_file:
    match_df = pickle.load(match_file)

In [18]:
from sklearn.preprocessing import *

# Fit a label encoder on the "resultClass" column and replace the column with
# its integer codes. `enc` is kept so that codes can be decoded again later.
# NOTE(review): this overwrites the column in place, so running the cell twice
# would fit the encoder on already-encoded values.
enc = LabelEncoder().fit(match_df["resultClass"])
match_df["resultClass"] = enc.transform(match_df["resultClass"])

In [51]:
from sklearn.model_selection import train_test_split
# Hold out the last 20% of rows as the test set; shuffle=False keeps the
# original row order, so no rows are mixed between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    match_df.drop(columns="resultClass"),
    match_df["resultClass"],
    test_size=0.2,
    shuffle=False,
)

ValueError: Stratified train/test split is not implemented for shuffle=False

In [20]:
from sklearn.metrics import make_scorer

def score(y_true, y_pred, *, encoder=None, matches_per_season=306, **kwargs):
    """Score predicted match results against the true results.

    Each label is decoded to a ``"home:away"`` string and every prediction earns:

    * 5 points for the exact result,
    * 3 points for the correct goal difference (this includes any draw
      predicted for a draw),
    * 1 point for the correct winner only,
    * 0 points otherwise.

    The point total is then rescaled to ``matches_per_season`` matches and rounded.

    Parameters
    ----------
    y_true, y_pred : sequence
        Encoded true and predicted result classes.
    encoder : object with ``inverse_transform``, optional
        Decoder for the labels. Defaults to the notebook-level ``enc``.
    matches_per_season : int, default 306
        Number of matches the total is normalised to.
    **kwargs
        Accepted and ignored, so the function can be wrapped by ``make_scorer``.

    Returns
    -------
    int
        Rounded, normalised point total; ``0`` when there is nothing to score.
    """
    label_encoder = enc if encoder is None else encoder
    true_results = [label.split(":") for label in label_encoder.inverse_transform(y_true)]
    pred_results = [label.split(":") for label in label_encoder.inverse_transform(y_pred)]

    # Guard against a division by zero in the normalisation below.
    if not true_results:
        return 0

    total_points = 0
    for true, pred in zip(true_results, pred_results):
        if true[0] == pred[0] and true[1] == pred[1]:
            total_points += 5
            continue
        # Compare goals numerically: as strings, "10" would sort before "2"
        # and double-digit results would get the wrong tendency.
        true_diff = int(true[0]) - int(true[1])
        pred_diff = int(pred[0]) - int(pred[1])
        if true_diff == pred_diff:
            total_points += 3
        elif (true_diff > 0 and pred_diff > 0) or (true_diff < 0 and pred_diff < 0):
            total_points += 1
    return round(total_points / (len(true_results) / matches_per_season))

# Wrap `score` as a scikit-learn scorer object; larger values count as better.
kicktipp_scorer = make_scorer(
    score,
    greater_is_better=True,
)

In [21]:
from sklearn.model_selection import TimeSeriesSplit

# Time-ordered cross-validation splitter with five splits, shared by the
# hyper-parameter searches below.
tscv = TimeSeriesSplit(
    n_splits=5,
)

# KNN Classifier
- N neighbors: 41
- CV score: 338.4

**Score: 322**

In [22]:
%%time

import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Cross-validate a k-nearest-neighbours classifier over the candidate neighbour
# counts (a single value here), then predict on the held-out split.
parameters = dict(n_neighbors=[41])
knn = KNeighborsClassifier()

clf = GridSearchCV(
    estimator=knn,
    param_grid=parameters,
    scoring=kicktipp_scorer,
    cv=tscv,
    verbose=10,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Displayed as (held-out score, best parameters, best cross-validation score).
score(y_test, y_pred), clf.best_params_, clf.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START n_neighbors=41..............................................
[CV 1/5; 1/1] END .............n_neighbors=41;, score=351.000 total time=   0.1s
[CV 2/5; 1/1] START n_neighbors=41..............................................
[CV 2/5; 1/1] END .............n_neighbors=41;, score=307.000 total time=   0.1s
[CV 3/5; 1/1] START n_neighbors=41..............................................
[CV 3/5; 1/1] END .............n_neighbors=41;, score=362.000 total time=   0.0s
[CV 4/5; 1/1] START n_neighbors=41..............................................
[CV 4/5; 1/1] END .............n_neighbors=41;, score=338.000 total time=   0.0s
[CV 5/5; 1/1] START n_neighbors=41..............................................
[CV 5/5; 1/1] END .............n_neighbors=41;, score=332.000 total time=   0.0s
CPU times: user 809 ms, sys: 2.34 s, total: 3.15 s
Wall time: 367 ms


(318, {'n_neighbors': 41}, 338.0)

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyper-parameters and a fixed seed,
# scored on the held-out split for later comparison.
base_model = RandomForestClassifier(n_jobs=-1, random_state=2550).fit(X_train, y_train)
y_pred = base_model.predict(X_test)
base_score = score(y_test, y_pred)
base_score

301

In [53]:
%%time
selected_clf = RandomForestClassifier(n_jobs=-1, random_state=1337)

# Pinned hyper-parameters (presumably the winner of an earlier search — confirm).
# Each value is a one-element list so the dict is a valid one-point grid.
best_par = {
    "n_estimators": [540],
    "max_features": ["log2"],
    "max_depth": [10],
    "criterion": ["gini"],
}

# Full search space, used only when `run_random_search` is switched on.
parameters = {
    "n_estimators": np.arange(1, 1000, 1),
    "max_depth": np.arange(2, len(X_train.columns), 1),
    "max_features": ["sqrt", "log2"],
    "criterion": ["gini", "entropy"],
}

# Previously the one-point `best_par` grid was passed to RandomizedSearchCV with
# n_iter=200: scikit-learn warns that the space is smaller than n_iter and runs
# a single candidate, while `parameters` was never used. Make the choice explicit.
run_random_search = False
if run_random_search:
    # Seeded so that a re-run samples the same 200 candidates.
    clf = RandomizedSearchCV(
        selected_clf,
        parameters,
        scoring=kicktipp_scorer,
        cv=tscv,
        verbose=10,
        n_jobs=-1,
        n_iter=200,
        random_state=1337,
    )
else:
    # Exhaustive search over the single pinned combination: this just
    # cross-validates it, exactly one candidate as before, without the warning.
    clf = GridSearchCV(
        selected_clf,
        best_par,
        scoring=kicktipp_scorer,
        cv=tscv,
        verbose=10,
        n_jobs=-1,
    )
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Displayed as (baseline score, held-out score, best parameters, best CV score).
base_score, score(y_test, y_pred), clf.best_params_, clf.best_score_

Fitting 5 folds for each of 1 candidates, totalling 5 fits




CPU times: user 2.9 s, sys: 465 ms, total: 3.37 s
Wall time: 2.87 s


(301,
 343,
 {'n_estimators': 540,
  'max_features': 'log2',
  'max_depth': 10,
  'criterion': 'gini'},
 344.2)

In [48]:
from tqdm import tqdm

# Train 100 forests with the pinned hyper-parameters and keep the one with the
# highest score on the held-out split.
# NOTE(review): candidates are ranked on the test split itself, so `best_score`
# is an optimistic estimate of real performance — confirm this is intended.
best_score = 0
best_clf = None
best_seed = None

for seed in tqdm(range(100)):
    # A distinct, explicit seed per candidate makes the winning model
    # reproducible; without one, the selected forest could not be rebuilt.
    clf = RandomForestClassifier(
        n_estimators=540,
        max_features='log2',
        max_depth=10,
        criterion="gini",
        n_jobs=-1,
        random_state=seed,
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    clf_score = score(y_test, y_pred)
    if clf_score > best_score:
        best_score = clf_score
        best_clf = clf
        best_seed = seed  # remember which seed produced the best model

best_score

100%|██████████| 100/100 [01:12<00:00,  1.39it/s]


360

In [49]:
# Persist the selected classifier and the label encoder; off by default so a
# plain re-run of the notebook does not overwrite existing files.
saving = False
if saving:
    # Context managers flush and close each file even if pickling fails.
    with open("classifier.pck", "wb") as classifier_file:
        pickle.dump(best_clf, classifier_file)
    with open("encoder.pck", "wb") as encoder_file:
        pickle.dump(enc, encoder_file)