### Some markdown is never a bad idea


In [1]:
import pandas as pd
import numpy as np
import time
# visualization modules
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Machine learning models
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pickle

In [2]:
# Reading the file from disk
matches = pd.read_csv('data/seasons_merged.csv')
letter_to_result = {'H': 1, 'D': 0, 'A': -1}

matches

Unnamed: 0.1,Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HY,AY,HR,AR,B365H,B365D,B365A,BbAvH,BbAvD,BbAvA
0,0,2009-08-15,Aston Villa,Wigan,0,2,-1,0,1,A,...,2,2,0,0,1.67,3.60,5.50,1.66,3.51,5.33
1,1,2009-08-15,Blackburn,Man City,0,2,-1,0,1,A,...,2,1,0,0,3.60,3.25,2.10,3.37,3.24,2.12
2,2,2009-08-15,Bolton,Sunderland,0,1,-1,0,1,A,...,2,1,0,0,2.25,3.25,3.25,2.24,3.20,3.15
3,3,2009-08-15,Chelsea,Hull,2,1,1,1,1,D,...,1,2,0,0,1.17,6.50,21.00,1.17,6.26,16.39
4,4,2009-08-15,Everton,Arsenal,1,6,-1,0,3,A,...,0,0,0,0,3.20,3.25,2.30,3.07,3.21,2.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,3795,2019-12-05,Liverpool,Wolves,2,0,1,1,0,H,...,0,2,0,0,1.30,6.00,11.00,1.30,5.62,10.17
3796,3796,2019-12-05,Man United,Cardiff,0,2,-1,0,1,A,...,3,3,0,0,1.28,6.50,11.00,1.28,6.18,10.10
3797,3797,2019-12-05,Southampton,Huddersfield,1,1,0,1,0,H,...,0,1,0,0,1.44,4.75,8.50,1.42,4.81,7.64
3798,3798,2019-12-05,Tottenham,Everton,2,2,0,1,0,H,...,0,2,0,0,2.20,3.50,3.50,2.09,3.51,3.58


In [3]:
def get_n_last_matches(matches, date, team, n = 10):
    '''
    Get the last n matches of a given team.
    '''
    # All matches with a given team
    team_matches = matches[(matches['HomeTeam'] == team) | (matches['AwayTeam'] == team)]
    
    #Filter n last matches from team matches
    n_last_matches = (team_matches[team_matches.Date < date]
                      .sort_values(by = 'Date', ascending = False)
                      .iloc[0:n,:])
    
    return n_last_matches


def get_n_last_matches_against_each_other(matches, date, home_team, away_team, n = 5):
    ''' 
    Get the last n matches of two given teams. If possible, else
    get all matches available
    '''
    
    home_matches = matches[(matches['HomeTeam'] == home_team) & (matches['AwayTeam'] == away_team)]    
    away_matches = matches[(matches['HomeTeam'] == away_team) & (matches['AwayTeam'] == home_team)]  
    total_matches = pd.concat([home_matches, away_matches])
    
    # Get last n matches, if possible:
    try:    
        last_matches = (total_matches[total_matches.Date < date]
                        .sort_values(by = 'Date', ascending = False)
                        .iloc[0:n,:])
    except:  # If there are less than n matches
        last_matches = (total_matches[total_matches.Date < date]
                        .sort_values(by = 'Date', ascending = False)
                        .iloc[0:total_matches.shape[0],:])

    return last_matches


def get_goals(matches, team):
    '''
    Get total number of goals,a specfic team has scored 
    from a dataframe of specific matches
    '''
    home_goals = matches.FTHG[matches.HomeTeam == team].sum()
    away_goals = matches.FTAG[matches.AwayTeam == team].sum()

    return home_goals + away_goals

def get_concealed_goals(matches, team):
    '''
    Get all the goals, concealed of a specfic team from a dataframe of specific matches
    '''
    home_goals = matches.FTAG[matches.HomeTeam == team].sum()
    away_goals = matches.FTHG[matches.AwayTeam == team].sum()

    return home_goals + away_goals

def get_wins(matches, team):
    '''
    Get the number of wins of a specfic team from a dataframe of specific matches.
    '''
    home_wins = matches[(matches.FTR == 1) & (matches.HomeTeam == team)].shape[0]
    away_wins = matches[(matches.FTR == -1) & (matches.AwayTeam == team)].shape[0]

    return home_wins + away_wins


def coefficients_to_probability(matches):
    '''
    Converts betting platform coefficient(1 < x) with % of income
    into a probability coefficient(0 < x < 1)
    '''
    # How many profit betting companies make on bets
    matches['profit_B365'] = sum((1/matches['B365H'], 1/matches['B365D'], 1/matches['B365A']))
    matches['profit_BbAv'] = sum((1/matches['BbAvA'], 1/matches['BbAvD'], 1/matches['BbAvH']))

    # Converting all betting coefficients into probabilities of homw/draw/away:
    for betting_column in ['B365H', 'B365D', 'B365A', 'BbAvH', 'BbAvD', 'BbAvA']:
        matches[betting_column] = 1 / (matches[betting_column] * matches['profit_' + betting_column[:-1]])

    return matches
# get_n_last_matches_against_each_other(matches, '2020-12-15', 'Liverpool', 'Tottenham', 10)


In [4]:
# Create features, based on which, the model would train and predict results

def get_features_for_match(match, matches, n1=10, n2=3):
    '''
    Creates a special set of features for each match, if possible(10 last matches
    and 3 last matches against each other)
    '''
    match_date = match.Date
    home_team = match.HomeTeam
    away_team = match.AwayTeam
    # Get n1 last matches of 2 teams
    home_last = get_n_last_matches(matches, match_date, home_team, n=n1)
    away_last = get_n_last_matches(matches, match_date, away_team, n=n1)
    # Get last n2 matches against each other
    home_last_against = get_n_last_matches_against_each_other(matches, match_date, home_team, away_team, n=n2)
    away_last_against = get_n_last_matches_against_each_other(matches, match_date, away_team, home_team, n=n2)
    # Goals stuff
    home_goals = get_goals(home_last, home_team)
    away_goals = get_goals(away_last, away_team)
    home_goals_conceided = get_concealed_goals(home_last, home_team)
    away_goals_conceided = get_concealed_goals(away_last, away_team)

    res = pd.DataFrame()
    res.loc[0, 'H_goal_diff'] = home_goals - home_goals_conceided
    res.loc[0, 'A_goal_diff'] = away_goals - away_goals_conceided
    res.loc[0, 'H_win'] = get_wins(home_last, home_team) 
    res.loc[0, 'A_win'] = get_wins(away_last, away_team)
    res.loc[0, 'H_win_against'] = get_wins(home_last_against, home_team)
    res.loc[0, 'A_win_against'] = get_wins(away_last_against, away_team)
    # TODO ПОПРООБУВАТИ ЩЕ ЯКІСЬ КРИТЕРІЇ ПОТЕСТУВАТИ
#     print(result.loc[0])
    return res.loc[0]


def create_features(matches):
    '''
    Iterate throu all matches, create features for every single of them
    if possible and aggregate them together
    '''
    print('Generating features... Please wait for one or two minutes')
    # Creates dataframe with features for all matches
    matches_features = matches.apply(lambda x: get_features_for_match(x, matches, n1=20, n2=3), axis=1)
    return matches_features

def explore_features(features, inputs, path):
    '''
    Explore data by plotting KDE graphs.
    '''
    fig = plt.figure(1)
    fig.subplots_adjust(bottom= -1, left=0.025, top = 2, right=0.975)
    #Loop through features    
    i = 1
    for col in features.columns:
        sns.set_style("whitegrid")
        sns.set_context("paper", font_scale = 0.5, rc={"lines.linewidth": 1})
        plt.subplot(5,5,0 + i)
        j = i - 1
        #Plot KDE for all labels
        try:
            sns.distplot(inputs[inputs['label'] == 'H'].iloc[:,j], hist = False, label = 'Home')
            sns.distplot(inputs[inputs['label'] == 'D'].iloc[:,j], hist = False, label = 'Draw')
            sns.distplot(inputs[inputs['label'] == 'A'].iloc[:,j], hist = False, label = 'Away')
            plt.legend();
        except ValueError as e:
            print(e)
        i = i + 1
    #Define plot format    
    DefaultSize = fig.get_size_inches()
    fig.set_size_inches((DefaultSize[0]*1.2, DefaultSize[1]*1.2))
    plt.show()
    #Compute and print label weights
    labels = inputs.loc[:,'label']
    class_weights = labels.value_counts() / len(labels)
    print(class_weights)
    #Store description of all features
    feature_details = features.describe().transpose()
    #Return feature details
    return feature_details

In [5]:
# Getting features ready

# If you want to create features instead of reading them from file, uncomment:

# time_start = time.time()
# features = create_features(matches)
# print('Generated features in', time.time() - time_start, 'sec.')
# features.iloc[100:].to_csv('data/features.csv')

# Reading files from the file
features = pd.read_csv('data/features.csv')
features = features.drop('Unnamed: 0', axis=1)




### Printing statistics to see the correlations:
```python3
statistic = explore_features(features, inputs, '/soccer/')
statistic
```

In [6]:
# Determining labels, which we want to train on, and then predict them

labels = matches.loc[:, 'HTR']
labels.name = 'label'
labels = labels.iloc[100:]
# coefficients = coefficients_to_probability(matches)[['BbAvH','BbAvD','BbAvA']]
# features = pd.merge(features, coefficients, left_index=True, right_index=True)
# statistic = explore_features(features, features.deo, '/soccer/')
# statistic

labels


100     D
101     D
102     H
103     H
104     A
       ..
3795    H
3796    A
3797    H
3798    H
3799    A
Name: label, Length: 3700, dtype: object

# Machine learning part 


In [19]:
# Splitting the data into training and testing parts
# Standardising the data.
from sklearn.preprocessing import scale

#Center to the mean and component wise scale to unit variance.

X_train, X_test, y_train, y_test = train_test_split(features, labels, 
                                                    test_size = 400,
                                                    random_state = 2,
                                                    stratify = labels)

# Classifiers
# Defyning classifiers:

clf_A = LogisticRegression(random_state = 42, max_iter=1200000)
clf_B = SVC(random_state = 912, kernel='rbf', max_iter=1200000)
clf_C = xgb.XGBClassifier(seed = 82)

# After maxyzizing, the following parameters proved to behave the best:
clf_boosted = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.4, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=3, missing=None, n_estimators=40, nthread=-1,
       objective='binary:logistic', reg_alpha=1e-05, reg_lambda=1,
       scale_pos_weight=1, seed=2, silent=True, subsample=0.8)

In [32]:
from sklearn.metrics import f1_score

def train_classifier(clf, X_train, y_train):
    ''' 
    Fits and overall trains a classifier to the training data. 
    '''
    
    # Start the clock, train the classifier, then stop the clock
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    
    # Print the results
    print(f"Trained model in {end - start} seconds")

    
def predict_labels(clf, features, target):
    '''
    Makes predictions using a fit classifier based on F1 score. 
    '''
    
    # Start the clock, make predictions, then stop the clock
    start = time.time()
    y_pred = clf.predict(features)
    
    end = time.time()
    # Print and return results
    print(f"Predicted in {end - start} seconds")
    
    return f1_score(target, y_pred, pos_label='H'), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predi


    
def predict_labels(clf, features, target):
    '''
    Makes predictions using a fit classifier based on F1 score. 
    '''
    
    # Start the clock, make predictions, then stop the clock
    start = time.time()
    y_pred = clf.predict(features)
    
    end = time.time()
    # Print and return results
    print(f"Predicted in {end - start} seconds")
#     print(f1_score(target, y_pred, pos_label=1))
    return f1_score(target, y_pred, pos_label=1), sum(target == y_pred) / float(len(y_pred))


def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train and predict using a classifer based on F1 score. '''
    
    # Indicate the classifier and the training set size
    print(f"Training a {clf.__class__.__name__} using a training set size of {len(X_train)}")
    
 # Train the classifier
    train_classifier(clf, X_train, y_train)
    
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print(f"Accyracy on training set: {acc}.")
    
    f1, acc = predict_labels(clf, X_test, y_test)
    print(f"Accyracy on test set: {acc}.")




In [33]:
# y_train = y_train.apply(lambda x: 1 if x == 'H' else 0)
# y_test = y_test.apply(lambda x: 1 if x == 'H' else 0)

y_test[:50]

2447    0
2316    0
3250    0
3673    0
2241    1
2933    1
2171    1
313     0
2892    1
3068    0
2423    0
1087    0
102     1
3308    0
602     1
1879    0
1214    0
126     1
363     1
1003    1
1215    0
1886    0
2193    0
3287    0
2701    1
258     0
1569    1
1483    0
2525    0
1260    1
851     0
959     1
435     0
3793    0
2279    1
2475    0
1836    0
755     1
2471    0
2975    0
2234    0
3428    1
2686    0
2705    1
3532    1
326     0
677     0
3765    0
496     0
615     1
Name: label, dtype: int64

In [34]:




# This are developer tools, uncomment them and play around if you want to
train_predict(clf_A, X_train, y_train, X_test, y_test)
print()
train_predict(clf_B, X_train, y_train, X_test, y_test)
print()
train_predict(clf_C, X_train, y_train, X_test, y_test)
print()
clf_A.fit(X_train, y_train)
y_pred = clf_A.predict(X_test)


y_pred  
#     end = time.time()
#     # Print and return results
#     print(f"Predicted in {end - start} seconds")
# #     print(f1_score(target, y_pred, pos_label=1))
#     print(y_pred == target)



# I will save the model after training:
# with open('models/finalized_model.sav', 'w') as f:
#     pass

# Main training of the model
# start = time.time()
# print('training home model')
# clf_boosted.fit(X_train, y_train)
# print(f"took {time.time() - start}")
# start = time.time()



# # save the models to disk
# pickle.dump(clf_boosted, open('models/finalized_model.sav', 'wb'))
# y_train

Training a LogisticRegression using a training set size of 3300
Trained model in 0.020256996154785156 seconds
Predicted in 0.0015742778778076172 seconds
F1 score____training set: 0.32543640897755616 , 0.6721212121212121.
Predicted in 0.0012569427490234375 seconds
F1 score____test set: 0.2717391304347826 , 0.665.

Training a SVC using a training set size of 3300
Trained model in 0.3087122440338135 seconds
Predicted in 0.11039113998413086 seconds
F1 score____training set: 0.27437794216543376 , 0.673030303030303.
Predicted in 0.014837265014648438 seconds
F1 score____test set: 0.20118343195266275 , 0.6625.

Training a XGBClassifier using a training set size of 3300
Trained model in 0.07891535758972168 seconds
Predicted in 0.00472712516784668 seconds
F1 score____training set: 0.7652700656234225 , 0.8590909090909091.
Predicted in 0.0010986328125 seconds
F1 score____test set: 0.3348017621145374 , 0.6225.



array([0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,

In [12]:
# load the model from disk
clf_boosted = pickle.load(open('models/finalized_model.sav', 'rb'))

# make a prediction

to_predict = X_test.iloc[-16:-15]

print(to_predict)

away_win, draw, home_win = clf_boosted.predict_proba(to_predict)[0]

print('Away win: ', away_win)
print('Draw: ', draw)
print('Home win: ', home_win)


      H_goal_diff  A_goal_diff  H_win  A_win  H_win_against  A_win_against
2553        -17.0         30.0    4.0   13.0            1.0            2.0
Away win:  0.49163887
Draw:  0.37716013
Home win:  0.13120104


### Методом спроб і помилок я визначив, що під цю проблему найкраще підходить XGBClassifier. 


In [31]:




# TODO: Import 'GridSearchCV' and 'make_scorer'
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import sklearn


# TODO: Create the parameters list you wish to tune
parameters = { 'learning_rate' : [0.1],
               'n_estimators' : [40],
               'max_depth': [3],
               'min_child_weight': [3],
               'gamma':[0.4],
               'subsample' : [0.8],
               'colsample_bytree' : [0.8],
               'scale_pos_weight' : [1],
               'reg_alpha':[1e-5]
             }  

# TODO: Initialize the classifier
clf = xgb.XGBClassifier(seed=2)

# TODO: Make an f1 scoring function using 'make_scorer' 
f1_scorer = make_scorer(f1_score,pos_label=1)

# TODO: Perform grid search on the classifier using the f1_scorer as the scoring method
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)

# TODO: Fit the grid search object to the training data and find the optimal parameters
grid_obj = grid_obj.fit(X_train,y_train)

# Get the estimator
clf = grid_obj.best_estimator_
# print(clf)

# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
    
# f1, acc = predict_labels(clf, X_test, y_test)
# print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))



Predicted in 0.0018160343170166016 seconds


TypeError: unsupported format string passed to numpy.ndarray.__format__