# Naive match winner prediction from past victory data
We fit a logistic regression model to predict the result of a match from the earlier matches that the two teams have played this season.

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
# Load the England match records from the figshare data folder; the path is
# relative, so it resolves against the notebook's working directory.
matches_england = pd.read_json("../data/figshare/matches_England.json")


def extract_match_results(matches):
    """Build a date-sorted table of scores and signed team indicators.

    Parameters
    ----------
    matches : pandas.DataFrame
        One row per match. Only two columns are read: ``label``, formatted as
        ``"<team 1> - <team 2>, <score 1> - <score 2>"``, and ``dateutc``.

    Returns
    -------
    pandas.DataFrame
        One row per match, ordered by ``date``, with the columns

        * ``index`` -- position of the match in ``matches`` before sorting,
        * ``s1``, ``s2`` -- integer scores of the first / second listed team,
        * ``date`` -- the value taken from ``dateutc``,
        * one column per team name holding ``1`` where the team is listed
          first in the label, ``-1`` where it is listed second and ``0`` for
          matches it did not take part in.

        For empty input an empty frame with only the four bookkeeping columns
        is returned.

    Raises
    ------
    ValueError
        If a label does not follow the expected format.
    """
    def parse_match(label, date):
        # Split on the LAST comma only, so that a team name which itself
        # contains a comma does not break the two-way unpacking.
        teams, scores = label.strip().rsplit(',', 1)
        t1, t2 = teams.strip().split(' - ')
        s1, s2 = map(int, scores.strip().split(' - '))
        return {'s1': s1, 's2': s2, 'date': date, t1: 1, t2: -1}

    # Build plain records from the two columns that are actually needed
    # instead of running a row-wise DataFrame.apply.
    records = [parse_match(label, date)
               for label, date in zip(matches['label'], matches['dateutc'])]

    if not records:
        # Nothing to sort; without this guard sort_values would fail with an
        # unhelpful KeyError on the missing 'date' column.
        return pd.DataFrame(columns=['index', 's1', 's2', 'date'])

    results = pd.DataFrame(records)
    # reset_index() (without drop) keeps the pre-sort position as 'index'.
    results = results.sort_values(by='date', axis=0).reset_index()
    # Teams that did not play in a given match are missing from its record.
    results = results.fillna(0)

    return results



results = extract_match_results(matches_england)

# Every column other than the score/date/index bookkeeping ones is a team.
teamnames = [
    column
    for column in results.columns
    if column not in ('s1', 's2', 'date', 'index')
]

# Sanity check on the sort: print the position of any row whose date is
# earlier than the date of the row directly before it.
match_dates = results['date']
for position, (previous_date, current_date) in enumerate(
        zip(match_dates.iloc[:-1], match_dates.iloc[1:]), start=1):
    if current_date < previous_date:
        print(position)
    

In [5]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error as mse, log_loss

# Features: the signed per-team indicator columns.
# Target source: score difference s1 - s2 (binarised below as "s1 > s2").
X = results[teamnames].values
y = (results['s1'] - results['s2']).values
tscv = TimeSeriesSplit(n_splits=5)

# Fit on each training split and report accuracy on the matching test split.
for train_index, test_index in tscv.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    clf = LogisticRegression(random_state=23)
    clf.fit(X_train, y_train > 0)
    print("Logistic regression accuracy: ", clf.score(X_test, y_test > 0))

Logistic regression accuracy:  0.6825396825396826
Logistic regression accuracy:  0.7619047619047619
Logistic regression accuracy:  0.6349206349206349
Logistic regression accuracy:  0.6507936507936508
Logistic regression accuracy:  0.49206349206349204


In [13]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error as mse, log_loss

# Features: the signed per-team indicator columns.
# Target source: score difference s1 - s2 (binarised below as "s1 > s2").
X = results[teamnames].values
y = (results['s1'] - results['s2']).values
tscv = TimeSeriesSplit(n_splits=5)

# 20 values of C, evenly spaced in log10 between 1e-2 and 1e2.
C_values = 10**np.linspace(-2, 2, num=20)
scores = []  # scores[k] holds the per-split accuracies for C_values[k]

for C in C_values:
    score = []
    for train_index, test_index in tscv.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        clf = LogisticRegression(C=C, random_state=23)
        clf.fit(X_train, y_train > 0)
        score.append(clf.score(X_test, y_test > 0))
    scores.append(score)

# One line per C: the value followed by its per-split accuracies, each cut to
# the first five characters of its string form.
for C, score in zip(C_values, scores):
    print(str(C)[:5], [str(s)[:5] for s in score])

0.01 ['0.507', '0.571', '0.571', '0.492', '0.539']
0.016 ['0.507', '0.571', '0.571', '0.492', '0.571']
0.026 ['0.507', '0.603', '0.619', '0.523', '0.603']
0.042 ['0.507', '0.650', '0.666', '0.650', '0.603']
0.069 ['0.507', '0.698', '0.666', '0.634', '0.571']
0.112 ['0.571', '0.730', '0.666', '0.634', '0.523']
0.183 ['0.619', '0.682', '0.650', '0.650', '0.523']
0.297 ['0.619', '0.730', '0.650', '0.650', '0.507']
0.483 ['0.603', '0.761', '0.634', '0.650', '0.492']
0.784 ['0.682', '0.761', '0.634', '0.634', '0.492']
1.274 ['0.698', '0.761', '0.634', '0.650', '0.492']
2.069 ['0.698', '0.761', '0.634', '0.650', '0.492']
3.359 ['0.730', '0.746', '0.634', '0.650', '0.492']
5.455 ['0.730', '0.746', '0.634', '0.650', '0.492']
8.858 ['0.730', '0.746', '0.634', '0.650', '0.492']
14.38 ['0.698', '0.730', '0.634', '0.650', '0.492']
23.35 ['0.714', '0.730', '0.634', '0.650', '0.492']
37.92 ['0.698', '0.730', '0.634', '0.650', '0.492']
61.58 ['0.682', '0.730', '0.634', '0.650', '0.492']
100.0 ['0.682