# ML on Midfielders

In [176]:
relevant_forward_keys = ['togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total', 'red_cards', 
                         'minutes', 'successful_dribbles', 'own_goals', 'interception_total', 
                         'goals_scored', 'key_passes', 'accurateCrosses_total', 'aerialWon_total', 'assists', 
                         'points_above_replacement', 'name', 'age', 'ontarget_scoring_att', 'next_year']

relevant_midfield_keys = ['togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total', 'red_cards', 
                          'minutes', 'successful_dribbles', 'own_goals', 'interception_total', 'ontarget_scoring_att',
                          'goals_scored', 'key_passes', 'accurateCrosses_total', 'clean_sheets', 'aerialWon_total', 
                          'assists', 'points_above_replacement', 'name', 'age', 'next_year']

relevant_defense_keys = ['goals_conceded', 'togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total',
                         'red_cards', 'minutes', 'successful_dribbles', 'own_goals', 'name', 'ontarget_scoring_att',
                         'interception_total', 'goals_scored', 'key_passes', 'accurateCrosses_total', 'clean_sheets', 
                         'aerialWon_total', 'assists', 'points_above_replacement', 'clearance_total', 'age', 'next_year']

relevant_goalie_keys = ['goals_conceded', 'saves', 'togga_score', 'apps', 'tackle_total', 'yellow_cards', 'red_cards', 
                        'minutes', 'own_goals', 'interception_total', 'key_passes', 'clean_sheets', 'aerialWon_total', 
                        'penalties_saved', 'points_above_replacement', 'name', 'clearance_total', 'age', 'next_year']

In [177]:
import json
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [178]:
with open("Data/positional.json", "r") as infile:
    data = json.load(infile)

#We will do analysis on Midfielders
position = '3'
players = []
for player in data[position]:
    players.append(pd.DataFrame([player['history']['2015/16']]))
    players.append(pd.DataFrame([player['history']['2014/15']]))
    players.append(pd.DataFrame([player['history']['2013/14']]))
    players.append(pd.DataFrame([player['history']['2012/13']]))

players = pd.concat(players)
filter = players['togga_score'] > 0
players = players[filter]
keys = relevant_midfield_keys.copy()
keys.remove('points_above_replacement')
players = players[keys]
players = players.dropna(axis=1,how='all')
players = players.dropna(axis=0,how='any')

players = players.set_index("name")
players.index.name = None
keys.remove('name')
    

In [179]:
import matplotlib.pyplot as plt
# box and whisker plots of some of the stats
plot_keys = ['togga_score', 'goals_scored', 'assists', 'ontarget_scoring_att', 'aerialWon_total', 'minutes', 
             'successful_dribbles', 'age', 'next_year']
players[plot_keys].plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()

In [180]:
# histograms
players[plot_keys].hist()
plt.show()

## Linear Regressor

In [181]:
x_keys = keys.copy()
x_keys.remove('next_year')
X = players[x_keys]
Y = players['next_year']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=324)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_prediction = regressor.predict(X_test)
#X_test, y_prediction

RMSE = sqrt(mean_squared_error(y_true = Y_test, y_pred = Y_prediction))
print(RMSE)

125.52295927141816


## Decision Tree Regressor

In [182]:
regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, Y_train)
y_prediction = regressor.predict(X_test)
#y_prediction
#X_train.head()

RMSE = sqrt(mean_squared_error(y_true = Y_test, y_pred = y_prediction))
print(RMSE)

203.022851346872


## Classification ML

In [183]:
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [184]:
players["category"] = players["togga_score"]
for index, row in players.iterrows():
    if row['next_year'] > 0:
        ratio = row['togga_score'] / abs(row['next_year'])
        if  ratio < 0.8:
            val = -1
        elif ratio < 1.2:
            val = 0
        else:
            val = 1
        #print(ratio,val)
    else:
        #row[x_keys] = 0
        #forwards.drop(index, inplace=True)
        val = -10
    players.set_value(index,'category',val)

    
x_keys = keys.copy()
x_keys.remove('next_year')
x_keys.append("category")
filter = players['next_year'] > 0
players = players[filter]
array = players[x_keys].values
X = array[:,:-1]
Y = array[:,-1].astype(int)
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

seed = 7
scoring = 'accuracy'

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = cross_validation.KFold(len(X_train), n_folds=10, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.375500 (0.085095)
LDA: 0.379667 (0.089476)
KNN: 0.388167 (0.073119)
CART: 0.388500 (0.094556)
NB: 0.384167 (0.082369)
SVM: 0.417500 (0.094243)


In [185]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Predictions

In [197]:
new_data = []
for player in data[position]:
    new_data.append(pd.DataFrame([player['history']['2016/17']]))

new_data = pd.concat(new_data)
filter = new_data['togga_score'] > 0
new_data = new_data[filter]
new_data = new_data.dropna(axis=1,how='all')

All_predictions = []
All_predictions.append(new_data['name'].values.astype('str'))
new_data = new_data.set_index("name")
new_data.index.name = None
new_keys = keys.copy()
new_keys.remove('next_year')
new_data = new_data[new_keys]
new_data = new_data.dropna(axis=0,how='any')
new_X = new_data.values

# Make predictions on validation dataset
def makePredictions(model):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    predictions = model.predict(new_X)
    #for name, pred in zip(new_data.index.values, predictions):
        #print(name,pred)
    All_predictions.append(predictions)

## Decision Tree Predictions

In [198]:
makePredictions(DecisionTreeClassifier())

0.426229508197
[[ 0  1  0  1]
 [ 0  6  4  5]
 [ 0  2 14 10]
 [ 3  3  6  6]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.50      0.40      0.44        15
          0       0.58      0.54      0.56        26
          1       0.27      0.33      0.30        18

avg / total       0.45      0.43      0.44        61



## LDA Predictions

In [199]:
makePredictions(LinearDiscriminantAnalysis())

0.327868852459
[[ 0  0  0  2]
 [ 1  2  4  8]
 [ 0  7  8 11]
 [ 0  4  4 10]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.15      0.13      0.14        15
          0       0.50      0.31      0.38        26
          1       0.32      0.56      0.41        18

avg / total       0.35      0.33      0.32        61



## Gaussian NB Predictions

In [200]:
makePredictions(GaussianNB())

0.377049180328
[[ 0  1  0  1]
 [ 2  3  6  4]
 [ 3  3  9 11]
 [ 1  2  4 11]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.33      0.20      0.25        15
          0       0.47      0.35      0.40        26
          1       0.41      0.61      0.49        18

avg / total       0.40      0.38      0.38        61



## Logistic Regression Predictions

In [201]:
makePredictions(LogisticRegression())

0.311475409836
[[ 0  0  0  2]
 [ 1  1  4  9]
 [ 0  5  9 12]
 [ 1  4  4  9]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.10      0.07      0.08        15
          0       0.53      0.35      0.42        26
          1       0.28      0.50      0.36        18

avg / total       0.33      0.31      0.30        61



## KNN Predictions

In [202]:
makePredictions(KNeighborsClassifier())

0.327868852459
[[ 0  1  0  1]
 [ 0  4  5  6]
 [ 0  8 10  8]
 [ 0  7  5  6]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.20      0.27      0.23        15
          0       0.50      0.38      0.43        26
          1       0.29      0.33      0.31        18

avg / total       0.35      0.33      0.33        61



  'precision', 'predicted', average, warn_for)


## SVC Prediction

In [203]:
makePredictions(SVC())

0.295081967213
[[ 0  0  0  2]
 [ 0  0  0 15]
 [ 0  0  0 26]
 [ 0  0  0 18]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         2
         -1       0.00      0.00      0.00        15
          0       0.00      0.00      0.00        26
          1       0.30      1.00      0.46        18

avg / total       0.09      0.30      0.13        61



  'precision', 'predicted', average, warn_for)


In [204]:
from itertools import chain
chain = chain.from_iterable(zip(*All_predictions))
preds = list(chain)
all_preds = 'Name\tCART\tLDA\tGNB\tLR\tKNN\tSVC\n'
for i in range(int(len(preds)/7)):
    #print(preds[i*7], preds[i*7+1:i*7+7])
    all_preds+=(preds[i*7])
    for j in range(i*7+1, i*7+7):
        all_preds+='\t' + str(preds[j])
    all_preds+='\n'
#print(all_preds) 
all_preds.encode("utf-8")
with open("Data/Predictions/Midfield.txt", "w") as f:
    f.write(all_preds)

In [205]:
import csv

txt_file = r"Data/Predictions/Midfield.txt"
csv_file = r"Data/Predictions/Midfield.csv"

in_txt = csv.reader(open(txt_file, "r"), delimiter = '\t')
out_csv = csv.writer(open(csv_file, 'w', newline=''))
out_csv.writerows(in_txt)
del out_csv
del in_txt