# ML on Defenders

In [21]:
relevant_forward_keys = ['togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total', 'red_cards', 
                         'minutes', 'successful_dribbles', 'own_goals', 'interception_total', 
                         'goals_scored', 'key_passes', 'accurateCrosses_total', 'aerialWon_total', 'assists', 
                         'points_above_replacement', 'name', 'age', 'ontarget_scoring_att', 'next_year']

relevant_midfield_keys = ['togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total', 'red_cards', 
                          'minutes', 'successful_dribbles', 'own_goals', 'interception_total', 'ontarget_scoring_att',
                          'goals_scored', 'key_passes', 'accurateCrosses_total', 'clean_sheets', 'aerialWon_total', 
                          'assists', 'points_above_replacement', 'name', 'age', 'next_year']

relevant_defense_keys = ['goals_conceded', 'togga_score', 'apps', 'tackle_total', 'yellow_cards', 'dispossessed_total',
                         'red_cards', 'minutes', 'successful_dribbles', 'own_goals', 'name', 'ontarget_scoring_att',
                         'interception_total', 'goals_scored', 'key_passes', 'accurateCrosses_total', 'clean_sheets', 
                         'aerialWon_total', 'assists', 'points_above_replacement', 'clearance_total', 'age', 'next_year']

relevant_goalie_keys = ['goals_conceded', 'saves', 'togga_score', 'apps', 'tackle_total', 'yellow_cards', 'red_cards', 
                        'minutes', 'own_goals', 'interception_total', 'key_passes', 'clean_sheets', 'aerialWon_total', 
                        'penalties_saved', 'points_above_replacement', 'name', 'clearance_total', 'age', 'next_year']

In [22]:
import json
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [23]:
with open("Data/positional.json", "r") as infile:
    data = json.load(infile)

#We will do analysis on Defenders
position = '2'
players = []
for player in data[position]:
    players.append(pd.DataFrame([player['history']['2015/16']]))
    players.append(pd.DataFrame([player['history']['2014/15']]))
    players.append(pd.DataFrame([player['history']['2013/14']]))
    players.append(pd.DataFrame([player['history']['2012/13']]))

players = pd.concat(players)
filter = players['togga_score'] > 0
players = players[filter]
keys = relevant_defense_keys.copy()
keys.remove('points_above_replacement')
players = players[keys]
players = players.dropna(axis=1,how='all')
players = players.dropna(axis=0,how='any')

players = players.set_index("name")
players.index.name = None
keys.remove('name')
    

In [24]:
import matplotlib.pyplot as plt
# box and whisker plots of some of the stats
plot_keys = ['togga_score', 'goals_scored', 'assists', 'ontarget_scoring_att', 'aerialWon_total', 'minutes', 
             'successful_dribbles', 'age', 'next_year']
players[plot_keys].plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()

In [25]:
# histograms
players[plot_keys].hist()
plt.show()

## Linear Regressor

In [26]:
x_keys = keys.copy()
x_keys.remove('next_year')
X = players[x_keys]
Y = players['next_year']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=324)
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
Y_prediction = regressor.predict(X_test)
#X_test, y_prediction

RMSE = sqrt(mean_squared_error(y_true = Y_test, y_pred = Y_prediction))
print(RMSE)

125.8588994987071


## Decision Tree Regressor

In [27]:
regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, Y_train)
y_prediction = regressor.predict(X_test)
#y_prediction
#X_train.head()

RMSE = sqrt(mean_squared_error(y_true = Y_test, y_pred = y_prediction))
print(RMSE)

145.68400986095443


## Classification ML

In [28]:
from pandas.tools.plotting import scatter_matrix
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [29]:
players["category"] = players["togga_score"]
for index, row in players.iterrows():
    if row['next_year'] > 0:
        ratio = row['togga_score'] / abs(row['next_year'])
        if  ratio < 0.8:
            val = -1
        elif ratio < 1.2:
            val = 0
        else:
            val = 1
        #print(ratio,val)
    else:
        #row[x_keys] = 0
        #forwards.drop(index, inplace=True)
        val = -10
    players.set_value(index,'category',val)

    
x_keys = keys.copy()
x_keys.remove('next_year')
x_keys.append("category")
filter = players['next_year'] > 0
players = players[filter]
array = players[x_keys].values
X = array[:,:-1]
Y = array[:,-1].astype(int)
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

seed = 7
scoring = 'accuracy'

models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = cross_validation.KFold(len(X_train), n_folds=10, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.336250 (0.102504)
LDA: 0.355000 (0.078102)
KNN: 0.315833 (0.077478)
CART: 0.361667 (0.078125)
NB: 0.315000 (0.121209)
SVM: 0.412500 (0.133814)


In [30]:
# Compare Algorithms
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Predictions

In [40]:
new_data = []
for player in data[position]:
    new_data.append(pd.DataFrame([player['history']['2016/17']]))

new_data = pd.concat(new_data)
filter = new_data['togga_score'] > 0
new_data = new_data[filter]
new_data = new_data.dropna(axis=1,how='all')

All_predictions = []
All_predictions.append(new_data['name'].values.astype('str'))
new_data = new_data.set_index("name")
new_data.index.name = None
new_keys = keys.copy()
new_keys.remove('next_year')
new_data = new_data[new_keys]
new_data = new_data.dropna(axis=0,how='any')
new_X = new_data.values

# Make predictions on validation dataset
def makePredictions(model):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_validation)
    print(accuracy_score(Y_validation, predictions))
    print(confusion_matrix(Y_validation, predictions))
    print(classification_report(Y_validation, predictions))
    predictions = model.predict(new_X)
    #for name, pred in zip(new_data.index.values, predictions):
        #print(name,pred)
    All_predictions.append(predictions)

## Decision Tree Predictions

In [41]:
makePredictions(DecisionTreeClassifier())

0.307692307692
[[0 1 1 1]
 [0 4 3 3]
 [2 5 6 4]
 [1 4 2 2]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         3
         -1       0.29      0.40      0.33        10
          0       0.50      0.35      0.41        17
          1       0.20      0.22      0.21         9

avg / total       0.34      0.31      0.31        39



## LDA Predictions

In [42]:
makePredictions(LinearDiscriminantAnalysis())

0.538461538462
[[ 1  0  2  0]
 [ 0  7  3  0]
 [ 0  2 10  5]
 [ 0  1  5  3]]
             precision    recall  f1-score   support

        -10       1.00      0.33      0.50         3
         -1       0.70      0.70      0.70        10
          0       0.50      0.59      0.54        17
          1       0.38      0.33      0.35         9

avg / total       0.56      0.54      0.54        39



## Gaussian NB Predictions

In [43]:
makePredictions(GaussianNB())

0.282051282051
[[1 1 1 0]
 [3 2 3 2]
 [7 1 6 3]
 [2 1 4 2]]
             precision    recall  f1-score   support

        -10       0.08      0.33      0.12         3
         -1       0.40      0.20      0.27        10
          0       0.43      0.35      0.39        17
          1       0.29      0.22      0.25         9

avg / total       0.36      0.28      0.30        39



## Logistic Regression Predictions

In [44]:
makePredictions(LogisticRegression())

0.564102564103
[[ 0  0  3  0]
 [ 0  7  3  0]
 [ 0  2 12  3]
 [ 0  1  5  3]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         3
         -1       0.70      0.70      0.70        10
          0       0.52      0.71      0.60        17
          1       0.50      0.33      0.40         9

avg / total       0.52      0.56      0.53        39



  'precision', 'predicted', average, warn_for)


## KNN Predictions

In [45]:
makePredictions(KNeighborsClassifier())

0.358974358974
[[1 1 1 0]
 [0 3 4 3]
 [1 4 8 4]
 [0 2 5 2]]
             precision    recall  f1-score   support

        -10       0.50      0.33      0.40         3
         -1       0.30      0.30      0.30        10
          0       0.44      0.47      0.46        17
          1       0.22      0.22      0.22         9

avg / total       0.36      0.36      0.36        39



## SVC Prediction

In [46]:
makePredictions(SVC())

0.435897435897
[[ 0  0  3  0]
 [ 0  0 10  0]
 [ 0  0 17  0]
 [ 0  0  9  0]]
             precision    recall  f1-score   support

        -10       0.00      0.00      0.00         3
         -1       0.00      0.00      0.00        10
          0       0.44      1.00      0.61        17
          1       0.00      0.00      0.00         9

avg / total       0.19      0.44      0.26        39



  'precision', 'predicted', average, warn_for)


In [47]:
from itertools import chain
chain = chain.from_iterable(zip(*All_predictions))
preds = list(chain)
all_preds = 'Name\tCART\tLDA\tGNB\tLR\tKNN\tSVC\n'
for i in range(int(len(preds)/7)):
    #print(preds[i*7], preds[i*7+1:i*7+7])
    all_preds+=(preds[i*7])
    for j in range(i*7+1, i*7+7):
        all_preds+='\t' + str(preds[j])
    all_preds+='\n'
#print(all_preds) 
all_preds.encode("utf-8")
with open("Data/Predictions/Defense.txt", "w") as f:
    f.write(all_preds)

In [48]:
import csv

txt_file = r"Data/Predictions/Defense.txt"
csv_file = r"Data/Predictions/Defense.csv"

in_txt = csv.reader(open(txt_file, "r"), delimiter = '\t')
out_csv = csv.writer(open(csv_file, 'w', newline=''))
out_csv.writerows(in_txt)
del out_csv
del in_txt