In [67]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn import preprocessing, linear_model 
from matplotlib import pyplot as plt
import time

# Read the training files (all CSVs share the ISO-8859-1 encoding)
train_epl00, train_epl01, train_epl02 = [
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in ('./data/EPL00_01.csv', './data/EPL01_02.csv', './data/EPL02_03.csv')
]

# Read the testing files
test_epl03, test_epl04 = [
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in ('./data/EPL03_04.csv', './data/EPL04_05.csv')
]

# Stack the per-file frames row-wise into one training and one testing frame,
# renumbering the row index from 0 in each
df_train = pd.concat([train_epl00, train_epl01, train_epl02], axis=0, ignore_index=True)
df_test = pd.concat([test_epl03, test_epl04], axis=0, ignore_index=True)

In [68]:
# Row counts of the concatenated training and test frames
num_train, num_test = len(df_train), len(df_test)

print(str(num_train) + " records read from multiple training files")
print(str(num_test) + " records read from multiple test files")

1140 records read from multiple training files
760 records read from multiple test files


In [69]:
# Calculate points from FTR (Full time Result)
def points(set_type):
    ''' Add a 'Result' column with the points implied by each 'FTR' value.

    The mapping is 'H' -> 3.0, 'A' -> 0.0 and every other value (including
    a missing one) -> 1.0.

    Parameters:
        set_type: DataFrame with an 'FTR' column. It is modified in place;
                  an existing 'Result' column is overwritten.

    Returns:
        None.

    Raises:
        KeyError: if set_type has no 'FTR' column.
    '''
    ftr = set_type['FTR']
    # Vectorised replacement for the former per-row loop: that loop relied on
    # DataFrame.set_value / Series.iteritems / np.NaN, which no longer exist in
    # current pandas / NumPy releases.
    set_type['Result'] = np.select(
        [(ftr == 'H').values, (ftr == 'A').values],
        [3.0, 0.0],
        default=1.0,  # anything that is neither 'H' nor 'A'
    )

# Calculate the difference between two columns (e.g. goal difference)
def diff(set_type, col1, col2, new_col):
    ''' Store the element-wise difference col1 - col2 in column new_col.

    Parameters:
        set_type: DataFrame to modify in place.
        col1:     name of the column to subtract from.
        col2:     name of the column that is subtracted.
        new_col:  name of the column receiving the result (created, or
                  overwritten if it already exists).

    Returns:
        None.

    Raises:
        KeyError: if col1 or col2 is not a column of set_type.
    '''
    # The difference is computed before new_col is written, so new_col may
    # safely name one of the input columns. No NaN pre-fill is needed (the
    # former one used np.NaN, which NumPy 2.0 removed).
    set_type[new_col] = set_type[col1] - set_type[col2]

# Add the points column ('Result') to the training frame, then the testing frame
for frame in (df_train, df_test):
    points(frame)

#: Difference columns to derive, as (first column, second column, new column);
#: each new column holds first - second:
#:   FTGD - full time goal difference
#:   HTGD - half time goal difference
#:   SOTD - shots on target difference
#:   STD  - shots taken overall difference
diff_specs = (
    ('FTHG', 'FTAG', 'FTGD'),
    ('HTHG', 'HTAG', 'HTGD'),
    ('HST', 'AST', 'SOTD'),
    ('HS', 'AS', 'STD'),
)

# Apply every difference to the training frame, then to the testing frame
for first_col, second_col, out_col in diff_specs:
    for frame in (df_train, df_test):
        diff(frame, first_col, second_col, out_col)
    

#: Keep only the matches in which Arsenal appears as home or away side
arsenal_str = 'Arsenal'

in_train = (df_train['HomeTeam'] == arsenal_str) | (df_train['AwayTeam'] == arsenal_str)
arsenal_train = df_train.loc[in_train]

in_test = (df_test['HomeTeam'] == arsenal_str) | (df_test['AwayTeam'] == arsenal_str)
arsenal_test = df_test.loc[in_test]

In [70]:
#: Columns removed so that only the derived difference columns remain
#: NOTE(review): 'FTGD' is not listed here, so it stays in the feature matrix
#: while 'Result' is derived from the full-time result - confirm this is
#: intended (possible target leakage).
drop_col = [
    'Date', 'HomeTeam', 'AwayTeam', 'FTR', 'HTR', 'Result',
    'FTHG', 'FTAG', 'HTHG', 'HTAG',
    'HR', 'AR', 'HY', 'AY',
    'HST', 'AST', 'HS', 'AS',
]

ars_train_del = arsenal_train.drop(drop_col, axis=1)
ars_test_del = arsenal_test.drop(drop_col, axis=1)

#: Preview the first four rows of the remaining training matrix
ars_train_del[:4]

Unnamed: 0,FTGD,HTGD,SOTD,STD
7,1,0,-5,-6
10,2,1,8,10
19,2,-1,5,11
34,0,1,0,-1


In [76]:
#: Convert the trimmed frames to float numpy arrays
ars_train_val = np.array(ars_train_del.values, dtype=float)
ars_test_val = np.array(ars_test_del.values, dtype=float)

#: Explicit feature scaling (e.g. sklearn's preprocessing.MinMaxScaler with
#: fit_transform on the training array) is skipped for now.

print('Number of samples in training set:' , len(ars_train_val))
print('Number of samples in testing set:', len(ars_test_val))

Number of samples in training set: 114
Number of samples in testing set: 76


In [77]:
#: Assemble the inputs for the linear regression

# Target vector y: the 'Result' column of each Arsenal frame
target_train = np.array(arsenal_train['Result'].values)
target_test = np.array(arsenal_test['Result'].values)

#: Feature matrices X: training / testing sets come from the separate files
ars_X_train, ars_X_test = ars_train_val, ars_test_val

#: Matching targets for the training / testing sets
ars_y_train, ars_y_test = target_train, target_test

In [133]:
# Create an ordinary least squares Linear Regression object.
# `normalize=True` is no longer passed: the parameter was removed from
# LinearRegression in newer scikit-learn releases (passing it raises a
# TypeError there), and a plain least-squares fit yields the same
# coefficients and predictions without that rescaling.
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(ars_X_train, ars_y_train)

# Predict the test set once and reuse the result below
ars_y_pred = regr.predict(ars_X_test)

# The coefficients, one per feature column
print('Coefficients: \n', regr.coef_)
# NOTE(review): the value printed here is the mean of the squared residuals
# (i.e. the mean squared error), although the label says "sum of squares".
print("Residual sum of squares: %.2f" % np.mean((ars_y_pred - ars_y_test) ** 2))
# regr.score is the coefficient of determination (R^2) on the test set:
# 1 is perfect prediction
print('Variance score: %.2f' % regr.score(ars_X_test, ars_y_test))


Coefficients: 
 [  6.09389788e-01  -1.47178500e-01   1.61751172e-02  -5.54863767e-04]
Residual sum of squares: 0.49
Variance score: 0.71
