In [1]:
import pandas as pd
import numpy as np
from numpy import random 
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [28]:
#Loading in a dataframe of match results and season results and cleaning columns
original_df = pd.read_csv('EPL_Match_Results.csv')
original_df = original_df[['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','Season']]
original_df['Date'] = pd.to_datetime(original_df['Date'],dayfirst=True)
#Adding random results with a random integer and changing all the 2's to 3's
original_df['randomResult'] = np.random.randint(0,3,len(original_df))

away_win = original_df['randomResult'] == 0
draw = original_df['randomResult'] == 1
home_win = original_df['randomResult'] == 2
                  
original_df.loc[home_win,'randomResult'] = 'H'
original_df.loc[draw, 'randomResult'] = 'D'
original_df.loc[away_win, 'randomResult'] = 'A'

seasons = pd.read_csv('seasons_result.csv',index_col= 'year')
seasons = seasons.loc['2016/2017':'1993/1994'][['Team','W','L','D','F','A','Pts','Pos']]
seasons = seasons.reset_index()
seasons['year'] = seasons['year'].str.slice_replace(4,7,'-')
seasons['Team'] = (seasons['Team'].str.replace('Manchester United', 'Man United'))
seasons['Team'] = (seasons['Team'].str.replace('Manchester City', 'Man City'))
seasons['Team'] = seasons['Team'].str.replace('Tottenham Hotspurs', 'Tottenham')
seasons['year'] = seasons['year'].str.replace('2002-','2002-03')
seasons['year'] = seasons['year'].str.replace('2003-','2003-04')
seasons['year'] = seasons['year'].str.replace('2004-','2004-05')
seasons['year'] = seasons['year'].str.replace('2005-','2005-06')

team_list = []
team_list = original_df.HomeTeam.unique()
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 1000)
print(original_df.head(10))

        Date          HomeTeam        AwayTeam  FTHG  FTAG FTR   Season randomResult
0 1993-08-14           Arsenal        Coventry     0     3   A  1993-94            H
1 1993-08-14       Aston Villa             QPR     4     1   H  1993-94            H
2 1993-08-14           Chelsea       Blackburn     1     2   A  1993-94            H
3 1993-08-14         Liverpool  Sheffield Weds     2     0   H  1993-94            H
4 1993-08-14          Man City           Leeds     1     1   D  1993-94            D
5 1993-08-14         Newcastle       Tottenham     0     1   A  1993-94            D
6 1993-08-14            Oldham         Ipswich     0     3   A  1993-94            D
7 1993-08-14  Sheffield United         Swindon     3     1   H  1993-94            H
8 1993-08-14       Southampton         Everton     0     2   A  1993-94            H
9 1993-08-14          West Ham       Wimbledon     0     2   A  1993-94            D


The following cell turns the dataframe I originally loaded into the format I need for analyzing matches. I had to copy the original and make new columns for both so that when I concatenated them together I could have all of a team's games represented in order by season.

In [15]:
original_df1 = original_df.copy()
original_df1['team'] = original_df1['HomeTeam']
original_df1['opponent'] = original_df1['AwayTeam']
original_df['team'] = original_df1['AwayTeam']
original_df['opponent'] = original_df1['HomeTeam']
double_df = pd.concat([original_df,original_df1])
double_df = results_and_points(double_df, team_list)
double_df.set_index(['Season','team','Date'], inplace=True)
double_df.sort_index(inplace=True, level = [0,1,2])
double_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,randomResult,opponent,Points,Random Points
Season,team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1993-94,Arsenal,1993-08-14,Arsenal,Coventry,0,3,A,A,Coventry,0.0,0.0
1993-94,Arsenal,1993-08-16,Tottenham,Arsenal,0,1,A,A,Tottenham,3.0,3.0
1993-94,Arsenal,1993-08-21,Sheffield Weds,Arsenal,0,1,A,H,Sheffield Weds,3.0,0.0
1993-94,Arsenal,1993-08-24,Arsenal,Leeds,2,1,H,A,Leeds,3.0,0.0
1993-94,Arsenal,1993-08-28,Arsenal,Everton,2,0,H,H,Everton,3.0,3.0


In [6]:
double_df = all_time_win_ratio(double_df)
double_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,randomResult,opponent,Points,Random Points,opp_win_ratio
Season,team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1993-94,Arsenal,1993-08-14,Arsenal,Coventry,0,3,A,A,Coventry,0.0,0.0,0.275641
1993-94,Arsenal,1993-08-16,Tottenham,Arsenal,0,1,A,D,Tottenham,3.0,1.0,0.424843
1993-94,Arsenal,1993-08-21,Sheffield Weds,Arsenal,0,1,A,H,Sheffield Weds,3.0,0.0,0.313869
1993-94,Arsenal,1993-08-24,Arsenal,Leeds,2,1,H,H,Leeds,3.0,3.0,0.415493
1993-94,Arsenal,1993-08-28,Arsenal,Everton,2,0,H,D,Everton,3.0,1.0,0.362213


In [16]:
all_games = win_ratio_head2head(double_df)
all_games.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,randomResult,opponent,Points,Random Points,head2head_ratio
Season,team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-18,Huddersfield,2017-08-12,Crystal Palace,Huddersfield,0,3,A,D,Crystal Palace,3.0,1.0,0.5
2017-18,Huddersfield,2017-08-20,Huddersfield,Newcastle,1,0,H,A,Newcastle,3.0,0.0,0.5
2017-18,Huddersfield,2017-08-26,Huddersfield,Southampton,0,0,D,D,Southampton,1.0,1.0,0.0
2017-18,Huddersfield,2017-09-11,West Ham,Huddersfield,2,0,H,D,West Ham,0.0,1.0,0.0
2017-18,Huddersfield,2017-09-16,Huddersfield,Leicester,1,1,D,H,Leicester,1.0,3.0,0.0


In [3]:
##function to filter a team's results by w/d/l and add the points to the df
def results_and_points(df, team_list):
    for team in team_list:    
        real_win = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['FTR'] =='H')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['FTR'] =='A'))
        real_tie = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['FTR'] =='D')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['FTR'] =='D'))
        real_loss = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['FTR'] =='A')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['FTR'] =='H'))
        random_win = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['randomResult'] =='H')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['randomResult'] =='A'))
        random_tie = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['randomResult'] =='D')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['randomResult'] =='D'))
        random_loss = ((df['team'] == team) & (df['HomeTeam'] == team) & (df['randomResult'] =='A')) |((df['team'] == team) & (df['AwayTeam'] ==team) & (df['randomResult'] =='H'))
        df.loc[real_win,'Points']=3
        df.loc[real_tie,'Points']=1
        df.loc[real_loss, 'Points']=0
        df.loc[random_win, 'Random Points'] = 3
        df.loc[random_tie, 'Random Points'] = 1
        df.loc[random_loss, 'Random Points'] = 0
    return df
    

In [None]:
def season_and_teams(season,dataframe):
    ## Funtion that takes a season slice from original_df and applies results_and_points to every team from that season. 
    ## Returns a dictionary
    season_slice = dataframe.loc[dataframe['Season']==season]
    unique_teams = sorted(season_slice.HomeTeam.unique())
    dict_title={}
    for team in unique_teams:
        dict_title[team] = results_and_points(season_slice,team)
    return dict_title

The function cumulative_points takes the matches dataframe and a single season as input and returns a line plot of the top, bottom and 10th place team's cumulative real and cumulative randomized points for comparison. 

In [None]:
def cumulative_points(df,season):
    team_list = df.loc[season]['HomeTeam'].unique()
    most_least_list = []
    most_least_dict = {}
    for team in team_list:
        last_year = df.loc[(season,team)]
        most_least_list.append(last_year['Points'].sum())
        most_least_dict[last_year['Points'].sum()] = team
    most_least_list = sorted(most_least_list)
    best_team = df.loc[(season,most_least_dict[most_least_list[-1]])]
    mid_team = df.loc[(season,most_least_dict[most_least_list[9]])]
    worst_team = df.loc[(season,most_least_dict[most_least_list[0]])]
    best_team = best_team.reset_index(drop=True)
    mid_team = mid_team.reset_index(drop=True)
    worst_team = worst_team.reset_index(drop=True)
    best_cum_points = best_team['Points'].cumsum()
    best_cum_randoms = best_team['Random Points'].cumsum()
    mid_cum_points = mid_team['Points'].cumsum()
    mid_cum_randoms = mid_team['Random Points'].cumsum()
    worst_cum_points = worst_team['Points'].cumsum()
    worst_cum_randoms = worst_team['Random Points'].cumsum()
    best_cum_points.plot(color='blue',label='Top Team Real Results')
    best_cum_randoms.plot(color='red',label='Top Team Random Results')
    mid_cum_points.plot(color='green', label = 'Mid Table Team Real Results')
    mid_cum_randoms.plot(color='yellow', label='Mid Table Team Random Results')
    worst_cum_points.plot(color='brown',label='Bottom Team Real Results')
    worst_cum_randoms.plot(color='black',label='Bottom Team Random Results')
    plt.xlabel('Games')
    plt.ylabel('Points')
    plt.title("Point Total from " + season + ': Expected vs. Actual')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()

The function below produces a bar plot of every team's mean PPG home and away for a given season. 

In [None]:
def home_away_points(df, season):
    team_list = df.loc[season]['HomeTeam'].unique()
    home_points= []
    away_points = []
    for team in team_list:
        team_df = df.loc[(season,team)]
        home = team_df['HomeTeam'] == team
        away = team_df['AwayTeam'] == team
        home_points.append(team_df.loc[home,'Points'].mean().round(2))
        away_points.append(team_df.loc[away, 'Points'].mean().round(2))
    dataframe = pd.DataFrame({'Teams':team_list,'Mean Home Points':home_points,'Mean Away Points' : away_points})
    dataframe.set_index('Teams', inplace=True)
    dataframe.plot(kind='Bar',title='Mean Home and Away Points in ' + season)

In [2]:
def all_time_win_ratio(df):
    team_list = df.HomeTeam.unique()
    all_time_ratio = {}
    for team in team_list:
        all_games = df.loc[(slice(None),team),:]
        win_ratio = len(all_games[all_games['Points'] == 3])/len(all_games)
        all_time_ratio[team] = win_ratio
    df['opp_win_ratio'] = df['opponent'].map(all_time_ratio)
    return df

In [12]:
def win_ratio_head2head(df):
    team_list = df.HomeTeam.unique()
    teamshead2head = {}
    for team in team_list:
        all_games = df.loc[(slice(None),team),:]
        head2head = {}
        minus_team = [x for x in team_list if x != team]
        for opponent in minus_team:
            opponentwins = (all_games['opponent']==opponent) & (all_games['Points'] == 3)
            opponenttotal = (all_games['opponent'] == opponent)
            if len(all_games[opponentwins]) > 0:
                win_ratio = len(all_games[opponentwins])/len(all_games[opponenttotal])
            else:
                win_ratio = 0
            head2head[opponent] = win_ratio
        all_games['head2head_ratio'] = all_games['opponent'].map(head2head)
        df.loc[(slice(None),team),:] = all_games
    return all_games

In [103]:
#seasons = seasons.drop(['W','L','D','F','A'],axis=1)
by_year = seasons.groupby(['year'])
for key,group in by_year:
    season = {}
    df = pd.DataFrame(group)
    double_df.loc[key,('opponent'== df.iloc[0,1])]['Position'] = 12

KeyError: 'the label [False] is not in the [columns]'

In [100]:
print(df)

       year                  Team  Pts  Pos
0   2016-17               Chelsea   93    1
1   2016-17             Tottenham   86    2
2   2016-17              Man City   78    3
3   2016-17             Liverpool   76    4
4   2016-17               Arsenal   75    5
5   2016-17            Man United   69    6
6   2016-17               Everton   61    7
7   2016-17           Southampton   46    8
8   2016-17           Bournemouth   46    9
9   2016-17  West Bromwich Albion   45   10
10  2016-17       West Ham United   45   11
11  2016-17        Leicester City   44   12
12  2016-17            Stoke City   44   13
13  2016-17        Crystal Palace   41   14
14  2016-17          Swansea City   41   15
15  2016-17               Burnley   40   16
16  2016-17               Watford   40   17
17  2016-17             Hull City   34   18
18  2016-17         Middlesbrough   28   19
19  2016-17            Sunderland   24   20


In [83]:
# Two sample ttest for difference in mean points home and away across all seasons. Statistically significant.
arsenal = double_df.loc[(slice(None),'Arsenal'),:]
home = arsenal['HomeTeam'] == 'Arsenal'
away = arsenal['AwayTeam'] == 'Arsenal'
x = arsenal[home]['Points']
y = arsenal[away]['Points']
t,p = stats.ttest_ind(x,y)
print(t,p)


7.426523757865245 2.466284294024182e-13


In [None]:
bottom_teams = seasons[(seasons['Pos'] == 15) | (seasons['Pos'] == 16) | (seasons['Pos'] == 17 )]
top_teams = seasons[(seasons['Pos'] == 1) | (seasons['Pos'] == 2) | (seasons['Pos'] == 3 )]
top = list(top_teams.loc['2010-11']['Team'])
bottom = list(bottom_teams.loc['2010-11']['Team'])
for team in top:
    idx = pd.IndexSlice
    points_collected = []
    points_against_bottom = {}
    team_games = double_df.loc[idx['2011-12',team],:]
    points_collected.append(team_games[team_games['opponent'] == bottom[0]]['Points'].sum())
    points_collected.append(team_games[team_games['opponent'] == bottom[1]]['Points'].sum())
    points_collected.append(team_games[team_games['opponent'] == bottom[2]]['Points'].sum())
    points_against_bottom[team] = np.sum(points_collected)

print(top, points_against_bottom)

In [None]:
arsenal_model = arsenal.drop(['HomeTeam','AwayTeam', 'Random Points','randomResult','FTR'],axis=1)
model_df = pd.get_dummies(arsenal_model, prefix = ['opp'], columns = ['opponent'])
x = model_df.drop(['Points'], axis=1)
y = model_df['Points']
x_train,x_test,y_train,y_test = train_test_split(x,y)
logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(x_train,y_train)
train_score = logreg.score(x_train, y_train)
test_score = logreg.score(x_test, y_test)
print('Train score is: ', train_score)
print('Test score is: ', test_score)

In [None]:
arsenal['Home/Away'] = 1
arsenal.loc[arsenal['AwayTeam'] == 'Arsenal', 'Home/Away'] = 0
arsenal_model1 = arsenal.drop(['HomeTeam','AwayTeam', 'Random Points','randomResult','FTR'], axis=1)
model_df1 = pd.get_dummies(arsenal_model1, prefix = ['opp'], columns = ['opponent'])
x1 = model_df1.drop(['Points'], axis=1)
y1 = model_df1['Points']
x_train1,x_test1,y_train1,y_test1 = train_test_split(x1,y1)
logreg1 = LogisticRegression(solver = 'lbfgs')
logreg1.fit(x_train1,y_train1)
train_score1 = logreg1.score(x_train1, y_train1)
test_score1 = logreg1.score(x_test1, y_test1)
print('Train score is: ', train_score1)
print('Test score is: ', test_score1)

In [None]:
forest = RandomForestRegressor(max_depth=25, min_samples_split=15,n_estimators=1000,random_state=1)
forest.fit(x_train1, y_train1)