In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from wrangle import epl_aq_all
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import regex as re

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
from preprocessing import preprocessing_script

In [3]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

# Project Goals
1. Create a regression of point differential in predicting future points
2. Analyze the importance of draws in pythagorean expectations predictions
3. Predict by placement (classification) especially on relegation side??

# Wrangle Data
Three functions acquiring and prepping data are all available in wrangle.py file.
- 1. `epl_year_aq`: pulls in data frame from season
- 2. `strip_team_name`: takes in season csv and splits and cleans team_name_column 
- 3. `epl_aq_all`: creates df of all seasons of epl
- 4. `rename columns`: renamess columns explicitly
- 5. `make_number_of_seasons`: makes a column for number of seasons

In [None]:
df = epl_aq_all()

In [None]:
print(df.shape)
df.info()

In [None]:
df = preprocessing_script(df)

In [None]:
df

In [None]:
df['next_year'] = df.year.shift(-1)

In [None]:
(df[['points', 'goal_differential', 'next_season_points', 
     'wins', 'goals_for', 'goals_against', 'place', 'losses', 
     'goal_percentage', 'win_rate', 'surpluss_goals', 'points_rate',
    'goals_per_loss', 'seasons_in_epl']]
     .dropna().corr().next_season_points)

#### Overview of the dataframe

In [None]:
print(df.shape)
print(df.info())
df.head(2)

In [None]:
df.describe()

# Explore

### Team Results

In [None]:
df.set_index('year').groupby('team_name').points.plot()
plt.title('Team Points by Year')
plt.show()

**Takeaways** At the top of the viz, you can see that there are a couple teams that have stayed at the top of the table, but as you get further towards the middle, the data becomes a lot more muddled

In [None]:
df.points.hist(bins=5)
plt.title("Distribution of Points by Team-Season")
plt.show()

**Takeaways** Looks like a poisson distribution.

In [None]:
df.draws.hist(bins=5)
plt.title("Distribution of Draws by Team-Season")
plt.show()

**Takeaways** Normal distribution around 9-10 draws per season

In [None]:
df.draws.describe()

Average number of draws per year is 9.5

### Exploring Draws a little more closely

In [None]:
df.groupby('place').draws.mean().plot.bar(color='green')
plt.title('Is There a Pattern for Draws by Table Place?')

**Takeaways** Number of draws tend to be higher at the middle of the table

In [None]:
df.groupby('year').draws.mean().plot.bar(color='red')
plt.title('Is There a Pattern of Draws by Year?')
plt.show()

**Takeaways** Not really, there is a pretty significant difference between certain years, but no noticeable trends
**Furtherwork** Some hypothesis testing and statistical analysis

In [None]:
plt.figure(figsize=(16,5))
df.groupby('team_name').draws.mean().plot.bar(color='purple')
plt.title('Is There a Pattern of Draws by Team?')
plt.show()

**Takeaways** Likely, but not immediately evident.

In [None]:
plt.figure(figsize=(16,5))
df.groupby('team_name').draws.count().sort_values(ascending=False).plot.bar(color='orange')
plt.title('Because of Relegation, How Many Seasons Has Each Team Played?')

### Just for fun , lets look at scatter plots of number of seasons and average number of points and draws!

In [None]:
df_temp = df.groupby('team_name').draws.agg(['count', 'mean'])
sns.scatterplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Draws')
plt.title('What is the Relationship between No of Seasons in EPL and Avg No of Draws')

In [None]:
df_temp = df.groupby('team_name').points.agg(['count', 'mean'])
sns.lmplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Points')
plt.title('What is the Relationship between No of Seasons in EPL and Avg Points')

**Takeaways:** Seems to be a pretty clear linear relationship between number of seasons in the Premier League and the Average number of Points

In [None]:
sns.lmplot(x='points', y = 'goal_differential', data = df)
plt.title('What is the relationship between points and goal differential')

In [None]:
sns.lmplot(x='place', y = 'goal_differential', data = df)
plt.title('What is the relationship between points and place')

### Is it better to win or avoid a loss?

In [None]:
df[['wins', 'losses', 'points']].corr().points[0:2].abs().plot.bar(color='indigo')
plt.title("Absolute value of Points to Wins and Losses")
plt.ylabel('Absolute value of correlation')

**Takeaways:** Although there is a high absolute correlation to both.  It appears that winning is more important than avoiding losses.

In [None]:
df[df.place <= 4][['wins', 'losses', 'points']].corr().points[0:2].abs().plot.bar(color='indigo')
plt.title("Absolute value of Points to Wins and Losses for Champions League Places")
plt.ylabel('Absolute value of correlation')

In [None]:
df[df.place >= 18][['wins', 'losses', 'points']].corr().points[0:2].abs().plot.bar(color='indigo')
plt.title("Absolute value of Points to Wins and Losses for Champions League Places")
plt.ylabel('Absolute value of correlation')

# Modeling

In [None]:
df.head()

In [None]:



df_pred = df[(df.year - (df.next_year - 1)) == 0]

In [None]:
print(df_pred.shape)
df_pred.info()

In [None]:
predictions = pd.DataFrame({
    'yhat': df_pred['next_season_points']
})

In [None]:
lm = LinearRegression()

In [None]:
X = df_pred[['points']]
y = df_pred[['next_season_points']]

In [None]:
lm.fit(X, y)

In [148]:
predictions['pred_points'] = lm.predict(y)

In [149]:
predictions

Unnamed: 0,yhat,pred_points
0,46.0,46.089526
1,44.0,44.349816
2,45.0,45.219671
4,90.0,84.363134
5,83.0,78.274151
...,...,...
331,42.0,42.610107
332,43.0,43.479962
333,36.0,37.390979
336,40.0,40.870398


In [150]:
lm.score(X, y)

0.6062767347880587

In [175]:
features_for_modeling = df_pred.drop(columns=['team_name', 'games_played', 'year', 'next_season_points', 'finish', 'win_rate', 'points_rate', 'goals_per_loss', 'next_year']).columns

In [176]:
features_for_modeling

Index(['wins', 'draws', 'losses', 'goals_for', 'goals_against',
       'goal_differential', 'points', 'place', 'seasons_in_epl',
       'goal_percentage', 'surpluss_goals', 'yhat'],
      dtype='object')

In [177]:
for i in features_for_modeling:
    X = df_pred[[i]]
    y = df_pred[['next_season_points']]
    lm.fit(X, y)
    predictions[f'pred_{i}'] = lm.predict(X)

In [178]:
predictions

Unnamed: 0,yhat,pred_points,pred_wins,pred_draws,pred_losses,pred_goals_for,pred_goals_against,pred_goal_differential,pred_place,pred_seasons_in_epl,pred_goal_percentage,pred_surpluss_goals,pred_yhat
0,46.0,42.610107,44.314731,55.338619,41.041964,47.795041,33.642099,38.412073,36.841452,32.597701,39.393851,37.298893,46.0
1,44.0,46.089526,46.670355,54.387229,46.417631,56.291642,33.642099,44.339175,54.698529,32.597701,45.568149,44.086831,44.0
2,45.0,44.349816,44.314731,53.435839,46.417631,47.795041,40.170232,41.968335,47.045496,32.597701,42.257571,41.824185,45.0
4,90.0,73.924877,72.582218,55.338619,73.295966,81.781445,60.842655,76.938236,72.555607,65.118937,73.077840,77.272304,90.0
5,83.0,84.363134,79.649090,52.484450,89.422967,71.585524,78.251011,79.309077,75.106618,65.118937,81.650695,78.026519,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,42.0,37.390979,39.603484,55.338619,35.666297,40.997761,20.585831,26.557869,36.841452,42.604235,28.965231,23.723018,42.0
332,43.0,42.610107,39.603484,49.630280,51.793298,43.546741,40.170232,39.004784,36.841452,42.604235,38.670308,39.561539,43.0
333,36.0,43.479962,44.314731,54.387229,43.729797,45.246061,39.082210,39.597494,39.392463,42.604235,39.653299,38.807324,36.0
336,40.0,39.130688,39.603484,53.435839,41.041964,36.749460,45.610343,37.226653,39.392463,35.099334,34.590130,37.298893,40.0


In [181]:
prediction_features = predictions.drop(columns=('yhat')).columns

In [182]:
#determining rmse
score = []
for i in prediction_features:
    name = i
    rmse = ((predictions.yhat - predictions[i])**2).mean()**.5
    score.append(rmse)

In [183]:
pd.DataFrame({
    'feature': list(prediction_features),
    'rmse': score
})

Unnamed: 0,feature,rmse
0,pred_points,10.81966
1,pred_wins,11.34833
2,pred_draws,17.03742
3,pred_losses,11.17788
4,pred_goals_for,11.70327
5,pred_goals_against,12.2189
6,pred_goal_differential,10.23833
7,pred_place,11.88042
8,pred_seasons_in_epl,13.30266
9,pred_goal_percentage,10.0779


## Cross Validation

In [185]:
import sklearn.metrics as m
from sklearn.model_selection import cross_val_score

In [197]:
for i in features_for_modeling:
    X = df_pred[i]
    y = df_pred['points']
    print(f'{i} is {cross_val_score(lm, X, y, cv=2)}')

wins is [nan nan]
draws is [nan nan]
losses is [nan nan]
goals_for is [nan nan]
goals_against is [nan nan]
goal_differential is [nan nan]
points is [nan nan]
place is [nan nan]
seasons_in_epl is [nan nan]
goal_percentage is [nan nan]
surpluss_goals is [nan nan]
yhat is [nan nan]


In [190]:
df_pred

Unnamed: 0,team_name,games_played,wins,draws,losses,goals_for,goals_against,goal_differential,points,year,...,seasons_in_epl,finish,next_season_points,goal_percentage,win_rate,surpluss_goals,points_rate,goals_per_loss,next_year,yhat
0,AFC Bournemouth,38,11,9,18,45,67,-22,42,2015,...,4,relegation_battle,46.0,0.401786,0.611111,-33,1.111111,2.500000,2016.0,46.0
1,AFC Bournemouth,38,12,10,16,55,67,-12,46,2016,...,4,mid_table,44.0,0.450820,0.750000,-24,1.375000,3.437500,2017.0,44.0
2,AFC Bournemouth,38,11,11,16,45,61,-16,44,2017,...,4,mid_table,45.0,0.424528,0.687500,-27,1.375000,2.812500,2018.0,45.0
4,Arsenal,38,23,9,6,85,42,43,78,2002,...,17,champions_league,90.0,0.669291,3.833333,20,5.333333,14.166667,2003.0,90.0
5,Arsenal,38,26,12,0,73,26,47,90,2003,...,17,champions_league,83.0,0.737374,inf,21,inf,inf,2004.0,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,Wigan Athletic,38,9,9,20,37,79,-42,36,2009,...,8,relegation_battle,42.0,0.318966,0.450000,-51,0.900000,1.850000,2010.0,42.0
332,Wigan Athletic,38,9,15,14,40,61,-21,42,2010,...,8,relegation_battle,43.0,0.396040,0.642857,-30,1.714286,2.857143,2011.0,43.0
333,Wigan Athletic,38,11,10,17,42,62,-20,43,2011,...,8,relegation_battle,36.0,0.403846,0.647059,-31,1.235294,2.470588,2012.0,36.0
336,Wolverhampton Wanderers,38,9,11,18,32,56,-24,38,2009,...,5,relegation_battle,40.0,0.363636,0.500000,-33,1.111111,1.777778,2010.0,40.0
