In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from wrangle import epl_aq_all
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import regex as re

from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

In [2]:
from preprocessing import preprocessing_script

# Project Goals
1. Build a regression that uses point differential to predict future points
2. Analyze the importance of draws in Pythagorean-expectation predictions
3. Predict table placement (classification), especially at the relegation end of the table (open question)

# Wrangle Data
Five functions for acquiring and prepping the data are available in the `wrangle.py` file:
1. `epl_year_aq`: pulls in the data frame for a single season
2. `strip_team_name`: takes in a season CSV and splits and cleans the `team_name` column
3. `epl_aq_all`: creates a df of all EPL seasons
4. `rename columns`: renames columns explicitly
5. `make_number_of_seasons`: makes a column for the number of seasons

In [3]:
# Acquire the combined EPL table via wrangle.epl_aq_all(); per the
# "Wrangle Data" notes it builds one DataFrame covering all seasons.
df = epl_aq_all()

In [4]:
# Sanity-check the raw frame: (rows, columns) first, then per-column dtypes
# and non-null counts. df.info() prints its own report, so no print() needed.
print(df.shape)
df.info()

(340, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 340 entries, 0 to 339
Data columns (total 12 columns):
team_name            340 non-null object
games_played         340 non-null int64
wins                 340 non-null int64
draws                340 non-null int64
losses               340 non-null int64
goals_for            340 non-null int64
goals_against        340 non-null int64
goal_differential    340 non-null int64
points               340 non-null int64
year                 340 non-null int64
place                340 non-null int64
seasons_in_epl       340 non-null int64
dtypes: int64(11), object(1)
memory usage: 32.0+ KB


In [5]:
# Run the project's preprocessing step and rebind `df` to its result.
# NOTE(review): later cells read `next_season_points`, `next_year` and
# `defense`, none of which appear in the raw frame's info() listing --
# presumably they are added here; confirm in the preprocessing module.
# NOTE(review): `df` is both input and output, so re-running this cell feeds
# already-preprocessed data back in; re-run the load cell first if in doubt.
df = preprocessing_script(df)

In [9]:
# Share of all goals in a team's matches that the team itself scored.
df['goal_percentage'] = df['goals_for'].div(df['goals_for'].add(df['goals_against']))

In [12]:
# Ratio of wins to losses (despite the name, not wins per game played).
# A season with zero losses yields inf here, exactly as plain `/` does.
df['win_rate'] = df['wins'].div(df['losses'])

In [16]:
# Goal differential minus the number of wins.
df['surpluss_goals'] = df['goal_differential'].sub(df['wins'])

In [22]:
# Matches not lost (wins plus draws) per loss; inf when a team never lost.
df['points_rate'] = df['wins'].add(df['draws']).div(df['losses'])

In [59]:
# Goals scored per match lost; inf when a team never lost.
df['goals_per_loss'] = df['goals_for'].div(df['losses'])

In [60]:
# Correlation of each candidate feature with next season's points, computed
# only on rows where every listed column is present.
(df.loc[:, ['points', 'goal_differential', 'next_season_points',
            'wins', 'goals_for', 'goals_against', 'place', 'losses',
            'goal_percentage', 'win_rate', 'surpluss_goals', 'points_rate',
            'goals_per_loss', 'defense', 'seasons_in_epl']]
   .dropna()
   .corr()['next_season_points'])

points                0.784466
goal_differential     0.808551
next_season_points    1.000000
wins                  0.760709
goals_for             0.752497
goals_against        -0.699766
place                -0.726295
losses               -0.770751
goal_percentage       0.816789
win_rate              0.571873
surpluss_goals        0.805589
points_rate           0.567617
goals_per_loss        0.604209
defense               0.535040
seasons_in_epl        0.638860
Name: next_season_points, dtype: float64

#### Overview of the dataframe

In [None]:
# Quick structural overview of the frame after feature engineering.
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
# Last expression of the cell: rendered as a table by the notebook.
df.head(2)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's
# numeric columns.
df.describe()

# Explore

### Team Results

In [None]:
# One line per team: points plotted against the season year (the index).
df.set_index('year').groupby('team_name')['points'].plot()
plt.title('Team Points by Year')
plt.show()

**Takeaways:** At the top of the viz, you can see that a couple of teams have stayed at the top of the table, but toward the middle the picture becomes a lot more muddled.

In [None]:
# Coarse (5-bin) histogram of points across all team-seasons.
df['points'].hist(bins=5)
plt.title("Distribution of Points by Team-Season")
plt.show()

**Takeaways:** Looks like a Poisson distribution.

In [None]:
# Coarse (5-bin) histogram of draws across all team-seasons.
df['draws'].hist(bins=5)
plt.title("Distribution of Draws by Team-Season")
plt.show()

**Takeaways** Normal distribution around 9-10 draws per season

In [None]:
# Summary statistics for draws per team-season.
df['draws'].describe()

The average number of draws per team per season is 9.5.

### Exploring Draws a little more closely

In [None]:
# Mean number of draws for each final table position.
df.groupby('place')['draws'].mean().plot.bar(color='green')
plt.title('Is There a Pattern for Draws by Table Place?')
# Render explicitly, like the sibling plot cells, so the cell does not echo
# the Text object returned by plt.title().
plt.show()

**Takeaways:** The number of draws tends to be higher in the middle of the table.

In [None]:
# Mean number of draws per team for each season year.
df.groupby('year')['draws'].mean().plot.bar(color='red')
plt.title('Is There a Pattern of Draws by Year?')
plt.show()

**Takeaways:** Not really. There are pretty significant differences between certain years, but no noticeable trend.

**Further work:** Some hypothesis testing and statistical analysis.

In [None]:
# Wide figure so every team label fits on the x-axis.
plt.figure(figsize=(16, 5))
# Mean draws per season for each team.
df.groupby('team_name')['draws'].mean().plot.bar(color='purple')
plt.title('Is There a Pattern of Draws by Team?')
plt.show()

**Takeaways** Likely, but not immediately evident.

In [None]:
# Wide figure so every team label fits on the x-axis.
plt.figure(figsize=(16, 5))
# Number of non-null `draws` rows per team, used here as the number of
# seasons each team appears in, sorted from most to fewest.
df.groupby('team_name')['draws'].count().sort_values(ascending=False).plot.bar(color='orange')
plt.title('Because of Relegation, How Many Seasons Has Each Team Played?')
# Render explicitly, like the sibling plot cells, so the cell does not echo
# the Text object returned by plt.title().
plt.show()

### Just for fun, let's look at scatter plots of the number of seasons against the average number of points and draws!

In [None]:
# Per team: `count` = number of rows (seasons) with a draws value,
# `mean` = average draws per season.
df_temp = df.groupby('team_name')['draws'].agg(['count', 'mean'])
sns.scatterplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Draws')
plt.title('What is the Relationship between No of Seasons in EPL and Avg No of Draws')
# Render explicitly, like the earlier plot cells, so the cell does not echo
# the Text object returned by plt.title().
plt.show()

In [None]:
# Per team: `count` = number of rows (seasons) with a points value,
# `mean` = average points per season.
df_temp = df.groupby('team_name')['points'].agg(['count', 'mean'])
# lmplot draws the scatter plus a fitted regression line.
sns.lmplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Points')
plt.title('What is the Relationship between No of Seasons in EPL and Avg Points')
# Render explicitly, like the earlier plot cells, so the cell does not echo
# the Text object returned by plt.title().
plt.show()

**Takeaways:** Seems to be a pretty clear linear relationship between number of seasons in the Premier League and the Average number of Points

In [None]:
# Scatter plus fitted regression line: season points against goal differential.
sns.lmplot(x='points', y='goal_differential', data=df)
plt.title('What is the relationship between points and goal differential')
# Render explicitly, like the earlier plot cells, so the cell does not echo
# the Text object returned by plt.title().
plt.show()

In [None]:
# Scatter plus fitted regression line: final table place against goal
# differential.
sns.lmplot(x='place', y='goal_differential', data=df)
# The title names the two variables actually plotted; the previous text
# ("points and place") did not match the axes.
plt.title('What is the relationship between place and goal differential')
# Render explicitly so the cell does not echo the Text object from plt.title().
plt.show()

### Is it better to win or avoid a loss?

In [None]:
# Absolute correlation of points with wins and with losses, all team-seasons.
# The two rows are selected by label rather than by position so the result
# does not depend on the column order of the correlation matrix.
(df[['wins', 'losses', 'points']]
 .corr()
 .loc[['wins', 'losses'], 'points']
 .abs()
 .plot.bar(color='indigo'))
plt.title("Absolute value of Points to Wins and Losses")
plt.ylabel('Absolute value of correlation')
# Render explicitly so the cell does not echo the Text object from plt.ylabel().
plt.show()

**Takeaways:** Although points have a high absolute correlation with both wins and losses, winning appears to be more important than avoiding losses.

In [None]:
# Same comparison restricted to teams finishing in places 1-4.
# Rows are selected by label rather than by position so the result does not
# depend on the column order of the correlation matrix.
(df.loc[df['place'] <= 4, ['wins', 'losses', 'points']]
 .corr()
 .loc[['wins', 'losses'], 'points']
 .abs()
 .plot.bar(color='indigo'))
plt.title("Absolute value of Points to Wins and Losses for Champions League Places")
plt.ylabel('Absolute value of correlation')
# Render explicitly so the cell does not echo the Text object from plt.ylabel().
plt.show()

In [None]:
# Same comparison restricted to teams finishing in place 18 or lower.
# Rows are selected by label rather than by position so the result does not
# depend on the column order of the correlation matrix.
(df.loc[df['place'] >= 18, ['wins', 'losses', 'points']]
 .corr()
 .loc[['wins', 'losses'], 'points']
 .abs()
 .plot.bar(color='indigo'))
# The filter is place >= 18, so the title names the relegation places; the
# previous title was copied from the top-four cell.
plt.title("Absolute value of Points to Wins and Losses for Relegation Places")
plt.ylabel('Absolute value of correlation')
# Render explicitly so the cell does not echo the Text object from plt.ylabel().
plt.show()

# Modeling

In [None]:
# Keep only rows whose `next_year` is exactly the season after `year`.
df_pred = df[df['year'] == df['next_year'] - 1]

In [None]:
# Size and column summary of the consecutive-season subset.
# df_pred.info() prints its own report, so it is not wrapped in print().
print(df_pred.shape)
df_pred.info()

In [None]:
# Preview the first rows only; displaying the whole frame floods the
# notebook output without adding information.
df_pred.head()

In [None]:
# Drop every row that has a missing value in any column; from here on `df`
# holds complete rows only.
df = df.dropna()

In [None]:
# Candidate predictors: the current season's results.
X = df.loc[:, ['wins', 'draws', 'losses', 'goals_for', 'goals_against', 'goal_differential', 'points']]
# Target kept as a single-column DataFrame (2-D), not a Series.
y = df.loc[:, ['next_season_points']]

In [None]:
# Ordinary least-squares regressor with scikit-learn's default settings.
lm = LinearRegression()

In [None]:
# Recursive feature elimination wrapped around the linear model, keeping the
# two strongest features. `n_features_to_select` must be passed by keyword:
# current scikit-learn releases reject it as a positional argument.
rfe = RFE(lm, n_features_to_select=2)

In [None]:
# Fit the selector on X / y and return X reduced to the selected features.
X_rfe = rfe.fit_transform(X, y)

In [None]:
# Boolean mask over X's columns: True where RFE kept the feature.
mask = rfe.support_

In [None]:
# Select the column names of the features RFE kept. The result stays a
# pandas Index (no list conversion happens here) and is reused further down.
rfe_features = X.columns[mask]

# Print the count and the comma-separated names for reference.
print(f'selected {len(rfe_features)} features:', ', '.join(rfe_features))

In [None]:
# RFE fits clones of the estimator it is given, so `lm` itself is never
# fitted and has no intercept_ / coef_ attributes. The model refit on the
# selected features is exposed as rfe.estimator_, and its coefficients line
# up with `rfe_features`.
fitted_lm = rfe.estimator_
print("Linear Model:", fitted_lm)

print("intercept: ", fitted_lm.intercept_)

print("features: ", rfe_features)
print("coefficients: ", fitted_lm.coef_)

In [None]:
# Reference for a possible follow-up model. The signature below was pasted
# from the statsmodels documentation; it is kept as a comment because the
# pasted text is not valid Python and would stop the notebook with a
# SyntaxError.
#   statsmodels.regression.linear_model.OLS(endog, exog=None, missing='none', hasconst=None, **kwargs)

In [None]:
# Unfinished scratch loop. It is kept as a comment because the dangling
# `print(` is a SyntaxError that stops Restart & Run All.
# for i in range(320):
#     print(

In [None]:
# Stray fragment: `pd.read_csv>` is a SyntaxError (possibly meant as the
# IPython help lookup `pd.read_csv?`). Disabled so the notebook runs end to end.
# pd.read_csv>

In [None]:
# Print every team_name equal to the literal string 'KeyError'.
# NOTE(review): presumably 'KeyError' is a placeholder left behind by the
# team-name cleaning step -- confirm in wrangle.py.
# A boolean filter replaces the original `df.team_name[i]` lookup over
# range(320). That lookup goes by row label, so it raises KeyError for any
# label removed by the earlier dropna() and never looks at labels >= 320.
for team in df.loc[df['team_name'] == 'KeyError', 'team_name']:
    print(team)