In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from wrangle import epl_aq_all
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import regex as re

# Project Goals
1. Build a regression that uses point differential to predict future points
2. Analyze the importance of draws in Pythagorean expectation predictions
3. Predict table placement (classification), especially on the relegation side (open question)

# Wrangle Data
The following functions for acquiring and prepping data are all available in the `wrangle.py` file:
1. `epl_year_aq`: pulls in the data frame for a season
2. `strip_team_name`: takes in a season csv, then splits and cleans the team_name column
3. `epl_aq_all`: creates a df of all seasons of the EPL
4. `rename columns`: renames columns explicitly
5. `make_number_of_seasons`: makes a column for the number of seasons

In [2]:
# Build the working frame with the epl_aq_all helper imported from wrangle.py
df = epl_aq_all()

In [3]:
# df.to_csv('epl_years.csv')

In [7]:
# Move the current index back into a regular column.
# NOTE(review): not idempotent -- re-running this cell inserts another index column.
df = df.reset_index()

In [14]:
# Keep the rows whose previous row FOR THE SAME TEAM is exactly one year
# earlier, i.e. seasons that directly follow another season by that team.
# The shift is done inside each team so the first row of one team is never
# compared with the last row of the team listed before it.
# Assumes rows are ordered by year within each team -- TODO confirm.
df[df.year == (df.groupby('team_name').year.shift(1) + 1)]

Unnamed: 0,team_name,games_played,wins,draws,losses,goals_for,goals_against,goal_differential,points,year,place,seasons_in_epl
1,AFC Bournemouth,38,12,10,16,55,67,-12,46,2016,9,4
2,AFC Bournemouth,38,11,11,16,45,61,-16,44,2017,12,4
3,AFC Bournemouth,38,13,6,19,56,70,-14,45,2018,14,4
5,Arsenal,38,26,12,0,73,26,47,90,2003,1,17
6,Arsenal,38,25,8,5,87,36,51,83,2004,2,17
...,...,...,...,...,...,...,...,...,...,...,...,...
335,Wigan Athletic,38,12,9,17,34,45,-11,45,2008,11,8
336,Wigan Athletic,38,9,9,20,37,79,-42,36,2009,16,8
337,Wigan Athletic,38,9,15,14,40,61,-21,42,2010,16,8
338,Wigan Athletic,38,11,10,17,42,62,-20,43,2011,15,8


In [None]:
# add to wrangle file
def make_number_seasons(df):
    """Add a ``seasons_in_epl`` column counting each team's seasons in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per team-season with a ``draws`` column. ``team_name`` may be
        a regular column or the name of the index.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``team_name`` whose ``seasons_in_epl`` column
        holds the number of non-null ``draws`` rows for that team. The input
        frame is not modified.
    """
    # Accept the frame this function itself returns (team_name as the index),
    # so re-running the calling cell does not fail inside set_index.
    if 'team_name' not in df.columns and 'team_name' in df.index.names:
        df = df.reset_index()
    # Drop any stale count first: join raises when column names overlap.
    df = df.drop(columns='seasons_in_epl', errors='ignore')
    seasons_series = df.groupby('team_name').draws.count().rename('seasons_in_epl')
    return df.set_index('team_name').join(seasons_series)

In [None]:
# Attach the per-team season count (the closing parenthesis was missing)
df = make_number_seasons(df)

#### Overview of the dataframe

In [None]:
print(df.shape)
# info() prints its own report and returns None, so wrapping it in print()
# only appended a stray "None" to the output.
df.info()
df.head(2)

In [None]:
# Summary statistics (count, mean, std, quartiles) via pandas' default describe()
df.describe()

# Explore

### Team Results

In [None]:
# Draw one line per team with the season year on the x-axis. Plotting the
# grouped column directly would use the row index as x, not the year.
fig, ax = plt.subplots()
for _team, _team_seasons in df.groupby('team_name'):
    _team_seasons = _team_seasons.sort_values('year')
    ax.plot(_team_seasons['year'], _team_seasons['points'])
ax.set_xlabel('Year')
ax.set_ylabel('Points')
ax.set_title('Team Points by Year')
plt.show()

**Takeaways** At the top of the viz, you can see that a couple of teams have stayed at the top of the table, but as you move toward the middle, the data becomes a lot more muddled.

In [None]:
# Histogram of points per team-season, grouped into five bins
df['points'].hist(bins=5)
plt.gca().set_title("Distribution of Points by Team-Season")
plt.show()

**Takeaways** Looks like a Poisson distribution.

In [None]:
# Histogram of draws per team-season, grouped into five bins
df['draws'].hist(bins=5)
plt.gca().set_title("Distribution of Draws by Team-Season")
plt.show()

**Takeaways** Roughly normal distribution centered around 9-10 draws per season.

In [None]:
# Summary statistics for the draws column
df['draws'].describe()

The average number of draws per team-season is about 9.5.

### Exploring Draws a little more closely

In [None]:
# Mean draws for every finishing position
df.groupby('place').draws.mean().plot.bar(color='green')
plt.ylabel('Average Number of Draws')
plt.title('Is There a Pattern for Draws by Table Place?')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

**Takeaways** The number of draws tends to be higher in the middle of the table.

In [None]:
# Mean draws per season year as a bar chart
df.groupby('year')['draws'].mean().plot(kind='bar', color='red')
plt.gca().set_title('Is There a Pattern of Draws by Year?')
plt.show()

**Takeaways** Not really. There is a pretty significant difference between certain years, but no noticeable trend.

**Further work** Some hypothesis testing and statistical analysis.

In [None]:
# Wide figure so every team label fits on the x-axis
plt.figure(figsize=(16, 5))
df.groupby('team_name')['draws'].mean().plot(kind='bar', color='purple')
plt.gca().set_title('Is There a Pattern of Draws by Team?')
plt.show()

**Takeaways** Likely, but not immediately evident.

In [None]:
# Number of rows (seasons) per team, most seasons first
plt.figure(figsize=(16,5))
df.groupby('team_name').draws.count().sort_values(ascending=False).plot.bar(color='orange')
plt.ylabel('Number of Seasons')
plt.title('Because of Relegation, How Many Seasons Has Each Team Played?')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

### Just for fun, let's look at scatter plots of number of seasons against average number of points and draws!

In [None]:
# Per team: 'count' = number of seasons in the data, 'mean' = average draws
df_temp = df.groupby('team_name').draws.agg(['count', 'mean'])
sns.scatterplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Draws')
plt.title('What is the Relationship between No of Seasons in EPL and Avg No of Draws')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

In [None]:
# Per team: 'count' = number of seasons in the data, 'mean' = average points
df_temp = df.groupby('team_name').points.agg(['count', 'mean'])
sns.lmplot(x='count', y='mean', data=df_temp)
plt.xlabel('Number of Seasons')
plt.ylabel('Average Number of Points')
plt.title('What is the Relationship between No of Seasons in EPL and Avg Points')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

**Takeaways:** There seems to be a pretty clear linear relationship between the number of seasons in the Premier League and the average number of points.

In [None]:
# Preview the first rows rather than dumping the whole frame into the output
df.head()

In [None]:
# Scatter plus fitted regression line of goal differential against points
sns.lmplot(x='points', y = 'goal_differential', data = df)
plt.title('What is the relationship between points and goal differential')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

In [None]:
# Scatter plus fitted regression line of goal differential against table place.
# The title now names the two variables that are actually plotted.
sns.lmplot(x='place', y = 'goal_differential', data = df)
plt.title('What is the relationship between place and goal differential')
# Render explicitly, as the other plotting cells do, instead of echoing the Text repr
plt.show()

In [None]:
def assessing_post_season(df):
    """Add a ``post_season`` label derived from the ``place`` value.

    Labels, by inclusive upper bound on ``place``:
    <= 4 ``champions_league``, <= 8 ``europa_league``, <= 14 ``mid_table``,
    <= 17 ``relegation_battle``, anything else ``relegation``.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Either a whole frame with a ``place`` column, or a single row such as
        the one ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    The same object that was passed in, with ``post_season`` set on it
    (the input is modified in place).
    """
    tiers = [
        (4, 'champions_league'),
        (8, 'europa_league'),
        (14, 'mid_table'),
        (17, 'relegation_battle'),
    ]
    place = df['place']
    if isinstance(df, pd.DataFrame):
        # `if column <= 4` is ambiguous for a whole column and raises, so the
        # label is chosen element-wise. np.select takes the first matching
        # condition, which mirrors the if/elif ordering of the row-wise path.
        df['post_season'] = np.select(
            [place <= cutoff for cutoff, _ in tiers],
            [label for _, label in tiers],
            default='relegation',
        )
        return df
    # Single row: first tier whose cutoff is not exceeded, else relegation.
    df['post_season'] = next(
        (label for cutoff, label in tiers if place <= cutoff),
        'relegation',
    )
    return df

In [None]:
# Label every season, then preview a few rows instead of displaying the whole frame
assessing_post_season(df).head()