In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%config InlineBackend.figure_format = 'svg'

In [None]:
## Read the dataset from a CSV file located relative to the working directory
DATA_PATH = 'ShortTermPred.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
## Parse column '0' as dates (YYYY-MM-DD) and promote it to the index
data['0'] = pd.to_datetime(data['0'], format='%Y-%m-%d')
# set_index with a column label moves that column into the index,
# so no separate drop of '0' is needed afterwards.
data = data.set_index('0')
data.head()

In [None]:
## Mean of the Attendance column per yearly resampling bin, drawn as a bar chart
yearly_mean = data['Attendance'].resample('Y').mean()
yearly_mean.plot(kind='bar', title='Attendance by year', figsize=(12, 5))


In [None]:
## Distribution of the Attendance column
## NOTE(review): distplot is reported as deprecated in recent seaborn releases
## (histplot/displot are the suggested replacements) - confirm against the
## installed seaborn version before migrating.
attendance = data['Attendance']
sns.distplot(attendance);

In [None]:
## For categorical variables, bar chart of value counts
## For numerical variables, histogram
## One panel per column, laid out on a grid that is `cols` panels wide.

fig = plt.figure(figsize=(12, 15))
cols = 3
# add_subplot needs integer grid dimensions; np.ceil returns a float, so cast it.
rows = int(np.ceil(float(data.shape[1]) / cols))
for i, column in enumerate(data.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    # The builtin `object` is what the removed `np.object` alias referred to.
    if data.dtypes[column] == object:
        # pandas takes the target axes through `ax=`, which makes the panel explicit
        # instead of relying on whichever axes happens to be current.
        data[column].value_counts().plot(kind="bar", ax=ax)
    else:
        data[column].hist(ax=ax)
        # Rotate only this panel's x tick labels so they stay readable.
        ax.tick_params(axis="x", labelrotation=90)
fig.subplots_adjust(hspace=0.7, wspace=0.2)

In [None]:
## Attendance column plotted against the index as a single line
attendance_series = data['Attendance']
attendance_series.plot.line(title='Attendance over time', figsize=(12, 5))

In [None]:
## Attendance over time for each year
## One panel per distinct year found in the index, laid out two panels wide.
years = data.index.year.unique()
fig = plt.figure(figsize=(12, 60))
cols = 2
# Floor division keeps the row count an integer, as add_subplot requires;
# the extra +1 row guarantees room when the number of years is odd.
rows = len(years) // cols + 1
for i, year in enumerate(years):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(year)
    # Select the rows of this year and draw them on this panel explicitly.
    data.loc[data.index.year == year, 'Attendance'].plot(ax=ax)

In [None]:
## Pairwise correlations between the listed numerical columns, shown as an annotated heatmap
numerical = [
    'Attendance',
    'GameNumber',
    'WinLossRatio',
    'WinLossRatioLast10',
    'max_temperature',
    'precipitation',
    'year',
    'month',
]
corr_matrix = data.loc[:, numerical].corr()
fig = plt.figure(figsize=(10, 10))
sns.heatmap(data=corr_matrix, annot=True);

In [None]:
## Attendance distribution for each value of the Holidays column
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)
sns.boxplot(data=data, x='Holidays', y='Attendance', ax=ax)
ax.set_title('Holidays')

In [None]:
## Attendance against each categorical variable, one boxplot per panel of a 2x2 grid
fig = plt.figure(figsize=(12, 15))
# (column to group by, panel title), in panel order
boxplot_panels = [
    ('DayOfWeek', 'Day of the week'),
    ('DayNight', 'Day or night game'),
    ('HomeOpener', 'Home Opener'),
    ('Soccergame', 'Soccer game'),
]
for panel_no, (group_col, panel_title) in enumerate(boxplot_panels, start=1):
    ax = fig.add_subplot(2, 2, panel_no)
    sns.boxplot(data=data, x=group_col, y='Attendance', ax=ax)
    ax.set_title(panel_title)

In [None]:
## Attendance distribution for each value of VisitingTeam_Team
fig = plt.figure(figsize=(12, 5))
opponent_ax = sns.boxplot(data=data, x='VisitingTeam_Team', y='Attendance')
opponent_ax.set_title('Opponent')

In [None]:
## Correlations of Attendance with game number, games back and the win/loss ratio columns
standings_columns = [
    'Attendance',
    'GameNumber',
    'GamesBack',
    'WinLossRatio',
    'WinLossRatioLast10',
    'yankees_WinLossRatio',
    'orioles_WinLossRatio',
    'redsox_WinLossRatio',
]
corr_matrix = data[standings_columns].corr()
fig = plt.figure(figsize=(8, 8))
sns.heatmap(data=corr_matrix, annot=True);