In [1]:
import pandas as pd
import numpy as np
from pymongo import MongoClient

In [2]:
# Read the NFL plays CSV; the path is relative to the notebook's directory.
plays_csv = '../data/nfl_plays.csv'
df = pd.read_csv(plays_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Display the column labels of the freshly loaded frame.
df.columns

Index([u'GameId', u'GameDate', u'Quarter', u'Minute', u'Second',
       u'OffenseTeam', u'DefenseTeam', u'Down', u'ToGo', u'YardLine',
       u'Unnamed: 10', u'SeriesFirstDown', u'Unnamed: 12', u'NextScore',
       u'Description', u'TeamWin', u'Unnamed: 16', u'Unnamed: 17',
       u'SeasonYear', u'Yards', u'Formation', u'PlayType', u'IsRush',
       u'IsPass', u'IsIncomplete', u'IsTouchdown', u'PassType', u'IsSack',
       u'IsChallenge', u'IsChallengeReversed', u'Challenger', u'IsMeasurement',
       u'IsInterception', u'IsFumble', u'IsPenalty', u'IsTwoPointConversion',
       u'IsTwoPointConversionSuccessful', u'RushDirection', u'YardLineFixed',
       u'YardLineDirection', u'IsPenaltyAccepted', u'PenaltyTeam', u'IsNoPlay',
       u'PenaltyType', u'PenaltyYards', u'Unnamed: 45'],
      dtype='object')

In [4]:
# Drop the unnamed columns (most are empty separators) along with 'Challenger'.
columns_to_drop = [u'Unnamed: 10', u'Unnamed: 12', u'Unnamed: 16',
                   u'Unnamed: 17', u'Unnamed: 45', 'Challenger']
df = df.drop(columns_to_drop, axis=1)

In [5]:
# Parse every GameDate entry into a timestamp, one element at a time.
df['GameDate'] = df['GameDate'].map(pd.to_datetime)

In [6]:
# Repair 'SeasonYear': missing values become 2013, zeros become 2015.
season_missing = df['SeasonYear'].isnull()
df.loc[season_missing, 'SeasonYear'] = 2013.0

season_zero = df['SeasonYear'].eq(0)
df.loc[season_zero, 'SeasonYear'] = 2015.0

In [7]:
# Missing 'Yards' values default to 0.0.
df['Yards'] = df['Yards'].fillna(0.0)

In [8]:
# Plays with no 'PlayType' are wildcat plays, so label them 'RUSH'.
df['PlayType'] = df['PlayType'].fillna('RUSH')

In [9]:
# Missing 'IsPass' flags default to 0.
df['IsPass'] = df['IsPass'].fillna(0)

In [10]:
# Missing 'IsTouchdown' flags default to 0.
df['IsTouchdown'] = df['IsTouchdown'].fillna(0)

In [11]:
# Set 'PassType' to 0 for every play that is not a PASS, then relabel plays that
# were misclassified as PASS, using keywords found in their description.
mask = df['PlayType'] != 'PASS'
df.loc[mask, 'PassType'] = 0

# (description keyword, corrected PlayType). Order matters: once a row has been
# relabeled it is no longer 'PASS', so the first matching keyword wins.
relabel_rules = [
    ('KICKS', 'PUNT'),
    ('FIELD GOAL', 'FIELD GOAL'),
    ('EXTRA POINT', 'EXTRA POINT'),
    ('KNEELS', 'QB KNEEL'),
    ('END GAME', 'NO PLAY'),
]
for keyword, play_type in relabel_rules:
    # na=False: descriptions may still be null at this point; such rows must
    # simply not match instead of putting NaN into the boolean mask.
    mask = (df['PlayType'] == 'PASS') & df['Description'].str.contains(keyword, na=False)
    df.loc[mask, 'PassType'] = 0
    df.loc[mask, 'PlayType'] = play_type

In [12]:
# replace null values of 'IsIncomplete' with 0
# NOTE: the assignment below operates on whole rows, so EVERY null column in a
# row whose 'IsIncomplete' is null is set to 0, not only 'IsIncomplete'.
mask = df['IsIncomplete'].isnull()
df[mask] = df[mask].fillna(0)

In [13]:
# replace all of the null values in one faulty row with 0's
# Rows are selected by a null 'IsSack'; every null column in those rows becomes 0.
mask = df['IsSack'].isnull()
df[mask] = df[mask].fillna(0)

In [14]:
# Missing 'IsMeasurement' flags default to 0.
measurement_missing = df['IsMeasurement'].isnull()
df.loc[measurement_missing, 'IsMeasurement'] = 0

In [15]:
# Where 'YardLineFixed' is missing, derive it as 100 minus 'YardLine'.
fixed_missing = df['YardLineFixed'].isnull()
df.loc[fixed_missing, 'YardLineFixed'] = 100 - df.loc[fixed_missing, 'YardLine']

In [16]:
# Missing 'IsNoPlay' flags default to 0.
no_play_missing = df['IsNoPlay'].isnull()
df.loc[no_play_missing, 'IsNoPlay'] = 0

In [17]:
# Missing 'PenaltyYards' values default to 0.
penalty_yards_missing = df['PenaltyYards'].isnull()
df.loc[penalty_yards_missing, 'PenaltyYards'] = 0

In [18]:
# Discard any row that lacks an 'OffenseTeam' or a 'Description'.
required_columns = ['OffenseTeam', 'Description']
df = df.dropna(subset=required_columns, how='any')

In [19]:
# Missing 'Formation' values default to 0.
formation_missing = df['Formation'].isnull()
df.loc[formation_missing, 'Formation'] = 0

In [20]:
# Recover a missing 'RushDirection' for RUSH plays from keywords in the description.
# Directions are tried in this order and only rows that are still null are
# filled, so the first keyword found in a description wins.
rush_directions = ['CENTER', 'LEFT TACKLE', 'RIGHT GUARD', 'LEFT GUARD',
                   'RIGHT TACKLE', 'RIGHT END', 'LEFT END']
for direction in rush_directions:
    # na=False keeps the mask strictly boolean even if a description is not a
    # string (earlier row-wise fillna(0) calls can leave a 0 in 'Description').
    mask = (df['RushDirection'].isnull()) & (df['PlayType'] == 'RUSH') & \
        (df['Description'].str.contains(direction, na=False))
    df.loc[mask, 'RushDirection'] = direction

# fill the rest with 0
# NOTE: this works on whole rows, so every null column in a row whose
# 'RushDirection' is still null becomes 0.
mask = df['RushDirection'].isnull()
df[mask] = df[mask].fillna(0)

In [21]:
# Missing 'PenaltyTeam' values default to 0.
penalty_team_missing = df['PenaltyTeam'].isnull()
df.loc[penalty_team_missing, 'PenaltyTeam'] = 0

In [22]:
# Missing 'PenaltyType' values default to 0.
penalty_type_missing = df['PenaltyType'].isnull()
df.loc[penalty_type_missing, 'PenaltyType'] = 0

In [23]:
# turn descriptions to unicode to be able to upload to mongodb
# df['Description'] = df['Description'].apply(lambda x: unicode(x, 'utf-8', errors="ignore"))

# MongoDB has upload limits, so the data will have to be divided by team or season and uploaded to different tables
# db_cilent = MongoClient()
# db = db_cilent['NFL']
# table = db['plays']
# odo(df, db.table)

In [24]:
# Replace the 'SD' team code with 'LAC' in both the offense and defense columns.
for team_col in ('OffenseTeam', 'DefenseTeam'):
    is_sd = df[team_col] == 'SD'
    df.loc[is_sd, team_col] = 'LAC'

In [25]:
def join_teams(values):
    """Return the given team codes in sorted order, joined by underscores.

    Sorting makes the key independent of which team is listed first, so
    ('NE', 'ATL') and ('ATL', 'NE') both yield 'ATL_NE'.
    """
    ordered = list(values)
    ordered.sort()
    return "_".join(ordered)

# Build an order-independent matchup key from the two team columns of each row.
team_pairs = df[[u'OffenseTeam', u'DefenseTeam']]
df['Team1_Team2'] = team_pairs.apply(join_teams, axis=1)

In [26]:
### have to account for dome
### future features: coaches, coordinators, score, last play, month, day_of_week, stadium (maybe, already have location)

In [27]:
# Read the weather CSV; the path is relative to the notebook's directory.
weather_csv = '../data/weather.csv'
weather_df = pd.read_csv(weather_csv)

In [28]:
# Parse every weather GameDate entry into a timestamp, one element at a time.
weather_df['GameDate'] = weather_df['GameDate'].map(pd.to_datetime)

In [29]:
# Cast the matchup key to str (note: astype(str) also turns missing values into the text 'nan').
weather_df['Team1_Team2'] = weather_df['Team1_Team2'].astype(str)

In [30]:
# Remove the 'Stadium' column from the weather data.
weather_df = weather_df.drop(['Stadium'], axis=1)

In [31]:
# Impute missing 'Visibility' readings with the column mean.
visibility_mean = weather_df['Visibility'].mean()
weather_df['Visibility'] = weather_df['Visibility'].fillna(visibility_mean)

In [32]:
# Impute missing 'Wind' readings with the column mean.
wind_mean = weather_df['Wind'].mean()
weather_df['Wind'] = weather_df['Wind'].fillna(wind_mean)

In [33]:
# Games with no recorded weather category are treated as 'Clear'.
weather_df['Weather_cat'] = weather_df['Weather_cat'].fillna('Clear')

In [34]:
# Collapse missing surfaces and the listed grass labels into a single 'Grass' value.
grass_labels = ['Bermuda', 'Bluegrass', 'Kentucky', 'Natural']
is_grass = weather_df['Surface'].isnull() | weather_df['Surface'].isin(grass_labels)
weather_df.loc[is_grass, 'Surface'] = 'Grass'

In [35]:
# Collapse the listed turf labels into a single 'Fieldturf' value.
turf_labels = ['A-Turf', 'UBU', 'FieldTurf', 'RealGrass']
is_turf = weather_df['Surface'].isin(turf_labels)
weather_df.loc[is_turf, 'Surface'] = 'Fieldturf'

In [36]:
# Normalize free-text weather descriptions into a handful of categories.
# Each entry is (regex of alternative substrings, category). Rules run in this
# order and every rule sees the values already rewritten by the previous ones.
weather_rules = [
    ('Cloud', 'Cloudy'),
    ('Rain|Drizzle|Showers|storm', 'Rain'),
    ('Fair|Clear|Sunny|Dry|Breezy|Humid', 'Clear'),
    ('Fog|Overcast', 'Overcast'),
    ('Snow|Wintry Mix|Flurries', 'Snow'),
]
for pattern, category in weather_rules:
    matches = weather_df['Weather_cat'].str.contains(pattern)
    weather_df.loc[matches, 'Weather_cat'] = category

In [37]:
# Attach weather columns to every play; plays without a matching game keep NaN weather.
weather_keys = ['GameDate', 'Team1_Team2']
combined_df = pd.merge(df, weather_df, how='left', on=weather_keys)

In [38]:
# Derive 0/1 scoring flags from the play description.
# na=False matters: str.contains yields NaN for non-string descriptions, and
# np.where treats NaN as truthy, which would flag such rows as scoring plays.
mask = (combined_df['Description'].str.contains('TOUCHDOWN', na=False)) | (combined_df['IsTouchdown'] == 1)
combined_df['IsTouchdown'] = np.where(mask, 1, 0)

mask_2 = combined_df['Description'].str.contains('FIELD GOAL IS GOOD', na=False)
combined_df['IsFieldGoal'] = np.where(mask_2, 1, 0)

mask_3 = combined_df['Description'].str.contains('EXTRA POINT IS GOOD', na=False)
combined_df['IsExtraPoint'] = np.where(mask_3, 1, 0)

# 'IsTwoPointConversion' is overwritten here to mean a SUCCESSFUL conversion:
# either the description says the attempt succeeded or the source flag is set.
mask_4 = ((combined_df['Description'].str.contains('TWO-POINT CONVERSION', na=False)) &
          (combined_df['Description'].str.contains('ATTEMPT SUCCEEDS', na=False))) | \
    (combined_df['IsTwoPointConversionSuccessful'] == 1)
combined_df['IsTwoPointConversion'] = np.where(mask_4, 1, 0)

mask_5 = combined_df['Description'].str.contains('SAFETY', na=False)
combined_df['IsSafety'] = np.where(mask_5, 1, 0)

In [39]:
# Flag plays where the offense is the home team, then record the points scored
# on each play in 'Home_score' / 'Away_score'.
combined_df['IsHome'] = np.where(combined_df['Home_team'] == combined_df['OffenseTeam'], 1, 0)
combined_df['Home_score'] = 0
combined_df['Away_score'] = 0

offense_home = combined_df['IsHome'] == 1
offense_away = combined_df['IsHome'] == 0

# Points credited to the offense. Entries are applied in order, so on a row
# carrying several flags the later entry overwrites the earlier one.
# NOTE(review): touchdowns scored by the defense (e.g. on a return) are still
# credited to the offense here - confirm whether that needs handling.
offense_points = [
    ('IsTouchdown', 6),
    ('IsFieldGoal', 3),
    ('IsExtraPoint', 1),
    ('IsTwoPointConversion', 2),
]
for flag, points in offense_points:
    scored = combined_df[flag] == 1
    combined_df.loc[scored & offense_home, 'Home_score'] = points
    combined_df.loc[scored & offense_away, 'Away_score'] = points

# A safety awards 2 points to the DEFENSE, so credit the team opposite the offense.
safety = combined_df['IsSafety'] == 1
combined_df.loc[safety & offense_home, 'Away_score'] = 2
combined_df.loc[safety & offense_away, 'Home_score'] = 2

In [40]:
# Load the per-season coach tables and attach them to every play.
off_df = pd.read_csv('../data/offensive_coach_data.csv')
def_df = pd.read_csv('../data/defensive_coach_data.csv')

# 'SeasonYear' holds floats on the play side, so cast both coach tables to match
# before joining. Each table is cast from its OWN column.
off_df['SeasonYear'] = off_df['SeasonYear'].astype(float)
def_df['SeasonYear'] = def_df['SeasonYear'].astype(float)

# Left joins keep every play even when no coach record exists for that team/season.
new_combined_df = pd.merge(combined_df, off_df, how='left', on=['SeasonYear', 'OffenseTeam'])
new_combined_df = pd.merge(new_combined_df, def_df, how='left', on=['SeasonYear', 'DefenseTeam'])

In [41]:
# Running home/away score for every play, accumulated within each game.
# The plays are taken from new_combined_df itself, so the rows being summed are
# the same rows being written to.
# NOTE(review): plays are ordered by ascending Quarter/Minute/Second; if
# Minute/Second hold time REMAINING in the quarter, the last two keys should
# sort descending - confirm against the data.
ordered_plays = new_combined_df.sort_values([u'Quarter', u'Minute', u'Second'])
plays_by_game = ordered_plays.groupby('GameId', sort=False)

# cumsum keeps the original row index, so assignment aligns each running total
# back to its own play regardless of the sort order used above.
new_combined_df['Home_total_score'] = plays_by_game['Home_score'].cumsum()
new_combined_df['Away_total_score'] = plays_by_game['Away_score'].cumsum()

In [45]:
# new_combined_df.to_csv('../data/combined_data.csv', index=False)

In [47]:
# Score differential from the offense's point of view:
# home minus away when the offense is at home, away minus home otherwise.
home_minus_away = new_combined_df['Home_total_score'] - new_combined_df['Away_total_score']
away_minus_home = new_combined_df['Away_total_score'] - new_combined_df['Home_total_score']

new_combined_df['Score_differential'] = 0

offense_is_home = new_combined_df['IsHome'] == 1
new_combined_df.loc[offense_is_home, 'Score_differential'] = home_minus_away

offense_is_away = new_combined_df['IsHome'] == 0
new_combined_df.loc[offense_is_away, 'Score_differential'] = away_minus_home

In [49]:
# Preview the score columns; head() avoids rendering the entire frame.
score_columns = ['OffenseTeam', 'DefenseTeam', 'IsHome',
                 'Home_total_score', 'Away_total_score', 'Score_differential']
new_combined_df[score_columns].head(10)

Unnamed: 0,OffenseTeam,DefenseTeam,IsHome,Home_total_score,Away_total_score,Score_differential
0,HOU,LAC,1,23,28,-5
1,WAS,GB,1,0,10,-10
2,MIN,CHI,1,0,7,-7
3,HOU,SEA,0,6,26,20
4,DET,CHI,0,22,30,8
5,DET,CHI,0,22,30,8
6,TEN,NYJ,0,6,24,18
7,TEN,NYJ,0,6,24,18
8,NE,ATL,1,19,10,9
9,NE,ATL,1,19,10,9
