### Feature Engineering

#### Defensive Features

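- 1, 3, and 5 week moving averages of defensive stats. Each value is shifted by one game so that only prior games are used, which necessitates dropping the first game from the training set. If fewer games than the window size are available, the statistic is computed over the games that are available (`min_periods=1`).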
- 1, 3, 5 week minimums
- 1, 3, 5 week maximums

In [1]:
import numpy as np
import pandas as pd
import pytz
from sklearn.preprocessing import MinMaxScaler

In [2]:
def generate_defense_moving_features(player_df, defense_df, windows=(1, 3, 5)):
    """Join lagged rolling defensive statistics of the opponent onto ``player_df``.

    For every statistic column of ``defense_df`` and every window size, three
    columns are produced: ``defense_<col>_moving_avg_<window>``,
    ``defense_<col>_moving_max_<window>`` and ``defense_<col>_moving_min_<window>``.
    Each is a rolling aggregate (``min_periods=1``) computed per team in date
    order and then shifted by one row, so a game only sees earlier games and a
    team's first game is NaN. The raw statistic columns are dropped.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Must contain ``opp`` and ``date`` columns (the merge keys).
    defense_df : pandas.DataFrame
        Must contain ``team`` and ``date``. Every column from position 2 onward
        is treated as a statistic -- this assumes ``team`` and ``date`` are the
        first two columns (TODO confirm against the CSV layout).
    windows : iterable of int
        Rolling window sizes, in rows (games).

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``player_df`` with the rolling features on
        ``['opp', 'date']``. Neither input frame is modified.
    """
    moving_cols = defense_df.columns[2:]
    # rename/sort_values both return new frames, so defense_df is left intact.
    defense_df_cp = defense_df.rename(columns={'team': 'opp'}).sort_values(['opp', 'date'])

    # (column-name label, Rolling method name), in the output column order.
    rolling_stats = (('avg', 'mean'), ('max', 'max'), ('min', 'min'))

    for col in moving_cols:
        # Group the statistic once per column rather than once per feature.
        per_opponent = defense_df_cp[col].groupby(defense_df_cp['opp'])
        for window in windows:
            for label, method in rolling_stats:
                feature_name = 'defense_{}_moving_{}_{}'.format(col, label, window)
                # shift() runs inside each group so values never leak across teams.
                defense_df_cp[feature_name] = per_opponent.transform(
                    lambda x, window=window, method=method:
                        getattr(x.rolling(window, 1), method)().shift()
                )
        defense_df_cp.drop(col, axis=1, inplace=True)
    return player_df.merge(defense_df_cp, how='inner', on=['opp', 'date'])

#### Weather Features
- Convert 'detail' to 'inclement' flag
- Keep wind and temperature as is, but scale

In [3]:
def generate_weather_features(player_df):
    """Replace ``weather_detail`` with a boolean ``weather_inclement`` flag.

    A game is flagged as inclement when its ``weather_detail`` is present and
    is anything other than ``'DOME'``; missing details and dome games are
    ``False``. The input frame is not modified.
    """
    detail = player_df['weather_detail']
    # De Morgan of "not (dome or missing)".
    inclement = (detail != 'DOME') & pd.notnull(detail)
    without_detail = player_df.drop(['weather_detail'], axis=1)
    return without_detail.assign(weather_inclement=inclement)

#### Twitter Features
- Count player tweets in week before game
- Count opponent tweets in week before game
- % change in player tweets 3 days and 1 day before game
- % change in opponent tweets 3 days and 1 day before game
- Net sentiment player tweets in week before game
- Net sentiment opponent tweets in week before game
- Pct neutral player tweets
- Pct neutral opponent tweets
- % Change in net sentiment player tweets between 3 and 1 days of game
- % Change in net sentiment opponent tweets between 3 and 1 days of game
- % Change in pct neutral in player tweets between 3 and 1 days of game
- % Change in pct neutral in opponent tweets between 3 and 1 days of game

In [15]:
def prep_twitter_data(player_df, player_team, tweets_df):
    """Select and normalise the tweets that involve ``player_team``.

    Steps:

    1. Count all tweets per ``year_week`` (year of ``tweet_time_pac``
       concatenated with ``week``) and attach the count to every tweet as
       ``count_weekly_tweets``.
    2. Keep tweets whose ``team`` or ``opp`` equals ``player_team``.
    3. Re-express each kept row from the player's point of view: ``team``
       becomes ``player_team``, ``opp`` becomes the other side, and
       ``tweet_for_player`` is 1 when the tweet's original ``team`` was
       ``player_team`` and 0 otherwise.
    4. Drop the game-result/betting columns and the helper ``year_week``.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Unused; kept for interface compatibility.
    player_team : str
        Team code to filter on.
    tweets_df : pandas.DataFrame
        Must contain ``tweet_id``, ``tweet_time_pac`` (datetime), ``week``,
        ``team``, ``opp``, ``home_away``, ``score``, ``opponent_score``,
        ``point_spread`` and ``over_under``.

    Returns
    -------
    pandas.DataFrame
        The filtered, normalised tweets. ``tweets_df`` itself is not modified
        (previously a ``year_week`` column was written into the caller's frame).
    """
    # Build the week key on a copy so the caller's frame is left untouched.
    year_week = tweets_df['tweet_time_pac'].dt.year.map(str) + tweets_df['week'].map(str)
    tweets_with_key = tweets_df.assign(year_week=year_week)

    # Get total tweets in a week for percentage stats
    weekly_counts = (
        tweets_with_key.groupby('year_week')
        .agg({'tweet_id': 'count'})
        .reset_index()
        .rename(columns={'tweet_id': 'count_weekly_tweets'})
    )
    tweets_with_key = tweets_with_key.merge(weekly_counts, how='inner', on='year_week')

    involves_player = (tweets_with_key['team'] == player_team) | (tweets_with_key['opp'] == player_team)
    tweets_df_filtered = tweets_with_key.loc[involves_player].copy()

    about_player = tweets_df_filtered['team'] == player_team
    # The opponent is whichever side of the matchup is not the player's team.
    tweets_df_filtered['opp'] = tweets_df_filtered['team'].where(~about_player, tweets_df_filtered['opp'])
    tweets_df_filtered['tweet_for_player'] = about_player.astype(int)
    tweets_df_filtered['team'] = player_team
    tweets_df_filtered.drop([
        'home_away',
        'score',
        'opponent_score',
        'point_spread',
        'over_under',
        'year_week'
        ], axis=1, inplace=True)
    return tweets_df_filtered

In [16]:
# Names of the Twitter-derived feature columns. generate_twitter_features uses
# this list as the index of each aggregated Series, so only these keys (and not
# the intermediate count/sentiment values) end up as output columns. The same
# list is later dropped from the full feature frames to build the baseline
# (non-Twitter) feature sets.
twitter_features = [
            'twitter_pct_player_tweets',
            'twitter_pct_opponent_tweets',
            'twitter_count_player_swing_1_3',
            'twitter_count_opponent_swing_1_3',
            'twitter_player_net_sentiment',
            'twitter_opponent_net_sentiment',
            'twitter_player_pct_neutral',
            'twitter_opponent_pct_neutral',
            'twitter_net_sentiment_player_swing_1_3',
            'twitter_net_sentiment_opponent_swing_1_3',
            'twitter_pct_neutral_player_swing_1_3',
            'twitter_pct_neutral_opponent_swing_1_3'
        ]

def generate_twitter_features(player_df, player_team, tweets_df, twitter_features):
    """Attach per-game Twitter volume and sentiment features to ``player_df``.

    Tweets are prepared with ``prep_twitter_data``, joined to the player's
    games on ``['team', 'opp']``, restricted to tweets posted more than 0 and
    less than 5 days before the game ``date``, de-duplicated, and aggregated
    per ``year_week`` (game year concatenated with ``week``).

    "1" and "3" in the swing features refer to tweets whose ``timediff`` has a
    whole-day component of exactly 1 or 3; a swing is ``(value_1 - value_3) /
    value_3``. Divisions by a zero count yield NaN/inf rather than raising.

    Fixes relative to the previous revision:

    * ``twitter_opponent_pct_neutral`` is now divided by the number of
      opponent tweets (it was divided by the number of player tweets).
    * ``pct_neutral_*_1`` / ``pct_neutral_*_3`` now use the 1-day and 3-day
      tweets respectively (the two filters were swapped, which inverted the
      ``twitter_pct_neutral_*_swing_1_3`` features).
    * The caller's ``player_df`` is no longer given ``team`` / ``year_week``
      columns as a side effect.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Must contain ``date`` (datetime), ``week`` and ``opp``.
    player_team : str
        Team code of the player.
    tweets_df : pandas.DataFrame
        Raw tweets, in the shape expected by ``prep_twitter_data``; must also
        carry ``sentiment``.
    twitter_features : list of str
        Names of the aggregated values to keep as output columns.

    Returns
    -------
    pandas.DataFrame
        ``player_df`` inner-joined with the aggregated features; games without
        any tweet in the window are therefore dropped.
    """
    tweets_df_mod = prep_twitter_data(player_df, player_team, tweets_df)

    # Helper columns go on a private copy, not on the caller's frame.
    player_df = player_df.copy()
    player_df['team'] = player_team
    player_df['year_week'] = [
        str(year) + str(week)
        for year, week in zip(player_df['date'].dt.year, player_df['week'])
    ]
    tweets_df_mod = tweets_df_mod.merge(player_df, how='inner', on=['team', 'opp'])

    # Drop tweets from matchups not in the same time period
    tweets_df_mod['timediff'] = tweets_df_mod['date'] - tweets_df_mod['tweet_time_pac']
    in_window = (
        (tweets_df_mod['timediff'] > pd.Timedelta('0 days'))
        & (tweets_df_mod['timediff'] < pd.Timedelta('5 days'))
    )
    # Chained (not in-place) so we never mutate a slice of the merged frame.
    tweets_df_mod = tweets_df_mod.loc[in_window].drop_duplicates()

    def aggregate_features(df):
        """Reduce one game's tweets to a Series indexed by ``twitter_features``."""
        for_player = df['tweet_for_player'] == 1
        for_opponent = df['tweet_for_player'] == 0
        one_day_out = df['timediff'].dt.days == 1
        three_days_out = df['timediff'].dt.days == 3
        weekly_total = df['count_weekly_tweets'].max()

        def tweet_count(mask):
            return df.loc[mask, 'tweet_id'].count()

        def net_sentiment(mask):
            return df.loc[mask, 'sentiment'].mean()

        def pct_neutral(mask):
            sentiment = df.loc[mask, 'sentiment']
            return (sentiment == 0).sum() / sentiment.count()

        def swing(value_1, value_3):
            return (value_1 - value_3) / value_3

        features = {}
        for side, mask in (('player', for_player), ('opponent', for_opponent)):
            features['twitter_pct_{}_tweets'.format(side)] = tweet_count(mask) / weekly_total
            features['twitter_{}_net_sentiment'.format(side)] = net_sentiment(mask)
            # Denominator is this side's own tweet count.
            features['twitter_{}_pct_neutral'.format(side)] = pct_neutral(mask)

            features['count_{}_1'.format(side)] = tweet_count(mask & one_day_out)
            features['count_{}_3'.format(side)] = tweet_count(mask & three_days_out)
            features['twitter_count_{}_swing_1_3'.format(side)] = swing(
                features['count_{}_1'.format(side)], features['count_{}_3'.format(side)])

            features['net_sentiment_{}_1'.format(side)] = net_sentiment(mask & one_day_out)
            features['net_sentiment_{}_3'.format(side)] = net_sentiment(mask & three_days_out)
            features['twitter_net_sentiment_{}_swing_1_3'.format(side)] = swing(
                features['net_sentiment_{}_1'.format(side)], features['net_sentiment_{}_3'.format(side)])

            features['pct_neutral_{}_1'.format(side)] = pct_neutral(mask & one_day_out)
            features['pct_neutral_{}_3'.format(side)] = pct_neutral(mask & three_days_out)
            features['twitter_pct_neutral_{}_swing_1_3'.format(side)] = swing(
                features['pct_neutral_{}_1'.format(side)], features['pct_neutral_{}_3'.format(side)])

        # Indexing by twitter_features keeps only the requested keys.
        return pd.Series(features, index=twitter_features)

    tweets_df_mod_agg = tweets_df_mod.groupby(['year_week']).apply(aggregate_features).reset_index()
    final_df = player_df.merge(tweets_df_mod_agg, how='inner', on='year_week')
    final_df.drop(['year_week', 'team'], axis=1, inplace=True)
    return final_df

#### Player Features
- 1, 3, and 5 week averages of fantasy points

In [17]:
def generate_player_moving_features(player_df, windows=(1, 3, 5)):
    """Add lagged rolling statistics of the player's own fantasy points.

    For each window size the columns ``player_fantpt_moving_avg_<window>``,
    ``player_fantpt_moving_max_<window>`` and
    ``player_fantpt_moving_min_<window>`` are added. Each is a rolling
    aggregate over rows (``min_periods=1``) shifted by one row, so a game only
    sees earlier rows and the first row is NaN. Rows are used in their existing
    order -- presumably chronological; verify against the caller.

    The ``home`` column, if present, is renamed to ``player_home``. This is now
    done once, independent of ``windows`` (previously it sat inside the window
    loop and was skipped when ``windows`` was empty).

    Parameters
    ----------
    player_df : pandas.DataFrame
        Must contain ``fantpt``.
    windows : iterable of int
        Rolling window sizes, in rows (games).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``player_df`` is not modified.
    """
    moving_cols = ['fantpt']
    # rename returns a new frame, which doubles as the working copy.
    player_df_cp = player_df.rename(columns={'home': 'player_home'})
    for col in moving_cols:
        history = player_df_cp[col]
        for window in windows:
            rolled = history.rolling(window, 1)
            player_df_cp['player_{}_moving_avg_{}'.format(col, window)] = rolled.mean().shift()
            player_df_cp['player_{}_moving_max_{}'.format(col, window)] = rolled.max().shift()
            player_df_cp['player_{}_moving_min_{}'.format(col, window)] = rolled.min().shift()
    return player_df_cp

#### Betting features
- Change 'favorite spread' to just 'spread' for relevant team

In [18]:
def generate_betting_features(player_df):
    """Derive betting features from the favorite/spread columns.

    * ``betting_spread``: ``spread_favorite`` unchanged where ``favorite == 1``,
      otherwise its absolute value.
    * ``over_under_line`` is renamed to ``betting_over_under_line``.
    * ``spread_favorite`` and ``favorite`` are dropped.

    Parameters
    ----------
    player_df : pandas.DataFrame
        Must contain ``favorite`` and ``spread_favorite``.

    Returns
    -------
    pandas.DataFrame
        A new frame. ``player_df`` is left untouched (previously the new and
        renamed columns were also written into the caller's frame).
    """
    spread = player_df['spread_favorite']
    # Keep the spread where favorite == 1, else flip it to its magnitude.
    betting_spread = spread.where(player_df['favorite'] == 1, spread.abs())

    result = player_df.drop(['spread_favorite', 'favorite'], axis=1)
    result = result.rename(columns={'over_under_line': 'betting_over_under_line'})
    result['betting_spread'] = betting_spread
    return result

#### Generate features 

In [19]:
# Data for baseline modeling: opponent defensive stats plus one game log per player.
defense = pd.read_csv('../data/data_modified/teams/defensive_stats.csv')
brady = pd.read_csv('../data/data_modified/players/brady.csv')
mccoy = pd.read_csv('../data/data_modified/players/mccoy.csv')

# Parse the player game dates.
brady['date'] = pd.to_datetime(brady['date'])
mccoy['date'] = pd.to_datetime(mccoy['date'])

# Parse the defensive dates, then push January games forward by one year:
# the sportsreference API tags games by season rather than by actual date.
defense['date'] = pd.to_datetime(defense['date']).map(
    lambda stamp: stamp.replace(year=stamp.year + 1) if stamp.month == 1 else stamp
)

In [20]:
# Add Twitter features for final modeling
tweets = pd.read_csv('../data/data_modified/tweets/tweets_sentiment.csv')

# The tweet timestamps are UTC. Express them in Pacific time instead: we have
# no kickoff times for the games, and the west coast is where a calendar day
# starts the latest, so treating tweets as Pacific is the safe choice for
# avoiding leakage.
utc_stamps = pd.to_datetime(tweets['tweet_UTCtime'])
pacific_stamps = utc_stamps.dt.tz_localize(pytz.utc).dt.tz_convert('US/Pacific')

# Replace the UTC column with the Pacific one and rename cols
tweets = tweets.drop('tweet_UTCtime', axis=1).rename(columns={'opponent': 'opp'})
# Strip the timezone so the values are naive Pacific wall-clock times.
tweets['tweet_time_pac'] = pacific_stamps.dt.tz_localize(None)

In [21]:
# Run the full feature pipeline for each (team code, player game log) pair.
df_features = []
for key, df in (('NWE', brady), ('PHI', mccoy)):
    feature_df = generate_defense_moving_features(df, defense)
    feature_df = generate_weather_features(feature_df)
    feature_df = generate_player_moving_features(feature_df)
    feature_df = generate_betting_features(feature_df)
    feature_df = generate_twitter_features(feature_df, key, tweets, twitter_features)

    # Move the label to the first column as 'target' and drop the identifiers.
    target = feature_df['fantpt']
    predictors = feature_df.drop(['week', 'date', 'opp', 'fantpt'], axis=1)
    feature_df = pd.concat([target.rename('target'), predictors], axis=1)

    # Ratio features can divide by zero; treat the resulting infinities as missing.
    feature_df = feature_df.replace([np.inf, -np.inf], np.nan)
    df_features.append(feature_df)

brady_all_features, mccoy_all_features = df_features
# Baseline sets are the same frames without the Twitter columns.
brady_baseline_features = brady_all_features.drop(twitter_features, axis=1)
mccoy_baseline_features = mccoy_all_features.drop(twitter_features, axis=1)



In [25]:
# Save unscaled features for XGBoost
# Scale features, drop moving averages, and save for RNN
scaler = MinMaxScaler()
feature_sets = {
    'brady': {'baseline': brady_baseline_features, 'final': brady_all_features},
    'mccoy': {'baseline': mccoy_baseline_features, 'final': mccoy_all_features},
}
for player, feat_dict in feature_sets.items():
    for feat_set, value in feat_dict.items():
        value_cp = value.copy()

        # Raw features: everything except the row labelled 0 (presumably the
        # first game, whose lagged features are NaN -- confirm).
        raw_path = '../data/data_final/{}/features_raw/{}.csv'.format(feat_set, player)
        value_cp.drop(0, axis=0).to_csv(raw_path, index=False)

        # Scaled features: remove every column with 'moving' in its name first.
        keep_column = ['moving' not in col for col in value_cp.columns]
        value_cp = value_cp.loc[:, keep_column]
        scaler.fit(value_cp)
        # The scaler returns a bare array, so the scaled columns are labelled
        # by integer position; the unscaled first column keeps its name.
        scaled = pd.DataFrame(scaler.transform(value_cp))
        combined = pd.concat([value_cp.iloc[:, 0], scaled.iloc[:, 1:]], axis=1)
        scaled_path = '../data/data_final/{}/features_scaled/{}.csv'.format(feat_set, player)
        combined.drop(0, axis=0).to_csv(scaled_path, index=False)
                          

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
