### Chi Squared Test and Older Data Preprocessing Script

* __NOTE:__ This file uses our original preprocessing script, which differs slightly from the finalized preprocessing script found in "ECS171project_final_data_preprocessing.csv". 
* We kept this older version because it still generates the chi-squared tests and documents our progress toward the final preprocessor. 

### Data Preprocessing

In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sb
from numpy import array
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import chi2_contingency, chi2

# Load the raw play-by-play export.
nfl = pd.read_csv("NFL Play by Play 2009-2018 (v5).csv")

# Columns that this analysis does not use. The names are kept in the same
# order as the original drop list and grouped by shared prefix for readability.
UNNEEDED_COLUMNS = [
    'air_epa', 'air_wpa', 'air_yards',
    'assist_tackle',
    'assist_tackle_1_player_id', 'assist_tackle_1_player_name', 'assist_tackle_1_team',
    'assist_tackle_2_player_id', 'assist_tackle_2_player_name', 'assist_tackle_2_team',
    'assist_tackle_3_player_id', 'assist_tackle_3_player_name', 'assist_tackle_3_team',
    'assist_tackle_4_player_id', 'assist_tackle_4_player_name', 'assist_tackle_4_team',
    'away_timeouts_remaining', 'away_wp', 'away_wp_post',
    'blocked_player_id', 'blocked_player_name',
    'comp_air_epa', 'comp_air_wpa', 'comp_yac_epa', 'comp_yac_wpa',
    'complete_pass',
    'def_wp',
    'defensive_extra_point_attempt', 'defensive_extra_point_conv',
    'defensive_two_point_attempt', 'defensive_two_point_conv',
    'defteam_score', 'defteam_score_post', 'defteam_timeouts_remaining',
    'desc', 'ep', 'epa',
    'extra_point_attempt', 'extra_point_prob', 'extra_point_result',
    'fg_prob', 'field_goal_attempt', 'field_goal_result',
    'first_down_pass', 'first_down_penalty', 'first_down_rush',
    'forced_fumble_player_1_player_id', 'forced_fumble_player_1_player_name',
    'forced_fumble_player_1_team',
    'forced_fumble_player_2_player_id', 'forced_fumble_player_2_player_name',
    'forced_fumble_player_2_team',
    'fourth_down_converted', 'fourth_down_failed',
    'fumble', 'fumble_forced', 'fumble_lost', 'fumble_not_forced', 'fumble_out_of_bounds',
    'fumble_recovery_1_player_id', 'fumble_recovery_1_player_name',
    'fumble_recovery_1_team', 'fumble_recovery_1_yards',
    'fumble_recovery_2_player_id', 'fumble_recovery_2_player_name',
    'fumble_recovery_2_team', 'fumble_recovery_2_yards',
    'fumbled_1_player_id', 'fumbled_1_player_name', 'fumbled_1_team',
    'fumbled_2_player_id', 'fumbled_2_player_name', 'fumbled_2_team',
    'home_timeouts_remaining', 'home_wp', 'home_wp_post',
    'incomplete_pass',
    'interception', 'interception_player_id', 'interception_player_name',
    'kick_distance', 'kicker_player_id', 'kicker_player_name',
    'kickoff_attempt', 'kickoff_downed', 'kickoff_fair_catch',
    'kickoff_in_endzone', 'kickoff_inside_twenty', 'kickoff_out_of_bounds',
    'kickoff_returner_player_id', 'kickoff_returner_player_name',
    'lateral_interception_player_id', 'lateral_interception_player_name',
    'lateral_kickoff_returner_player_id', 'lateral_kickoff_returner_player_name',
    'lateral_punt_returner_player_id', 'lateral_punt_returner_player_name',
    'lateral_receiver_player_id', 'lateral_receiver_player_name',
    'lateral_reception', 'lateral_recovery', 'lateral_return', 'lateral_rush',
    'lateral_rusher_player_id', 'lateral_rusher_player_name',
    'lateral_sack_player_id', 'lateral_sack_player_name',
    'no_huddle', 'no_score_prob',
    'opp_fg_prob', 'opp_safety_prob', 'opp_td_prob',
    'own_kickoff_recovery', 'own_kickoff_recovery_player_id',
    'own_kickoff_recovery_player_name', 'own_kickoff_recovery_td',
    'pass_attempt',
    'pass_defense_1_player_id', 'pass_defense_1_player_name',
    'pass_defense_2_player_id', 'pass_defense_2_player_name',
    'pass_length', 'pass_location', 'pass_touchdown',
    'passer_player_id', 'passer_player_name',
    'penalty', 'penalty_player_id', 'penalty_player_name',
    'penalty_team', 'penalty_type', 'penalty_yards',
    'posteam_score', 'posteam_score_post', 'posteam_timeouts_remaining',
    'punt_attempt', 'punt_blocked', 'punt_downed', 'punt_fair_catch',
    'punt_in_endzone', 'punt_inside_twenty', 'punt_out_of_bounds',
    'punt_returner_player_id', 'punt_returner_player_name',
    'punter_player_id', 'punter_player_name',
    'qb_dropback', 'qb_hit',
    'qb_hit_1_player_id', 'qb_hit_1_player_name',
    'qb_hit_2_player_id', 'qb_hit_2_player_name',
    'qb_kneel', 'qb_scramble', 'qb_spike',
    'receiver_player_id', 'receiver_player_name',
    'replay_or_challenge', 'replay_or_challenge_result',
    'return_team', 'return_touchdown', 'return_yards',
    'run_gap', 'run_location',
    'rush_attempt', 'rush_touchdown',
    'rusher_player_id', 'rusher_player_name',
    'sack', 'safety', 'safety_prob',
    'score_differential_post',
    'shotgun',
    'solo_tackle',
    'solo_tackle_1_player_id', 'solo_tackle_1_player_name', 'solo_tackle_1_team',
    'solo_tackle_2_player_id', 'solo_tackle_2_player_name', 'solo_tackle_2_team',
    'tackle_for_loss_1_player_id', 'tackle_for_loss_1_player_name',
    'tackle_for_loss_2_player_id', 'tackle_for_loss_2_player_name',
    'tackled_for_loss',
    'td_prob', 'td_team',
    'third_down_converted', 'third_down_failed',
    'timeout', 'timeout_team',
    'total_away_comp_air_epa', 'total_away_comp_air_wpa',
    'total_away_comp_yac_epa', 'total_away_comp_yac_wpa',
    'total_away_epa', 'total_away_pass_epa', 'total_away_pass_wpa',
    'total_away_raw_air_epa', 'total_away_raw_air_wpa',
    'total_away_raw_yac_epa', 'total_away_raw_yac_wpa',
    'total_away_rush_epa', 'total_away_rush_wpa', 'total_away_score',
    'total_home_comp_air_epa', 'total_home_comp_air_wpa',
    'total_home_comp_yac_epa', 'total_home_comp_yac_wpa',
    'total_home_epa', 'total_home_pass_epa', 'total_home_pass_wpa',
    'total_home_raw_air_epa', 'total_home_raw_air_wpa',
    'total_home_raw_yac_epa', 'total_home_raw_yac_wpa',
    'total_home_rush_epa', 'total_home_rush_wpa', 'total_home_score',
    'touchdown',
    'two_point_attempt', 'two_point_conv_result', 'two_point_conversion_prob',
    'wp', 'wpa',
    'yac_epa', 'yac_wpa', 'yards_after_catch',
    'game_date', 'time',
]

# Discard the unused columns; every listed name must exist in the CSV,
# otherwise DataFrame.drop raises KeyError.
nfl = nfl.drop(labels=UNNEEDED_COLUMNS, axis="columns")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Snapshot with every non-null play_type kept as-is (kickoffs, no_play and the
# individual kick types included), taken before the categories are filtered
# and merged below.
nfl_backup = nfl.dropna(subset=["play_type"])

# Working frame: start from the rows that have a play_type ...
nfl1 = nfl.dropna(subset=["play_type"])

# ... and remove the rows whose play type is kickoff or no_play
excluded_play_types = ["kickoff", "no_play"]
nfl1 = nfl1[~nfl1["play_type"].isin(excluded_play_types)]

# Collapse extra_point, field_goal and punt into the single category 'kick'
kick_aliases = {"extra_point": "kick", "field_goal": "kick", "punt": "kick"}
nfl1["play_type"] = nfl1["play_type"].replace(kick_aliases)

# Drop the remaining columns that are not used from here on
nfl1 = nfl1.drop(
    columns=[
        "play_id",
        "game_id",
        "half_seconds_remaining",
        "game_half",
        "quarter_end",
        "drive",
        "sp",
        "goal_to_go",
        "ydsnet",
        "yards_gained",
    ]
)

nfl1.head()

Unnamed: 0,home_team,away_team,posteam,posteam_type,defteam,side_of_field,yardline_100,quarter_seconds_remaining,game_seconds_remaining,qtr,down,yrdln,ydstogo,play_type,score_differential
1,PIT,TEN,PIT,home,TEN,PIT,58.0,893.0,3593.0,1,1.0,PIT 42,10,pass,0.0
2,PIT,TEN,PIT,home,TEN,PIT,53.0,856.0,3556.0,1,2.0,PIT 47,5,run,0.0
3,PIT,TEN,PIT,home,TEN,PIT,56.0,815.0,3515.0,1,3.0,PIT 44,8,pass,0.0
4,PIT,TEN,PIT,home,TEN,PIT,56.0,807.0,3507.0,1,4.0,PIT 44,8,kick,0.0
5,PIT,TEN,TEN,away,PIT,TEN,98.0,796.0,3496.0,1,1.0,TEN 2,10,run,0.0


### One-Hot Encoding

In [4]:
def _dense_one_hot_encoder():
    """Return a OneHotEncoder configured to produce dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so ``OneHotEncoder(sparse=False)`` raises
    TypeError on current releases. Try the new keyword first and fall back to
    the old one for releases that predate the rename.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        return OneHotEncoder(sparse=False)


def _one_hot_encode(values):
    """Label-encode ``values`` and then one-hot encode the integer labels.

    Parameters:
        values: 1-D array of category values (one entry per row).

    Returns:
        A 4-tuple ``(label_encoder, labels, onehot_encoder, onehot)`` where
        ``labels`` is the integer encoding reshaped to one column and
        ``onehot`` is the dense one-hot matrix with one row per input value.
    """
    # encode as integers
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(values)
    # reshape to a single column, the 2-D layout OneHotEncoder expects
    labels = labels.reshape(len(labels), 1)
    # binary encode
    onehot_encoder = _dense_one_hot_encoder()
    onehot = onehot_encoder.fit_transform(labels)
    return label_encoder, labels, onehot_encoder, onehot


# one hot encode HOME_TEAM
nfl_home = array(nfl1["home_team"])
home_encoder, home_encoded, home_onehot_encoder, home_onehot_encoded = _one_hot_encode(nfl_home)
nfl1["home_team"] = home_onehot_encoded.tolist()

# one hot encode AWAY_TEAM
nfl_away = array(nfl1["away_team"])
away_encoder, away_encoded, away_onehot_encoder, away_onehot_encoded = _one_hot_encode(nfl_away)
nfl1["away_team"] = away_onehot_encoded.tolist()

# one hot encode QTR
quarter = array(nfl1["qtr"])
quarter_encoder, quarter_encoded, quarter_onehot_encoder, quarter_onehot_encoded = _one_hot_encode(quarter)
nfl1["qtr"] = quarter_onehot_encoded.tolist()

# one hot encode POSTEAM
posteam = array(nfl1["posteam"])
posteam_encoder, posteam_encoded, posteam_onehot_encoder, posteam_onehot_encoded = _one_hot_encode(posteam)
nfl1["posteam"] = posteam_onehot_encoded.tolist()

# one hot encode PLAY_TYPE
# NOTE: the result is deliberately bound to the home_* names, replacing the
# HOME_TEAM objects created above, because the following cell reads
# home_onehot_encoded to fill nfl1["play_type"]. The play_type column itself
# is not written in this cell.
nfl_home = array(nfl1["play_type"])
home_encoder, home_encoded, home_onehot_encoder, home_onehot_encoded = _one_hot_encode(nfl_home)

In [5]:
# home_team is intentionally NOT reassigned here. It already received its own
# one-hot encoding in the previous cell, and by this point home_onehot_encoded
# has been overwritten with the PLAY_TYPE encoding, so assigning it to
# home_team would replace the team vectors with play-type vectors.
nfl1["away_team"] = away_onehot_encoded.tolist()
nfl1["qtr"] = quarter_onehot_encoded.tolist()
nfl1["posteam"] = posteam_onehot_encoded.tolist()
# Despite its name, home_onehot_encoded holds the one-hot PLAY_TYPE matrix here.
nfl1["play_type"] = home_onehot_encoded.tolist()

In [6]:
# Display the column labels that remain in nfl1 after preprocessing
nfl1.columns

Index(['home_team', 'away_team', 'posteam', 'posteam_type', 'defteam',
       'side_of_field', 'yardline_100', 'quarter_seconds_remaining',
       'game_seconds_remaining', 'qtr', 'down', 'yrdln', 'ydstogo',
       'play_type', 'score_differential'],
      dtype='object')

### Chi Squared Tests

In [7]:
#Data visualization
#This version will compare categorical variables with the chi squared test
nfl_copy = nfl1.copy()


def chi_squared_independence(frame, index_var, columns_var):
    """Cross-tabulate two columns of ``frame`` and run a chi-squared test.

    Parameters:
        frame: DataFrame holding both columns.
        index_var: column name used for the rows of the contingency table.
        columns_var: column name used for the columns of the contingency table.

    Returns:
        ``(table, result)`` where ``table`` is the pd.crosstab contingency
        table and ``result`` is the chi2_contingency output; it is indexed
        below as [0] statistic, [1] p-value, [2] degrees of freedom.
    """
    table = pd.crosstab(index=frame[index_var], columns=frame[columns_var])
    return table, chi2_contingency(table)


def report_p_value(result, index_var, columns_var):
    """Print the verdict of the test by comparing its p-value with 0.05.

    NOTE(review): a p-value above 0.05 only means the test failed to reject
    independence; the "95% confident ... not correlated" wording is kept
    unchanged so the printed output stays identical - confirm the phrasing
    before reusing it in a report.
    """
    verdict = 'not correlated' if result[1] > 0.05 else 'correlated'
    print(f'We are 95% confident that {columns_var} and {index_var} are {verdict}.')


#play_type vs. each categorical variable, in the original reporting order.
#nfl_backup is used because it still holds the raw category values.
for variable in ['home_team', 'away_team', 'posteam', 'posteam_type',
                 'defteam', 'side_of_field', 'qtr']:
    crossTab, chiSq = chi_squared_independence(nfl_backup, variable, 'play_type')
    report_p_value(chiSq, variable, 'play_type')

#Test correlation: qtr vs. home_team
print("The following comparison is a test.")
crossTab, chiSq = chi_squared_independence(nfl_backup, 'home_team', 'qtr')
#Determine correlation with 95% confidence, first through the p-value ...
report_p_value(chiSq, 'home_team', 'qtr')
#... then by comparing the statistic with the 95th-percentile critical value
#of the chi-squared distribution for the table's degrees of freedom
critical = chi2.ppf(0.95, chiSq[2])
if (abs(chiSq[0]) >= critical):
    print("qtr and home_team are correlated.")
else:
    print("qtr and home_team are not correlated.")

We are 95% confident that play_type and home_team are correlated.
We are 95% confident that play_type and away_team are correlated.
We are 95% confident that play_type and posteam are correlated.
We are 95% confident that play_type and posteam_type are correlated.
We are 95% confident that play_type and defteam are correlated.
We are 95% confident that play_type and side_of_field are correlated.
We are 95% confident that play_type and qtr are correlated.
The following comparison is a test.
We are 95% confident that qtr and home_team are correlated.
qtr and home_team are correlated.
