In [1]:
import pandas as pd
import numpy as np

In [2]:
ind_games = pd.read_csv("data/final_data/all_games_16_18.csv")

In [3]:
ind_games = ind_games.drop('Team2_int_re_number',axis=1)

In [4]:
clean_ind_games = ind_games.drop(['game_id','Team1_team_id','Team2_team_id','Team1','Team2'],axis=1)

### Target split

In [5]:
clean_ind_games['Team2_win'].value_counts()/clean_ind_games.shape[0]

1    0.622561
0    0.377439
Name: Team2_win, dtype: float64

### Re-shuffle dataframe to get equal target variables 

In [6]:
target_yes = clean_ind_games[clean_ind_games['Team2_win'] == 1]
target_no = clean_ind_games[clean_ind_games['Team2_win']==0]

In [7]:
sample_no = int((target_yes.shape[0] - target_no.shape[0])/2)

In [8]:
flip_df = target_yes.sample(n=sample_no)
remove_index = flip_df.index.tolist()

In [9]:
flip_df.head()

Unnamed: 0,Team1_team_score,Team2_team_score,Team1_first_downs,Team1_passing_first_downs,Team1_rushing_first_downs,Team1_rushing_yds,Team1_rushing_attempts,Team1_passing_attempts,Team1_passing_completions,Team1_passing_interceptions,...,Team2_third_down_suc,Team2_fourth_down_attempts,Team2_fourth_down_suc,Team1_third_down_conv_pct,Team2_third_down_conv_pct,Team1_fourth_down_conv_pct,Team2_fourth_down_conv_pct,Team1_turnover_margin,Team2_turnover_margin,Team2_win
347,13,26,15,4,8,91,31,28,12,1,...,6,1,1,0.357143,0.375,0.0,1.0,-1,1,1
1890,24,31,24,12,9,158,48,31,16,1,...,7,1,1,0.333333,0.538462,0.666667,1.0,1,-1,1
2149,24,47,25,6,19,96,21,65,38,1,...,4,2,1,0.526316,0.333333,0.0,0.5,1,-1,1
162,20,27,15,11,2,310,49,21,5,3,...,3,2,1,0.294118,0.230769,0.25,0.5,3,-3,1
868,14,56,19,10,7,157,41,21,11,1,...,9,0,0,0.416667,0.75,0.0,0.0,1,-1,1


In [10]:
orig_cols = flip_df.columns.tolist()

In [11]:
target_yes = target_yes.drop(remove_index)

In [12]:
### Rename and flip values
t1_cols_rename = flip_df[flip_df.columns[flip_df.columns.str.contains('Team1')]].columns.tolist()
sep_df = flip_df[t1_cols_rename]
t1_cols_rename = [word.replace('1','2') for word in t1_cols_rename]
sep_df.columns = t1_cols_rename

t2_cols_rename = flip_df[flip_df.columns[flip_df.columns.str.contains('Team2')]].columns.tolist()
sep_df2 = flip_df[t2_cols_rename]
t2_cols_rename = [word.replace('2','1') for word in t2_cols_rename]
sep_df2.columns = t2_cols_rename

In [13]:
flip_df = pd.concat([sep_df,sep_df2],axis=1)
flip_df = flip_df.drop('Team1_win',axis=1)
flip_df['Team2_win'] = np.where(flip_df['Team2_team_score']>flip_df['Team1_team_score'],1,0)
flip_df = flip_df[orig_cols]

In [14]:
target_yes.shape

(1179, 63)

In [15]:
target_no = pd.concat([target_no,flip_df],axis=0)

In [16]:
clean_ind_games2 = pd.concat([target_yes,target_no],axis=0).sample(frac=1).reset_index(drop=True)

In [17]:
clean_ind_games2.to_csv('data/final_data/clean_shuffled_games.csv',index=False)

### Create Scale Model 

In [18]:
scale_df = clean_ind_games2.drop(['Team2_win','Team1_team_score','Team2_team_score'],axis=1).convert_objects(convert_numeric=True)
scale_df = scale_df.fillna(0)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(scale_df)

In [20]:
from sklearn.externals import joblib 
scaler_filename = "models/scaler.sav"
joblib.dump(scaler, scaler_filename) 

['models/scaler.sav']

### Train/Test

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
clean_ind_games2 = clean_ind_games2.drop(['Team1_team_score','Team2_team_score'],axis=1).convert_objects(convert_numeric=True)
clean_ind_games2 = clean_ind_games2.fillna(0)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """Entry point for launching an IPython kernel.


In [23]:
trainingSet, testSet = train_test_split(clean_ind_games2, test_size=0.3,stratify=None)

In [24]:
trainingSet.to_csv("data/final_data/clean_train.csv",index=False)
testSet.to_csv("data/final_data/clean_test.csv",index=False)

### Prepare Data to search through

In [25]:
search_df = ind_games.drop(['Team1_team_id','Team2_team_id','Team1_team_score','Team2_team_score',
                           'Team2_win'],axis=1)

In [33]:
search_df.shape

(2358, 63)

In [26]:
game_ids = search_df['game_id']

In [27]:
team1_df = search_df[search_df.columns[search_df.columns.str.contains('Team1')].tolist()]
team2_df = search_df[search_df.columns[search_df.columns.str.contains('Team2')].tolist()]

In [28]:
t1_cols_rename = team2_df.columns
t1_cols_rename = [word.replace('2','1') for word in t1_cols_rename]
team2_df.columns = t1_cols_rename

In [29]:
search_df_fin = pd.concat([pd.concat([game_ids,team1_df],axis=1),pd.concat([game_ids,team2_df],axis=1)],axis=0)

In [34]:
search_df_fin.shape

(4716, 32)

In [30]:
search_df_fin.to_csv('data/final_data/team_games_data.csv',index=False)