In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Direct Feature Engineering

In [19]:
# Load the cleaned training features, the test features, and the test
# salaries. All three files are read with the same ISO-8859-1 encoding.
hockey_train, hockey_test, hockey_test_y = (
    pd.read_csv(csv_path, encoding="ISO-8859-1")
    for csv_path in (
        './data/clean/train.csv',
        './data/clean/test.csv',
        './data/clean/test_salaries.csv',
    )
)

In [20]:
def combine_train_and_test(train_df, test_df, test_response):
    """Stack the training rows and the (feature + response) test rows.

    The test response is first attached to the test features column-wise
    (aligned on index), then the result is appended below the training
    frame.

    Parameters
    ----------
    train_df : DataFrame holding the training rows.
    test_df : DataFrame holding the test features.
    test_response : DataFrame/Series holding the response for the test rows.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index; column order is not re-sorted.
    """
    test_with_response = pd.concat([test_df, test_response], axis=1)
    stacked_frames = [train_df, test_with_response]
    return pd.concat(stacked_frames, ignore_index=True, sort=False)

In [21]:
# Merge the train split with the test split (features + salaries) into one
# frame so the feature engineering below is applied to every row.
hockey = combine_train_and_test(
    train_df=hockey_train,
    test_df=hockey_test,
    test_response=hockey_test_y,
)

In [22]:
# Derive 'Age' from the first two characters of 'Born', which are parsed as
# a two-digit birth year (assumes 'Born' is formatted yy-... -- TODO confirm
# against the raw data). Age is measured against 2017: 117 - yy for births
# in the 1900s.
# The modulo keeps the result a real age for two-digit years 00-17 as well
# (births in 2000-2017), where a plain 117 - yy would yield 100 or more.
# Values for yy in 18-99 are unchanged; missing 'Born' stays NaN.
birth_year_2d = pd.to_numeric(hockey['Born'].str[0:2])
hockey['Age'] = (117 - birth_year_2d) % 100
# hockey = hockey.drop(['Born', 'City', 'Last Name', 'First Name', 'Cntry'], axis = 1)

def nationality_group(df, nationalityCol):
    """Coarsen the nationality column into broader groups.

    Country codes for Sweden, Norway and Finland are relabelled
    'Scandanavian'; a fixed list of less common European codes is
    relabelled 'Other'. Every other value is left untouched. The original
    author's note states this takes the column from 16 unique values to 5,
    to prevent overfitting.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame containing the nationality column.
    nationalityCol : name of the column to regroup.
    """
    replacement_groups = (
        ('Scandanavian', ['SWE', 'NOR', 'FIN']),
        ('Other', ['CHE', 'CZE', 'FRA', 'DEU', 'SVK', 'AUT', 'DNK', 'LVA', 'HRV', 'GBR']),
    )
    for group_label, member_codes in replacement_groups:
        in_group = df[nationalityCol].isin(member_codes)
        df.loc[in_group, nationalityCol] = group_label
    return df
# Regroup the 'Nat' column into the coarser nationality buckets.
hockey = nationality_group(df=hockey, nationalityCol='Nat')

In [23]:
# Collapse provinces/states that appear fewer than 10 times into a single
# 'Other' bucket; rarely seen categories are an overfitting risk.
prs = (
    hockey.groupby('Pr/St')['Pr/St']
    .count()
    .rename_axis('pr/st')
    .rename('count')
    .reset_index()
)
extreneousStates = prs.loc[prs['count'] < 10, 'pr/st'].tolist()
is_rare_region = hockey['Pr/St'].isin(extreneousStates)
hockey.loc[is_rare_region, 'Pr/St'] = 'Other'

In [24]:
# Missing-value indicator columns
# They let a model account for the fact that a value was absent
def addIsNACol(df, col_name):
    """Add a 0/1 indicator column flagging missing values of ``col_name``.

    The new column is named ``<col_name>_is_na`` and holds 1 where the
    source column is NA and 0 elsewhere. The frame is modified in place
    and also returned.
    """
    indicator_name = col_name + '_is_na'
    missing_mask = df[col_name].isna()
    df[indicator_name] = missing_mask.astype('int64')
    return df

In [25]:
# Flag rows whose draft year ('DftYr') is missing.
hockey = addIsNACol(df=hockey, col_name='DftYr')

In [26]:
# Inspect the column index; the engineered 'Age' and 'DftYr_is_na' columns
# should now appear at the end.
hockey.columns

Index(['Salary', 'Born', 'City', 'Pr/St', 'Cntry', 'Nat', 'Ht', 'Wt', 'DftYr',
       'DftRd',
       ...
       'DPS', 'PS', 'OTOI', 'Grit', 'DAP', 'Pace', 'GS', 'GS/G', 'Age',
       'DftYr_is_na'],
      dtype='object', length=156)

In [28]:
# Spot-check that the indicator is 1 exactly where 'DftYr' is missing.
hockey.loc[:, ['DftYr', 'DftYr_is_na']].head(40)

Unnamed: 0,DftYr,DftYr_is_na
0,2015.0,0
1,2012.0,0
2,2006.0,0
3,2010.0,0
4,2012.0,0
5,1997.0,0
6,2009.0,0
7,,1
8,2010.0,0
9,2011.0,0


# Save Processed Data to be used by Model Pipeline

In [27]:
# Separate the predictors (every column except 'Salary') from the response.
X = hockey.drop(columns='Salary')
y = hockey['Salary']

In [28]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle reproducible, so re-running the notebook regenerates the same
# processed files instead of a different split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [29]:
# Write each split to its own CSV, without the index column.
# NOTE(review): y_train / y_test are taken from a single column, and whether
# a header row is written for a Series depends on the installed pandas
# version's to_csv default -- confirm against whatever reads these files.
processed_outputs = [
    (X_train, './data/processed/X_train.csv'),
    (X_test, './data/processed/X_test.csv'),
    (y_train, './data/processed/y_train.csv'),
    (y_test, './data/processed/y_test.csv'),
]
for split_data, destination in processed_outputs:
    split_data.to_csv(destination, index=False)

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
