In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from datetime import datetime
import gc
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
train = pd.read_csv('/Users/mr_lurie/1962_3450_bundle_archive/train.csv', encoding = "ISO8859-1")
test_x = pd.read_csv('/Users/mr_lurie/1962_3450_bundle_archive/test.csv', encoding = "ISO8859-1")
test_y = pd.read_csv('/Users/mr_lurie/1962_3450_bundle_archive/test_salaries.csv')

In [3]:
test_y=list(test_y['Salary'].values)

train_x = train.drop('Salary',axis=1)
train_y = list(train['Salary'])

train=[]
gc.collect()

20

In [4]:
# now i am going to use the python datetime functions to take the DOB column and 
# turn it into the age in days

In [5]:
def elapsed_days(start, end=datetime(2016,10,12)):
    """calculate the number of days start and end dates"""
    x = (end - start)
    return x.days

train_x['age_season_start'] = train_x.apply(lambda x: 
    elapsed_days(datetime.strptime(x['Born'], '%y-%m-%d')) ,axis=1)

test_x['age_season_start'] = test_x.apply(lambda x: 
    elapsed_days(datetime.strptime(x['Born'], '%y-%m-%d')) ,axis=1)

In [6]:
# now that we have altered the age column, we can drop the unneeded information. 

In [7]:
drop_cols = ['City', 'Pr/St', 'Cntry', 'Last Name', 'First Name', 'Team', 'Born']

test_x.drop(drop_cols, axis = 1, inplace = True)

train_x.drop(drop_cols, axis = 1, inplace = True)

In [8]:
# now that we have cleaned the data up, we can use parallel data propcessing pipelines to 
# impute the median for missing numerical values, binarize (one-hote encode) each of the 
# categorical columns and merge the numerical an categorical arrays into a single output.

# first we must identify the numerical and categorical columns, then write the DataFrameSelector
# class to pull these columns out of the input and use them in the processing pipelines

In [9]:
train_x.dtypes
for i in train_x.dtypes:
    print(i)
    
cat_attribs = ['Nat', 'Hand', 'Position']

num_attribs = list(train_x.drop(cat_attribs,axis=1).columns)

object
int64
int64
float64
float64
float64
object
object
int64
int64
int64
int64
int64
int64
int64
float64
int64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
int64
float64
float64
int64
float64
float64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
int64
float64
float64
float64
int64
int64
int64
int64
float64
float64
float64
float64
int64
int64
float64
float64
float64
float64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
float64
float64
int64
float64
int64
int64
int64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
int64
float64

In [10]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """this class will select a subset of columns, pass in the numerical or categorical columns as attribute names to get just those columns for processing"""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [11]:
# now I am going to use a class to employ the LabelBinarizer() function for multiple categorical
# columns at one. It returns a single binary arrray, and also has the self.classes_variable
# that keeps tracks of which variables are stored in which columns 

In [12]:
class MultiColBinarize(BaseEstimator, TransformerMixin):
    """take a df with multiple categoricals one hot encode them all and return the numpy array"""
    def __intit__(self, alter_df=True):
        self.alter_df = alter_df
    def fit(self, X, y=None):
        """loadthe data in, initiate the binarizer for each column"""
        self.X = X
        self.cols_list = list(self.X.columns)
        self.binarizers = []
        for i in self.cols_list:
            encoder = LabelBinarizer()
            encoder.fit(self.X[i])
            self.binarizers.append(encoder)
        return self
    def transform(self, X):
        """for each of the columns, use the existing binarizer to make new cols"""
        self.X = X
        self.binarized_cols = self.binarizers[0].transform(self.X[self.cols_list[0]])
        self.classes_ = list(self.binarizers[0].classes_)
        for i in range(1, len(self.cols_list)):
            binarized_col = self.binarizers[i].transform(self.X[self.cols_list[i]])
            self.binarized_cols = np.concatenate((self.binarized_cols , binarized_col), axis = 1)
            self.classes_.extend(list(self.binarizers[i].classes_))
        return self.binarized_cols

In [13]:
# now it is time to use the numerical processing and categorical functions on the data subsets
# using the following pipelines

In [14]:
num_pipeline = Pipeline([
                ('selector', DataFrameSelector(num_attribs)),
                ('imputer', SimpleImputer(strategy="median")),
                ('std_scaler', StandardScaler()),
])

In [15]:
cat_pipeline = Pipeline([
                ('selector', DataFrameSelector(cat_attribs)),
                ('label_binarizer', MultiColBinarize()),
])

In [16]:
# the two pipelines are called on the train data, and the output it concatenated into a single array

In [17]:
train_num_processed = num_pipeline.fit_transform(train_x)
train_cat_processed = cat_pipeline.fit_transform(train_x)

train_x_clean = np.concatenate((train_num_processed,train_cat_processed),axis=1)

In [18]:
#The test data is just transformed (not fit!), this is so we impute based on the 
# training data, and so the binarized columns match across the datasets.

In [19]:
test_num_processed = num_pipeline.transform(test_x)
test_cat_processed = cat_pipeline.transform(test_x)

test_x_clean = np.concatenate((test_num_processed,test_cat_processed),axis=1)

In [20]:
# double check that the number of columns are the same for both

In [21]:
train_x_clean.shape

(612, 179)

In [22]:
test_x_clean.shape

(262, 179)

In [23]:
svm_reg = SVR(kernel="linear")


svr_param_grid = [
        {'kernel': ['rbf','linear'], 'C': [1.0, 10., 100., 1000.0],
        'gamma': [0.01, 0.1,1.0]}
    ]


svm_grid_search = GridSearchCV(svm_reg, svr_param_grid, cv=5,
                                scoring='neg_mean_squared_error')

svm_grid_search.fit(train_x_clean, train_y)

svm_grid_search.best_params_

svm_grid_search.best_estimator_

cvres = svm_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

2604638.116815017 {'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}
2583847.3789636404 {'C': 1.0, 'gamma': 0.01, 'kernel': 'linear'}
2604674.5385669847 {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}
2583847.3789636404 {'C': 1.0, 'gamma': 0.1, 'kernel': 'linear'}
2604674.9600821286 {'C': 1.0, 'gamma': 1.0, 'kernel': 'rbf'}
2583847.3789636404 {'C': 1.0, 'gamma': 1.0, 'kernel': 'linear'}
2604302.047499914 {'C': 10.0, 'gamma': 0.01, 'kernel': 'rbf'}
2446427.358579565 {'C': 10.0, 'gamma': 0.01, 'kernel': 'linear'}
2604666.7505793753 {'C': 10.0, 'gamma': 0.1, 'kernel': 'rbf'}
2446427.358579565 {'C': 10.0, 'gamma': 0.1, 'kernel': 'linear'}
2604671.1476444025 {'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'}
2446427.358579565 {'C': 10.0, 'gamma': 1.0, 'kernel': 'linear'}
2600944.236543935 {'C': 100.0, 'gamma': 0.01, 'kernel': 'rbf'}
1917235.0625623104 {'C': 100.0, 'gamma': 0.01, 'kernel': 'linear'}
2604588.832106904 {'C': 100.0, 'gamma': 0.1, 'kernel': 'rbf'}
1917235.0625623104 {'C': 100.0, 'gamma': 0.1, 'kernel

In [24]:
#Random forest regression

In [25]:
forest_reg = RandomForestRegressor(random_state=42)

rf_param_grid = [
	{'n_estimators': [3, 10, 30,100,300,1000], 'max_features': [2, 4, 6, 8]},
	{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
rf_grid_search = GridSearchCV(forest_reg, rf_param_grid, cv=5,
						   scoring='neg_mean_squared_error')
rf_grid_search.fit(train_x_clean, train_y)


rf_grid_search.best_params_
rf_grid_search.best_estimator_

cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
	print(np.sqrt(-mean_score), params)


1832876.7138011067 {'max_features': 2, 'n_estimators': 3}
1659347.3868517643 {'max_features': 2, 'n_estimators': 10}
1607524.8652434794 {'max_features': 2, 'n_estimators': 30}
1586809.2120506605 {'max_features': 2, 'n_estimators': 100}
1581623.3700700344 {'max_features': 2, 'n_estimators': 300}
1572933.465961073 {'max_features': 2, 'n_estimators': 1000}
1798965.6693307506 {'max_features': 4, 'n_estimators': 3}
1672613.7100297958 {'max_features': 4, 'n_estimators': 10}
1583496.60584367 {'max_features': 4, 'n_estimators': 30}
1552608.3572376645 {'max_features': 4, 'n_estimators': 100}
1532299.8400930301 {'max_features': 4, 'n_estimators': 300}
1533056.9803372358 {'max_features': 4, 'n_estimators': 1000}
1918953.6477764542 {'max_features': 6, 'n_estimators': 3}
1608523.680968342 {'max_features': 6, 'n_estimators': 10}
1511815.0860214517 {'max_features': 6, 'n_estimators': 30}
1509531.4374952095 {'max_features': 6, 'n_estimators': 100}
1499043.157489585 {'max_features': 6, 'n_estimators': 

In [34]:
XGBoost_reg = xgb.XGBRegressor()

xgb_param_grid  = [{'min_child_weight': [20,25,30], 
                    
                    'learning_rate': [0.1, 0.2, 0.3], 
                    
                    'colsample_bytree': [0.9], 
                    
                    'max_depth': [5,6,7,8], 
                    
                    'reg_lambda': [1.], 
                    
                    'nthread': [-1], 
                    
                    'n_estimators': [100,1000,2000],
                    
                    'early_stopping_rounds':[50],
                    
                    'objective': ['reg:linear']}]


xgb_grid_search = GridSearchCV(XGBoost_reg, xgb_param_grid, cv=5,
                    scoring='neg_mean_squared_error', n_jobs=1)

xgb_grid_search.fit(train_x_clean, train_y)



xgb_grid_search.best_params_

xgb_grid_search.best_estimator_

cvres = xgb_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
	print(np.sqrt(-mean_score), params)











KeyboardInterrupt: 