# Feature Selection

Objective: perform exhaustive feature selection 
- Perform statistical tests on the features: examine the distribution and variance within each feature, and the distance between features
- Examine different scaling approaches (standardizing, centering, log transformations, etc.)
- Use spark_sklearn or spark directly for computationally challenging iterations
- Use the best model pipeline found in the campaign_targeting_model_BATCH notebook

In [19]:
import findspark

# Tell findspark where the local Spark 2.3.0 distribution lives so that the
# `pyspark` imports in the following cells resolve.
findspark.init(spark_home='/Users/pauldefusco/Documents/spark-2.3.0-bin-hadoop2.7')

In [22]:
from pyspark import SparkConf, SparkContext

# Local Spark context with 4 worker threads. getOrCreate returns the context
# that is already running (if any), so re-executing this cell in the same
# kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[4]"))
sc

In [23]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import GradientBoostingClassifier
from spark_sklearn import GridSearchCV

In [32]:
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [None]:
from pyspark.ml import Pipeline
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.ml.feature import StandardScaler, Binarizer

In [None]:
# Load the train/test feature and label tables from the data/ directory
# into pandas DataFrames.
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    ('data/X_train', 'data/X_test', 'data/y_train', 'data/y_test'),
)

In [34]:
# SQL entry point bound to the running SparkContext; its createDataFrame is
# used in the next cell to turn the pandas frames into Spark DataFrames.
sqlContext = SQLContext(sc)

In [35]:
# Convert each pandas frame into a Spark DataFrame, keeping the same
# train/test and feature/label split.
X_train_spark, X_test_spark, y_train_spark, y_test_spark = map(
    sqlContext.createDataFrame,
    (X_train, X_test, y_train, y_test),
)

In [36]:
# Peek at the first two training rows (returned as Row objects).
X_train_spark.head(2)

Row(age=30, campaign=1, pdays=999, previous=0, emp.var.rate=-1.8, cons.price.idx=92.89299999999999, cons.conf.idx=-46.2, euribor3m=1.2990000000000002, nr.employed=5099.1, month_total_days=31, is_holiday=0, biz_payday=0, govt_payday=0, poutcome_nonexistent=1, poutcome_success=0, job_blue-collar=1, job_entrepreneur=0, job_housemaid=0, job_management=0, job_retired=0, job_self-employed=0, job_services=0, job_student=0, job_technician=0, job_unemployed=0, job_unknown=0, marital_married=1, marital_single=0, marital_unknown=0, education_basic.6y=1, education_basic.9y=0, education_high.school=0, education_illiterate=0, education_professional.course=0, education_university.degree=0, education_unknown=0, default_unknown=1, default_yes=0, housing_unknown=0, housing_yes=1, loan_unknown=0, loan_yes=0, month_last_contact_aug=0, month_last_contact_dec=0, month_last_contact_jul=0, month_last_contact_jun=0, month_last_contact_mar=0, month_last_contact_may=1, month_last_contact_nov=0, month_last_contac

In [37]:
# Show the column names and the Spark types assigned to each column.
X_train_spark.printSchema()

root
 |-- age: long (nullable = true)
 |-- campaign: long (nullable = true)
 |-- pdays: long (nullable = true)
 |-- previous: long (nullable = true)
 |-- emp.var.rate: double (nullable = true)
 |-- cons.price.idx: double (nullable = true)
 |-- cons.conf.idx: double (nullable = true)
 |-- euribor3m: double (nullable = true)
 |-- nr.employed: double (nullable = true)
 |-- month_total_days: long (nullable = true)
 |-- is_holiday: long (nullable = true)
 |-- biz_payday: long (nullable = true)
 |-- govt_payday: long (nullable = true)
 |-- poutcome_nonexistent: long (nullable = true)
 |-- poutcome_success: long (nullable = true)
 |-- job_blue-collar: long (nullable = true)
 |-- job_entrepreneur: long (nullable = true)
 |-- job_housemaid: long (nullable = true)
 |-- job_management: long (nullable = true)
 |-- job_retired: long (nullable = true)
 |-- job_self-employed: long (nullable = true)
 |-- job_services: long (nullable = true)
 |-- job_student: long (nullable = true)
 |-- job_technician:

In [38]:
#X_train_spark.registerTempTable("X_train")
#X_test_spark.registerTempTable("X_test")
#y_train_spark.registerTempTable("y_train")
#y_test_spark.registerTempTable("y_test")

In [None]:
# Recreate the preprocessing pipeline (scaling and binarization) with pyspark.ml transformers

In [None]:
# Scale the "features" vector column to unit standard deviation without
# centering it; the result is written to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

In [None]:
# Threshold the input column at 0.5, writing the 0/1 result to
# "binarized_feature".
# NOTE(review): inputCol is "feature" (singular), while the scaler reads
# "features" and writes "scaledFeatures" -- confirm which column is intended.
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

In [None]:
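# Recreating the model: the scikit-learn GradientBoostingClassifier configuration below is kept as the reference to reproduce in Spark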
"""
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.5, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)
              """

In [None]:
# mllib's GradientBoostedTrees.trainClassifier consumes an RDD of LabeledPoint
# (label + feature vector), not a DataFrame of bare features, so each feature
# row is paired with its label before training.
from pyspark.mllib.regression import LabeledPoint

# NOTE(review): assumes the label is the last column of y_train and is already
# encoded as 0/1 (required by trainClassifier) -- confirm against data/y_train.
train_labels = y_train.iloc[:, -1].astype(float)
train_points = sc.parallelize([
    LabeledPoint(label, features)
    for label, features in zip(train_labels, X_train.values.astype(float))
])

# The sklearn reference model used n_estimators=100 and learning_rate=0.5;
# this run keeps the smaller exploratory settings.
model = GradientBoostedTrees.trainClassifier(
    train_points,
    categoricalFeaturesInfo={},  # empty map: every feature is treated as continuous
    loss='logLoss',
    maxDepth=3,
    maxBins=6,  # might need to change this
    numIterations=10,
)

In [None]:
# Number of trees in the trained ensemble.
model.numTrees()

In [None]:
# Chain the preprocessing stages defined in this notebook. `tokenizer`,
# `hashingTF` and `lr` are not defined anywhere here, so listing them as
# stages raises NameError.
# NOTE(review): the scaler expects a "features" vector column and the
# binarizer a "feature" column; neither is created in this notebook yet --
# confirm the upstream assembling step before calling pipeline.fit().
# NOTE(review): the mllib GradientBoostedTrees model is not a pyspark.ml
# Pipeline stage; pyspark.ml.classification.GBTClassifier is the
# Pipeline-compatible estimator if the classifier should be included.
pipeline = Pipeline(stages=[scaler, binarizer])

In [2]:
#sc.stop()