# Purpose

- This notebook analyzes dataframes to determine how suitable they are for machine learning.
- It contains many functions for creating and testing different dataframe transformations and for applying classifications to them.

# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import random
import sys
import seaborn as sns
import pyodbc
from seaborn import load_dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
#import warnings
#warnings.simplefilter(action='ignore', category=FutureWarning)

# GRIDSEARCH
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# SCORING
from sklearn.metrics import (confusion_matrix,
                             accuracy_score,
                             mean_squared_error,
                             r2_score)
# TEXT HANDLING
from sklearn.feature_extraction.text import (CountVectorizer,
                                             HashingVectorizer,
                                             TfidfVectorizer)
# ENCODINGS
from sklearn.preprocessing import (Binarizer,
                                   FunctionTransformer,
                                   LabelBinarizer,
                                   PolynomialFeatures,
                                   RobustScaler)
# LINEAR CLASSIFICATIONS
from sklearn.linear_model import (LinearRegression,
                                  Ridge,
                                  Lasso,
                                  ElasticNet,
                                  LogisticRegression)
# NON-LINEAR CLASSIFICATIONS
from sklearn.tree import (DecisionTreeRegressor,
                          DecisionTreeClassifier)
from sklearn.neighbors import (KNeighborsRegressor,
                               KNeighborsClassifier)

# VECTORIZATION FUNCTIONS

In [3]:
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: vectorized dataframe with the target untouched
# Vectorization = CountVectorizer
def CntVec(df,target):
    """Count-vectorize the object columns of ``df`` and return features plus target.

    Every object-dtype feature column is expanded into binary token columns by
    ``CountVectorizer(binary=True)``. Numeric feature columns are carried over
    unchanged and the target column is appended last. Rows containing any NaN
    are dropped from the result.

    The vectorized columns are built on the index of ``df`` so they stay aligned
    with the numeric columns and the target even when ``df`` does not have a
    default 0..n-1 index. They are labelled ``"<column>__<token>"`` so the
    result has no duplicated integer labels.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the target column.

    Returns:
        DataFrame of vectorized + numeric features followed by the target.
    """
    # split into X and y datasets
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    # extract numerical and object columns from X dataset
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    X_num = X_init.select_dtypes(include=numerics)
    X_obj = X_init.select_dtypes(include='object')
    print('Count vectorizing...')
    # collect one frame per object column and concatenate once at the end
    pieces = []
    for col in X_obj.columns:
        vect = CountVectorizer(binary=True)
        arr = vect.fit_transform(X_obj[col]).toarray()
        # vocabulary_ maps token -> output column position; sort tokens into column order
        terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
        labels = ['%s__%s' % (col, term) for term in terms]
        pieces.append(pd.DataFrame(arr, index=X_obj.index, columns=labels))
    # every piece shares df's index, so a plain column-wise concat keeps rows aligned
    X_prime = pd.concat(pieces + [X_num], axis=1)
    # drop any rows that still contain NaNs (e.g. missing numeric values or target)
    nadrop = pd.concat([X_prime, y_init], axis=1).dropna()
    print('The vectorized data has shape:',nadrop.shape,'\n')
    return nadrop

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: vectorized dataframe with the target untouched
# Vectorization = TfidfVectorizer
def TfdVec(df,target):
    """Tf-idf-vectorize the object columns of ``df`` and return features plus target.

    Every object-dtype feature column is cast to unicode strings and expanded
    into tf-idf weighted token columns by ``TfidfVectorizer()``. Numeric feature
    columns are carried over unchanged and the target column is appended last.
    Rows containing any NaN are dropped from the result.

    Column selection is done on the feature frame only, so the target is never
    vectorized or duplicated among the features. The vectorized columns are
    built on the index of ``df`` and labelled ``"<column>__<token>"``.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the target column.

    Returns:
        DataFrame of vectorized + numeric features followed by the target.
    """
    # split into X and y datasets
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    # extract numerical and object columns from the FEATURES only (not from df,
    # which still contains the target)
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    X_num = X_init.select_dtypes(include=numerics)
    X_obj = X_init.select_dtypes(include='object')
    print('Tfidf vectorizing...')
    # collect one frame per object column and concatenate once at the end
    pieces = []
    for col in X_obj.columns:
        vect = TfidfVectorizer()
        arr = vect.fit_transform(X_obj[col].values.astype('U')).toarray()
        # vocabulary_ maps token -> output column position; sort tokens into column order
        terms = sorted(vect.vocabulary_, key=vect.vocabulary_.get)
        labels = ['%s__%s' % (col, term) for term in terms]
        pieces.append(pd.DataFrame(arr, index=X_obj.index, columns=labels))
    # every piece shares df's index, so a plain column-wise concat keeps rows aligned
    X_prime = pd.concat(pieces + [X_num], axis=1)
    # drop any rows that still contain NaNs (e.g. missing numeric values or target)
    nadrop = pd.concat([X_prime, y_init], axis=1).dropna()
    print('The vectorized data has shape:',nadrop.shape,'\n')
    return nadrop

# ENCODING FUNCTIONS

In [4]:
# INPUT: pandas dataframe
# OUTPUT: dataframe with RobustScaler applied
# Encoding = RobustScaler
def RobScale(df):
    """Scale ``df`` with ``RobustScaler(with_centering=False)`` and drop NaN rows.

    When ``df`` is a DataFrame its index and column labels are kept on the
    result, so callers can join the scaled features back to other columns of
    the same frame by label. Any other array-like input gets default labels.

    Args:
        df: DataFrame (or 2-D array-like) of numeric features.

    Returns:
        DataFrame of scaled values with every row containing a NaN removed.
    """
    scaler = RobustScaler(with_centering=False)
    print('Robust fitting...')
    fitted = scaler.fit(df)
    print('Robust scaling...')
    # np.asarray: work on plain values whatever container transform() returns
    scaled = np.asarray(fitted.transform(df))
    print('Pandas filling...')
    if isinstance(df, pd.DataFrame):
        dfit = pd.DataFrame(scaled, index=df.index, columns=df.columns)
    else:
        dfit = pd.DataFrame(scaled)
    dfit = dfit.dropna()
    print('The scaled data has shape:',dfit.shape,'\n')
    return dfit

# Progress marker: printed once when this cell executes, after RobScale has been defined.
# NOTE(review): looks like leftover debugging output - confirm it is still wanted.
print('here 1')

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: robust scaled and encoded dataframe with the target untouched
# Encoding = Binarizer
def Binz(df, target):
    """Robust-scale then binarize the features of ``df``; the target is left untouched.

    Only the feature columns are scaled and binarized (with ``Binarizer()`` at
    its default settings). The target column is appended afterwards and rows
    containing any NaN are dropped. The result carries the index labels of the
    surviving rows of ``df``.

    Args:
        df: DataFrame holding numeric feature columns plus the ``target`` column.
        target: Label of the target column.

    Returns:
        DataFrame of binarized features followed by the target.
    """
    # split into X and y datasets
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    # work on positional (0..n-1) indexes so features and target line up row by
    # row whether or not RobScale keeps index labels
    X_pos = X_init.reset_index(drop=True)
    y_pos = y_init.reset_index(drop=True)
    dum = Binarizer()
    # scale the FEATURES only; scaling df would let the target leak into them
    scaled = RobScale(X_pos)
    print('Binarizer fitting...')
    fit = dum.fit(scaled)
    print('Binarizer transforming...')
    # reuse scaled.index so rows RobScale dropped stay dropped, not shifted
    dfit = pd.DataFrame(np.asarray(fit.transform(scaled)), index=scaled.index)
    # rows missing on either side come out as NaN and are removed here
    dfity = pd.concat([dfit, y_pos], axis=1).dropna()
    # translate surviving row positions back to the original index labels
    dfity.index = y_init.index.take(dfity.index.values)
    print('The encoded data has shape:',dfity.shape,'\n\n')
    return dfity

# Progress marker: printed once when this cell executes, after Binz has been defined.
# NOTE(review): looks like leftover debugging output - confirm it is still wanted.
print('here 2')

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: robust scaled and encoded dataframe with the target untouched
# Encoding = FunctionTransformer
def FncTran(df, target):
    """Robust-scale the features of ``df`` and pass them through ``FunctionTransformer()``.

    ``FunctionTransformer`` is created with no function argument. The target
    column is appended afterwards and rows containing any NaN are dropped. The
    result carries the index labels of the surviving rows of ``df``.

    Args:
        df: DataFrame holding numeric feature columns plus the ``target`` column.
        target: Label of the target column.

    Returns:
        DataFrame of transformed features followed by the target.
    """
    # split into X and y datasets
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    # work on positional (0..n-1) indexes so features and target line up row by
    # row whether or not RobScale keeps index labels
    X_pos = X_init.reset_index(drop=True)
    y_pos = y_init.reset_index(drop=True)
    dum = FunctionTransformer()
    scaled = RobScale(X_pos)
    print('Function transformer fitting...')
    fit = dum.fit(scaled)
    print('Function transforming...')
    transformed = fit.transform(scaled)
    if isinstance(transformed, pd.DataFrame):
        dfit = transformed
    else:
        # reuse scaled.index so rows RobScale dropped stay dropped, not shifted
        dfit = pd.DataFrame(transformed, index=scaled.index)
    # rows missing on either side come out as NaN and are removed here
    dfity = pd.concat([dfit, y_pos], axis=1).dropna()
    # translate surviving row positions back to the original index labels
    dfity.index = y_init.index.take(dfity.index.values)
    print('The encoded data has shape:',dfity.shape,'\n\n')
    return dfity

here 1
here 2


# LINEAR CLASSIFICATION FUNCTIONS

In [6]:
### BACKUP LINES FOR ACCURACY SCORE AND CONFUSION
#   acc_score = accuracy_score(y_test, pred.predict(X_test))  
#   conf_matrix = confusion_matrix(y_test, pred.predict(X_test))
#   print('The accuracy score is: \t\t%s'%acc_score)
#   print('The confusion matrix is:',conf_matrix)

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = LinearRegression
def LinReg(df,target):
    """Fit LinearRegression on a 70/30 split of ``df`` and print held-out scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted LinearRegression estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = LinearRegression()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred
  
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = LogisticRegression
def LogReg(df,target):
    """Fit LogisticRegression on a 70/30 split of ``df`` and print held-out scores.

    The printed mean squared error and R2 are computed on the predicted class
    labels, which presumes the target labels are numeric.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted LogisticRegression estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = LogisticRegression()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = Ridge
def RidgeClass(df,target):
    """Fit Ridge on a 70/30 split of ``df`` and print held-out scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted Ridge estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = Ridge()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred
  
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = Lasso
def LassoClass(df,target):
    """Fit Lasso on a 70/30 split of ``df`` and print held-out scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted Lasso estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = Lasso()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred
  
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = ElasticNet
def ElastNet(df,target):
    """Fit ElasticNet on a 70/30 split of ``df`` and print held-out scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted ElasticNet estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = ElasticNet()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred

# NON-LINEAR CLASSIFICATION FUNCTIONS

In [7]:
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = DecisionTreeRegressor
def TreeReg(df,target):
    """Fit DecisionTreeRegressor on a 70/30 split of ``df`` and print held-out scores.

    The tree is seeded with the same fixed seed as the split so repeated runs
    on the same data give the same model and scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted DecisionTreeRegressor estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    # fixed seed: tree construction is otherwise randomized between runs
    pred = DecisionTreeRegressor(random_state=42)
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred
  
# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = DecisionTreeClassifier
def TreeClass(df,target):
    """Fit DecisionTreeClassifier on a 70/30 split of ``df`` and print held-out scores.

    The tree is seeded with the same fixed seed as the split so repeated runs
    on the same data give the same model and scores. The printed mean squared
    error and R2 are computed on the predicted class labels, which presumes
    the target labels are numeric.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted DecisionTreeClassifier estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    # fixed seed: tree construction is otherwise randomized between runs
    pred = DecisionTreeClassifier(random_state=42)
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = KNeighborsRegressor
def KNNReg(df,target):
    """Fit KNeighborsRegressor on a 70/30 split of ``df`` and print held-out scores.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted KNeighborsRegressor estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = KNeighborsRegressor()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred

# INPUT: pandas dataframe and the string representation of the target column
# OUTPUT: predictive model based on the dataframe
# Model = KNeighborsClassifier
def KNNClass(df,target):
    """Fit KNeighborsClassifier on a 70/30 split of ``df`` and print held-out scores.

    The printed mean squared error and R2 are computed on the predicted class
    labels, which presumes the target labels are numeric.

    Args:
        df: DataFrame holding the feature columns plus the ``target`` column.
        target: Label of the column to predict.

    Returns:
        The fitted KNeighborsClassifier estimator.
    """
    X_init = df.drop(target, axis=1)
    y_init = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X_init,y_init,train_size=0.7,random_state=42)
    pred = KNeighborsClassifier()
    pred.fit(X_train, y_train)
    # predict once and reuse the result for both metrics
    y_hat = pred.predict(X_test)
    msq = mean_squared_error(y_test, y_hat)
    r2 = r2_score(y_test, y_hat)
    print('The mean squared error is: \t\t%s'%msq)
    print('The R2 score is: \t\t\t%s'%r2)
    return pred

# GETTING THE DATA
- We are going to look at a dataset of landslides
- This could be interesting, or boring. Not sure yet...

In [8]:
## Past Year of YieldPlusQNs for BWI ##
# connecting to the database
cnxn_fttiy = pyodbc.connect('Driver={SQL Server};'
                      'Server=EIM-DB-AG40.NORTHGRUM.COM;'
                      'Database=j20032_yield;'
                      'Trusted_Connection=yes;')

# defining the sql query string
# NOTE: string is surrounded by three double-quotes ("""<text>"""), must exclude """ from query
sql_fttiy = """
SELECT * 
    FROM [j20032_yield].[dbo].[YieldPlusQNs] 
        WHERE [DATE] >= DATEADD(Year, -1, getdate()) 
            AND [OROP_PLNT_ID] = 'P001'
            """
# always release the connection, even if the query fails
try:
    df = pd.read_sql(sql_fttiy,cnxn_fttiy)
finally:
    cnxn_fttiy.close()

# getting rid of any leading/trailing whitespace
# only string values are stripped; other values (None, numbers) are left as they
# are instead of causing the whole column to be skipped
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

# DESCRIPTIVE TOOLS

In [9]:
# # useful descriptive tools
# df.head()
# df.shape
# df.dtypes
# df.describe()

# ANALYSIS TOOLS

In [10]:
# # getting a dictionary with counts of unique elements in each columns
# unq_cnts = []
# for x in df.columns:
#     unq_cnts = unq_cnts + [{'cols':x,'cnts':df[x].unique().shape[0]}]

# # making countplots for each column with less than 50 unique values, counting the number of instances of each variable
# for col in df.columns:
#     if df[col].unique().shape[0] < 50:
#         sns.countplot(data=df, y=col, palette='Blues_r', order=df[col].value_counts().index)
#         plt.show()

# # sorting the dictionary of unique counts
# # columns with the fewest unique values are better targets
# col_unqs = sorted(unq_cnts, key = lambda i: i['cnts'])
# targ_cols = col_unqs[0:5]

# DATA PREPARATION TOOLS

In [None]:
# # dropping nan values
# df = df.dropna(axis=0, subset=['landslide_size'])

# # example of creating a dictionary of what to fill NaNs with in each column
# values = {'admin_division_name': 'unknown', 'admin_division_population': df.admin_division_population.median(), 'country_code': 'UNK', 
#           'country_name': 'NA', 'created_date':'2017-11-20T15:17:00.000', 'event_description':'NA', 
#           'fatality_count':df.fatality_count.median(),'gazeteer_closest_point':'NA', 
#           'gazeteer_distance':df.gazeteer_distance.median(), 'injury_count':df.injury_count.median(), 'landslide_setting':'NA', 'landslide_trigger':'NA',
#           'location_accuracy':'unknown', 'location_description':'NA', 'notes':'NA', 'photo_link':'NA', 'source_link':'NA', 'storm_name':'NA', 'submitted_date':'NA'}
# df = df.fillna(value=values)

# # example for numerizing the target
# mapping = {'small':1, 'medium':2, 'large':3, 'very_large':4, 'catastrophic':5}
# df = df.replace({'landslide_size':mapping})

# BEGIN ANALYZING THE DATA
- the functions for ML tools have been tested and are ready for use
- need to define target before anything

In [None]:
target = 'landslide_size'
# Stop here with an explicit message when the loaded frame has no such column,
# rather than failing later inside the vectorization functions.
if target not in df.columns:
    raise KeyError("target column %r not found in df; load and prepare a dataset "
                   "that contains it before running the cells below" % target)

In [None]:
# vectorizations
# Build two feature frames from df, one per text-vectorization function defined
# above; each call returns the vectorized features together with the target column.

C_vect = CntVec(df, target)
T_vect = TfdVec(df, target)

In [None]:
# encodings
# Apply each encoding function (FncTran, Binz) to each vectorized frame, giving
# the four feature-engineering combinations used by the model comparison below.

CntFnc = FncTran(C_vect, target)
TfdFnc = FncTran(T_vect, target)
CntBin = Binz(C_vect, target)
TfdBin = Binz(T_vect, target)

In [None]:
# # classifications

# Each model is trained once per feature-engineering combination. The loop
# prints the same banners, in the same order, as running every combination by hand.

# (variable-name prefix, description printed in the banner, engineered frame)
feature_sets = [
    ('CntFnc', 'CountVectorizer + FunctionTransformer', CntFnc),
    ('TfdFnc', 'TfidfVectorizer + FunctionTransformer', TfdFnc),
    ('CntBin', 'CountVectorizer + Binarizer', CntBin),
    ('TfdBin', 'TfidfVectorizer + Binarizer', TfdBin),
]

# (banner title, variable-name suffix, training function, prefixes to skip)
# Logistic regression is not run on the Tfidf frames.
model_runs = [
    ('LINEAR REGRESSION', 'Lin', LinReg, ()),
    ('LOGISTIC REGRESSION', 'Log', LogReg, ('TfdFnc', 'TfdBin')),
    ('RIDGE CLASSIFIER', 'Ridge', RidgeClass, ()),
    ('LASSO CLASSIFIER', 'Lasso', LassoClass, ()),
    ('ELASTIC NET', 'Elast', ElastNet, ()),
    ('DECISION TREE REGRESSOR', 'TreeReg', TreeReg, ()),
    ('DECISION TREE CLASSIFIER', 'TreeClass', TreeClass, ()),
    ('KNN REGRESSOR', 'KNNReg', KNNReg, ()),
    ('KNN CLASSIFIER', 'KNNClass', KNNClass, ()),
]

# fitted estimators keyed by prefix + suffix, e.g. 'CntFncLin', 'TfdBinKNNClass'
fitted_models = {}
for title, suffix, train_model, skipped in model_runs:
    print('\n\n**** MODEL = %s *********************' % title)
    # the first combination under a banner is preceded by one newline, the rest by two
    lead = '\n'
    for prefix, description, frame in feature_sets:
        if prefix in skipped:
            continue
        print('%s\tFEATURE ENGINEERING = %s' % (lead, description))
        fitted_models[prefix + suffix] = train_model(frame, target)
        lead = '\n\n'

# also expose every fitted model under its individual variable name
# (CntFncLin, TfdFncLin, ...) so code that refers to those names keeps working
globals().update(fitted_models)

# Notes on Classifications
- the r2 score doesn't help us much unless we are talking about linear/logistic regression
- the mean squared error is always helpful: it is the average squared distance between the model's predictions and the true values

# RESULTS
- based on the mean squared error we can see that:
  - Lasso and ElasticNet models predict with highest accuracy
  - the two vectorizations for these two classifications produce Lasso/ENet models with similar accuracy
  - the Binarizer produces a slightly more accurate Lasso/ENet model than the FunctionTransformer

# GRID SEARCHING
- picking the following combination of methods to include in the pipeline:
  - CountVectorizer
  - Binarizer
  - RobustScaler
  - ElasticNet

In [None]:
# REFRESHING THE DATA SOURCE

import json
import urllib.request

# download the JSON resource and load it into a fresh dataframe
with urllib.request.urlopen("https://data.nasa.gov/resource/tfkf-kniw.json") as url:
    df = pd.DataFrame(json.loads(url.read().decode()))

# what if we implemented the vectorization here?...
# cast every column that converts cleanly to float; columns holding values that
# cannot be converted (text, nested objects) are left as they are
for x in df.columns:
    try:
        df[x] = df[x].astype('float')
    except (ValueError, TypeError):
        pass

# we'll have to fill nans, remove unknowns in all data columns
# two steps so the 'unknown' mask is built from the already-filtered frame and
# its index matches the rows it selects
df = df.dropna(axis=0, subset=['landslide_size'])
df = df[df['landslide_size'] != 'unknown']

# creating dictionary of what to fill nans with in each column
values = {'admin_division_name': 'unknown', 'admin_division_population': df.admin_division_population.median(), 'country_code': 'UNK', 
          'country_name': 'NA', 'created_date':'2017-11-20T15:17:00.000', 'event_description':'NA', 'event_import_id': df.event_import_id.mean(), 
          'event_import_source': 'NA', 'fatality_count':df.fatality_count.median(),'gazeteer_closest_point':'NA', 
          'gazeteer_distance':df.gazeteer_distance.median(), 'injury_count':df.injury_count.median(), 'landslide_setting':'NA', 'landslide_trigger':'NA',
          'location_accuracy':'unknown', 'location_description':'NA', 'notes':'NA', 'photo_link':'NA', 'source_link':'NA', 'storm_name':'NA', 'submitted_date':'NA'}
df = df.fillna(value=values)

# we have to numerize the target
mapping = {'small':1, 'medium':2, 'large':3, 'very_large':4, 'catastrophic':5}
df = df.replace({'landslide_size':mapping})

# dropping unnecessary columns
cols = ['source_link','created_date','submitted_date','photo_link','event_description']
df = df.drop(columns=cols)

# vectorizing the data
df = CntVec(df, 'landslide_size')

In [None]:
target = 'landslide_size'

# plain NumPy arrays for the pipeline; .values replaces DataFrame.as_matrix(),
# which no longer exists in current pandas
X = df.drop(target, axis=1).values
y = df[target].values

# hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)

In [None]:
# Three-stage pipeline, every stage at its default settings:
# robust scaling -> binarization -> elastic-net regression.
pipeline = Pipeline([
    ('rs', RobustScaler()),
    ('binz', Binarizer()),
    ('en', ElasticNet()),
])

# baseline fit on the training split before any hyper-parameter search
pipeline.fit(X_train, y_train)

In [None]:
# LITTLE SEARCH

# candidate values per pipeline step, keyed as '<step name>__<parameter>'
parameters = {'rs__with_centering':[True],
              'rs__with_scaling':[True],
#               'rs__copy':[True,False],
#               'binz__copy':[True,False],
              'binz__threshold':[3.15,3.2,3.25,3.3],
              'en__alpha':[0.0,0.5],
#               'en__l1_ratio':[0.0,0.25],
              'en__fit_intercept':[True,False],
#               'en__precompute':[True,False],
#               'en__warm_start':[True,False],
#               'en__positive':[True,False]
             }

# 'normalize' is searched only when the installed ElasticNet exposes it; passing
# a parameter the estimator does not have makes the grid search fail
if 'en__normalize' in pipeline.get_params():
    parameters['en__normalize'] = [True,False]

search = GridSearchCV(pipeline, parameters, cv=5,n_jobs=-1, verbose=3)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# # BIG SEARCH

# parameters = {'rs__with_centering':[True],
#               'rs__with_scaling':[True],
#               'rs__copy':[True,False],
#               'binz__copy':[True,False],
#               'binz__threshold':[3.15,3.2,3.25,3.3],
#               'en__alpha':[0.0,0.5],
#               'en__l1_ratio':[0.0,0.25],
#               'en__fit_intercept':[True,False],
#               'en__normalize':[True,False],
#               'en__precompute':[True,False],
#               'en__warm_start':[True,False],
#               'en__positive':[True,False]
#              }

# search = GridSearchCV(pipeline, parameters, cv=5,n_jobs=-1, verbose=3)
# search.fit(X_train, y_train)
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# print(search.best_params_)