________
<a id="top"></a>
# DS 7331 Data Mining: Lab 2 iPython Notebook
Created On: February 11, 2019
### Authors:  
- Arora, Tanvi                
- Chandna, Rajat
- Henderson Kuns, Nicol
- Ramasundaram, Kumar
- Vasquez, James
LRInterpertFeat

# Logisitic Regression and Support Vector Machines

## Contents
* <a href="#DataPrep">Data Prepping</a>
    * <a href="#onehotencode">One Hot Encoding</a>
    * <a href="#Perform8020split">Perform 80/20 split</a>  
    * <a href="#PrepTestData">Prep Test Data</a>    
* <a href="#CreateLRModel">Create Models</a>
    * <a href="#CreateLRModel">Simple Logistic Regression Model</a>  
    * <a href="#LRGridSearch">Grid Search</a>   
    * <a href="#LRInterpertFeat">Feature Interpertation</a>   
* <a href="#SVMModel">Simple SVM Model</a>
    * <a href="#SVMRBF">RBF Grid Search</a>   
    * <a href="#SVMPOLY">Poly Grid Search</a>   
    * <a href="#SVMFINAL">Final SVM Model on Validation Dataset</a>
    * <a href="#SVMFINAL_Test">Final SVM Model on Additional Test Dataset</a> 
* <a href="#MODELADV">Model Advantages</a>
* <a href="#INTVECT">Interpret Support Vector</a>
* <a href="#ECPWORK">Exceptionnal Work</a>

<a id="DataPrep"></a>
### Getting Dataset Ready for Model Building

In [16]:
# Importing the needed modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', FutureWarning)

# To display plots inside the iPython Notebook itself
%matplotlib inline

In [17]:
# To verify how data is orgainzed in file(to find the delimiter) and then
# use corresponding function to open the file. eg
# data could be in .csv. .tsv, excel format etc.
pathOfDataFile = "data/bank-full.csv"
firstFewLines = list()
noOfLinesToView = 5

with open(pathOfDataFile) as dataFile:
    firstFewLines = [next(dataFile) for i in range(noOfLinesToView)]
    for line in firstFewLines:
        print(line)

"age";"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"

58;"management";"married";"tertiary";"no";2143;"yes";"no";"unknown";5;"may";261;1;-1;0;"unknown";"no"

44;"technician";"single";"secondary";"no";29;"yes";"no";"unknown";5;"may";151;1;-1;0;"unknown";"no"

33;"entrepreneur";"married";"secondary";"no";2;"yes";"yes";"unknown";5;"may";76;1;-1;0;"unknown";"no"

47;"blue-collar";"married";"unknown";"no";1506;"yes";"no";"unknown";5;"may";92;1;-1;0;"unknown";"no"



In [18]:
# Import the semi-colon delimited data file into pandas dataFrame
bankPromo_df = pd.read_csv(pathOfDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromo_df = bankPromo_df.rename(columns={"y":"Subscribed"})

bankPromo_df.head(7)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Subscribed
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
5,35,management,married,tertiary,no,231,yes,no,unknown,5,may,139,1,-1,0,unknown,no
6,28,management,single,tertiary,no,447,yes,yes,unknown,5,may,217,1,-1,0,unknown,no


In [19]:
bankPromo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age           45211 non-null int64
job           45211 non-null object
marital       45211 non-null object
education     45211 non-null object
default       45211 non-null object
balance       45211 non-null int64
housing       45211 non-null object
loan          45211 non-null object
contact       45211 non-null object
day           45211 non-null int64
month         45211 non-null object
duration      45211 non-null int64
campaign      45211 non-null int64
pdays         45211 non-null int64
previous      45211 non-null int64
poutcome      45211 non-null object
Subscribed    45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [20]:
# Get the unique values(Levels) for categorical variables.
# List to hold names of categorical variables
categoricalVars = list()
# List to hold names of numerical variables
numericalVars = list()

for colName in bankPromo_df.columns:
    if bankPromo_df[colName].dtype == np.int64:
        numericalVars.append(colName)
    elif bankPromo_df[colName].dtype == np.object:
        categoricalVars.append(colName)
    else:
        pass
    
# Remove Target column from final categorical Var list
categoricalVars.remove('Subscribed')

print(numericalVars)
print(categoricalVars)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']


________________________________________________________________________________________________________
<a id="onehotencode"></a>
<a href="#top">Back to Top</a>
### Perform One Hot Encoding for categorical variables in dataset

In [21]:
# Make a copy of original data frame
bankPromoModel_Df = bankPromo_df.copy()
bankPromoModel_Df['Target'] = bankPromoModel_Df['Subscribed'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoModel_Df['Target'] = bankPromoModel_Df['Target'].astype(np.int)
# Delete the original 'Subscribed' column
del bankPromoModel_Df['Subscribed']





In [22]:
# Drop the pDays feature as it had high correlation with "previous" feature
del bankPromoModel_Df['pdays']

In [23]:
# Covert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoModel_Df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoModel_Df = pd.concat((bankPromoModel_Df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoModel_Df.drop(categoricalVars, inplace=True, axis=1)
bankPromoModel_Df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 42 columns):
age                    45211 non-null int64
balance                45211 non-null int64
day                    45211 non-null int64
duration               45211 non-null int64
campaign               45211 non-null int64
previous               45211 non-null int64
Target                 45211 non-null int64
job_blue-collar        45211 non-null uint8
job_entrepreneur       45211 non-null uint8
job_housemaid          45211 non-null uint8
job_management         45211 non-null uint8
job_retired            45211 non-null uint8
job_self-employed      45211 non-null uint8
job_services           45211 non-null uint8
job_student            45211 non-null uint8
job_technician         45211 non-null uint8
job_unemployed         45211 non-null uint8
job_unknown            45211 non-null uint8
marital_married        45211 non-null uint8
marital_single         45211 non-null uint8
education_s

________________________________________________________________________________________________________
________________________________________________________________________________________________________
<a id="Perform8020split"></a>
<a href="#top">Back to Top</a>
### Create 10 Splits Stratified Cross Validation Object

In [24]:
# Training and Test Split
from sklearn.model_selection import StratifiedShuffleSplit

if 'Target' in bankPromoModel_Df:
    y = bankPromoModel_Df['Target'].values # get the labels we want
    del bankPromoModel_Df['Target']        # get rid of the class label
    X = bankPromoModel_Df.values           # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 10
stratified_cv_object = StratifiedShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2, random_state=999)
                         
print(stratified_cv_object)


StratifiedShuffleSplit(n_splits=10, random_state=999, test_size=0.2,
            train_size=None)


In [25]:
# Training and Test Split
from sklearn.model_selection import StratifiedKFold

if 'Target' in bankPromoModel_Df:
    y = bankPromoModel_Df['Target'].values # get the labels we want
    del bankPromoModel_Df['Target']        # get rid of the class label
    X = bankPromoModel_Df.values           # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 10
stratifiedKfold_cv_object = StratifiedKFold(n_splits=num_cv_iterations, random_state=999)
                         
print(stratifiedKfold_cv_object)


StratifiedKFold(n_splits=10, random_state=999, shuffle=False)


________________________________________________________________________________________________________
<a id="PrepTestData"></a>
<a href="#top">Back to Top</a>
### Getting ready Additional Test Dataset(with 10% instances) for final model fitting and evaluations 

In [26]:
pathOfAdditionalDataFile = "data/bank.csv"

# Import the semi-colon delimited data file into pandas dataFrame
bankPromoAdditional_df = pd.read_csv(pathOfAdditionalDataFile, sep = ";")

# Rename the Target/Final Outcome column from "y" to "Subscribed" as based on data description.
bankPromoAdditional_df = bankPromoAdditional_df.rename(columns={"y":"Subscribed"})

bankPromoAdditional_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
age           4521 non-null int64
job           4521 non-null object
marital       4521 non-null object
education     4521 non-null object
default       4521 non-null object
balance       4521 non-null int64
housing       4521 non-null object
loan          4521 non-null object
contact       4521 non-null object
day           4521 non-null int64
month         4521 non-null object
duration      4521 non-null int64
campaign      4521 non-null int64
pdays         4521 non-null int64
previous      4521 non-null int64
poutcome      4521 non-null object
Subscribed    4521 non-null object
dtypes: int64(7), object(10)
memory usage: 600.5+ KB


In [27]:
bankPromoAdditional_df['Target'] = bankPromoAdditional_df['Subscribed'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoAdditional_df['Target'] = bankPromoAdditional_df['Target'].astype(np.int)
# Delete the original 'Subscribed' column
del bankPromoAdditional_df['Subscribed']

In [28]:
# Remove pDays
del bankPromoAdditional_df['pdays']

In [29]:
# Covert all categorical variables to corresponding indicator variables
for categoricalVar in categoricalVars:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoAdditional_df[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoAdditional_df = pd.concat((bankPromoAdditional_df, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoAdditional_df.drop(categoricalVars, inplace=True, axis=1)

if 'Target' in bankPromoAdditional_df:
    y_Final = bankPromoAdditional_df['Target'].values # get the labels we want
    del bankPromoAdditional_df['Target']        # get rid of the class label
    X_Final = bankPromoAdditional_df.values

bankPromoAdditional_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 41 columns):
age                    4521 non-null int64
balance                4521 non-null int64
day                    4521 non-null int64
duration               4521 non-null int64
campaign               4521 non-null int64
previous               4521 non-null int64
job_blue-collar        4521 non-null uint8
job_entrepreneur       4521 non-null uint8
job_housemaid          4521 non-null uint8
job_management         4521 non-null uint8
job_retired            4521 non-null uint8
job_self-employed      4521 non-null uint8
job_services           4521 non-null uint8
job_student            4521 non-null uint8
job_technician         4521 non-null uint8
job_unemployed         4521 non-null uint8
job_unknown            4521 non-null uint8
marital_married        4521 non-null uint8
marital_single         4521 non-null uint8
education_secondary    4521 non-null uint8
education_tertiary     4521 non-n

________________________________________________________________________________________________________
<a id="CreateLRModel"></a>
<a href="#top">Back to Top</a>
# Create Model


________________________________________________________________________________________________________
<a id="SVMModel"></a>
<a href="#top">Back to Top</a>
### Simple SVM Model Fit

In [30]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


scoring = {'F1_Score': 'f1', 'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score), 'Precision': 'precision', \
          'Recall': 'recall'}

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run SVM model

svmModel = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', degree=3 , gamma='auto', random_state=999))
scores = cross_validate(svmModel, X, y=y, cv=stratified_cv_object, n_jobs=-1, scoring=scoring)

print()
display(pd.DataFrame(scores))

scores = cross_validate(svmModel, X, y=y, cv=stratifiedKfold_cv_object, n_jobs=-1, scoring=scoring)
display(pd.DataFrame(scores))

Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,69.155095,34.151568,0.442489,0.536374,0.905848,0.943094,0.901913,0.917358,0.660413,0.780235,0.332703,0.40865
1,68.047714,34.066402,0.409762,0.533934,0.908474,0.942348,0.898374,0.917026,0.639279,0.778533,0.301512,0.406287
2,71.070259,34.972438,0.460468,0.522341,0.91084,0.942594,0.905673,0.916058,0.695985,0.781176,0.344045,0.392342
3,68.945323,36.13771,0.442467,0.534931,0.903748,0.942832,0.903019,0.917358,0.675728,0.782787,0.328922,0.406287
4,68.218162,35.670571,0.447059,0.526694,0.905101,0.943542,0.90125,0.916169,0.648115,0.775632,0.34121,0.398724
5,72.355329,35.120178,0.44345,0.525689,0.911227,0.940267,0.903682,0.91628,0.684418,0.779378,0.327977,0.396597
6,68.841946,35.751309,0.434286,0.535808,0.900787,0.944734,0.901471,0.917026,0.661509,0.775291,0.323251,0.409359
7,68.889043,34.649703,0.431423,0.527187,0.909024,0.941691,0.902355,0.916335,0.676768,0.777778,0.316635,0.398724
8,41.310634,24.69216,0.458831,0.529183,0.901194,0.943467,0.904788,0.916363,0.684803,0.77484,0.344991,0.401796
9,45.715505,25.757265,0.434069,0.529936,0.901098,0.942322,0.900807,0.91686,0.652751,0.782548,0.325142,0.400615


Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,82.682091,19.503515,0.026119,0.540348,0.88924,0.944899,0.884564,0.917545,1.0,0.776684,0.013233,0.414286
1,102.079305,19.230631,0.007308,0.546727,0.370369,0.942571,0.81977,0.918823,0.010274,0.788287,0.005671,0.418487
2,89.725657,20.18996,0.05169,0.559631,0.455206,0.945343,0.788985,0.920324,0.054507,0.791699,0.049149,0.432773
3,84.717636,19.972225,0.050304,0.56984,0.393581,0.947252,0.757797,0.922045,0.046474,0.803749,0.05482,0.441387
4,91.924017,19.718217,0.144304,0.555252,0.580249,0.942971,0.850476,0.920079,0.218391,0.795455,0.10775,0.426471
5,87.035394,20.133172,0.197415,0.559253,0.503552,0.94469,0.848927,0.919931,0.26087,0.785334,0.15879,0.434244
6,86.846771,20.512317,0.166983,0.553717,0.575662,0.946234,0.805795,0.919145,0.167619,0.781394,0.166352,0.428782
7,86.29685,17.631637,0.032051,0.618125,0.208879,0.955003,0.398806,0.92782,0.019746,0.810986,0.085066,0.49937
8,62.521152,13.787464,0.260456,0.543772,0.609869,0.942091,0.827914,0.918801,0.26195,0.793312,0.258979,0.413655
9,45.567088,11.538848,0.292779,0.660222,0.723169,0.954828,0.525442,0.935219,0.177246,0.854521,0.840909,0.537912


In [31]:
# For class balance

svmModel = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', degree=3 , gamma='auto',class_weight="balanced", random_state=999))

scores = cross_validate(svmModel, X, y=y, cv=stratified_cv_object, n_jobs=-1, scoring=scoring)

display(pd.DataFrame(scores))

scores = cross_validate(svmModel, X, y=y, cv=stratifiedKfold_cv_object, n_jobs=-1, scoring=scoring)
display(pd.DataFrame(scores))

Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,139.271351,57.901207,0.565231,0.618541,0.917354,0.951077,0.842641,0.865295,0.417607,0.462475,0.874291,0.933585
1,126.17606,58.790069,0.574226,0.61554,0.918856,0.950382,0.850934,0.86347,0.431214,0.458957,0.859168,0.934294
2,125.51022,58.099394,0.56345,0.614334,0.923069,0.94996,0.84441,0.86253,0.4194,0.457222,0.858223,0.935949
3,140.566196,56.779839,0.569612,0.616824,0.922687,0.949735,0.850271,0.864106,0.429119,0.460214,0.846881,0.935004
4,133.407483,59.049373,0.557196,0.617228,0.916606,0.950806,0.840761,0.863996,0.412944,0.460093,0.856333,0.937367
5,140.112008,58.83401,0.575558,0.616045,0.926412,0.949458,0.850603,0.863968,0.431059,0.459863,0.865784,0.932876
6,139.358935,55.750773,0.565765,0.621253,0.912692,0.951827,0.847396,0.866567,0.424057,0.465045,0.849716,0.935476
7,125.869808,57.201792,0.569221,0.616609,0.922874,0.949332,0.845848,0.864189,0.422865,0.460319,0.87051,0.933585
8,84.493445,38.988364,0.567192,0.617201,0.918168,0.950508,0.84828,0.863775,0.425663,0.459722,0.849716,0.938785
9,76.965573,39.113493,0.561512,0.619668,0.919971,0.950812,0.843525,0.865904,0.417704,0.463678,0.856333,0.933822


Unnamed: 0,fit_time,score_time,test_F1_Score,train_F1_Score,test_AUC,train_AUC,test_Accuracy,train_Accuracy,test_Precision,train_Precision,test_Recall,train_Recall
0,157.591276,33.42626,0.178082,0.616786,0.946716,0.951429,0.893852,0.86355,0.945455,0.459293,0.098299,0.938655
1,161.13128,33.369392,0.307592,0.619467,0.712601,0.951644,0.766033,0.866057,0.235235,0.46392,0.444234,0.931933
2,171.182007,31.114848,0.113909,0.625308,0.390318,0.953338,0.673081,0.869378,0.083406,0.470557,0.179584,0.931723
3,152.998072,32.387388,0.262554,0.624333,0.608382,0.954546,0.587481,0.86842,0.166,0.46871,0.627599,0.934664
4,157.229172,31.904568,0.297,0.621554,0.701712,0.953522,0.689007,0.866405,0.201903,0.464806,0.561437,0.937815
5,172.406284,31.522299,0.254057,0.623123,0.605245,0.952728,0.644105,0.867412,0.168305,0.466771,0.517958,0.936975
6,173.362316,31.588564,0.245283,0.622228,0.628616,0.9528,0.575315,0.866429,0.154839,0.464942,0.589792,0.940336
7,135.526077,27.819915,0.109536,0.649156,0.256386,0.960526,0.291528,0.880511,0.064211,0.494394,0.372401,0.944958
8,121.960758,22.050131,0.380363,0.6138,0.792081,0.951385,0.690113,0.862718,0.248268,0.45744,0.812854,0.932563
9,91.075434,19.310992,0.266058,0.686054,0.68505,0.963673,0.395796,0.899978,0.155027,0.542119,0.9375,0.934047


________________________________________________________________________________________________________
<a id="SVMRBF"></a>
<a href="#top">Back to Top</a>
### Tuning The Model Hyper Parameters for SVM Using Grid Search


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

param_grid = {
     'svc__kernel' : ['poly', 'rbf'],
    'svc__C' : np.logspace(-10, 2, 5),
    'svc__degree' : [1,2,3],
    'svc__gamma': np.logspace(-9, 3, 5)}


# Create grid search object

grid = GridSearchCV(make_pipeline(StandardScaler(), SVC(class_weight='balanced', random_state=999)), \
                   param_grid = param_grid, cv = stratified_cv_object, \
                   verbose=False, n_jobs=-1, scoring=scoring, refit='F1_Score', \
                   return_train_score=True)

grid.fit(X, y=y)


print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Note: Grid search for finding optimal parameters for this SVM model took long time and we lost the output of the cell during merge process in github. Hence, we are are adding a snapshot of iteration result that we used further to build our better model.

![alt text](https://github.com/nhendersonkuns/awesomedataminers/raw/master/ReferenceMaterial/RBFGridSearchResults.PNG "RBF GRid Search Results")

________________________________________________________________________________________________________
<a id="SVMPOLY"></a>
<a href="#top">Back to Top</a>
### Tuning The Model Hyper Parameters for SVM Using Grid Search
### For poly kernel


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score

# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

param_grid = {
     'svc__kernel' : ['poly'],
    'svc__C' : [1, 5, 10],
    'svc__degree' : [3],
    'svc__class_weight' : ['balanced'],
    'svc__gamma': [0.01, 1,'auto']}


scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Create grid search object

clf = GridSearchCV(make_pipeline(StandardScaler(), SVC(random_state=999)), \
                   param_grid = param_grid, cv = 3, verbose=False, n_jobs=-1, scoring=scoring, refit='AUC', \
                   return_train_score=True)

# Fit on data

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    clf.fit(X_train,y_train)  # train object
    y_hat = clf.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)
    print("Best Estimator Model Parameters\n", clf.best_params_)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

________________________________________________________________________________________________________
<a id="SVMFINAL"></a>
<a href="#top">Back to Top</a>
### Final SVM Model after GridSearch on Validation Dataset

In [None]:
from sklearn.svm import SVC
# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run SVM model

svmModel = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', degree=3 , gamma=0.01, class_weight = 'balanced', random_state=999))

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    
    
    # train the reusable logisitc regression model on the training data
    svmModel.fit(X_train, y_train)  # train object
    y_hat = svmModel.predict(X_test) # get test set precitions

    # now let's get the accuracy,precision,recall,auc,F1 and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
    svm_model = svmModel.named_steps['svc']
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)
    #print("Best Estimator Model Parameters\n", svm_model.best_params_)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

- **As we can see from above performance metrics this model has performed better as compared to previous model in that the overall average AuC has improved from 0.65 to 0.84. F1 score improved from 0.44 to 0.56. Also, recall has improved significantly from 0.35 to 0.87. That said, overall accuracy decreased a little from 0.90 to 0.84 and precision drop by about 0.20. But since, overall AuC and F1 improved, by changing Prior probabilities, a balance between pricision and recall could be achieved. Moreover, in reality probabilty of people subscribing to a term deposit is not 50-50. Also, increased false positive rate means that bank would be making some extra calls that would not yield positive result but since recall is high, model is less likely to miss people who actually want to subscribe to bank deposit.**

- Even though Grid Search provided the optimum value, it took very long time to search the grid and provide optimum value. It took approx 4 hours for one iteration.

________________________________________________________________________________________________________
<a id="SVMFINAL_Test"></a>
<a href="#top">Back to Top</a>
### Final SVM Model after GridSearch on Additional Test Dataset

In [None]:
# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run logistic regression model


svmModelAdd = make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf', degree=3 , gamma=0.01, class_weight = 'balanced', random_state=999))

# Fit the whole training dataset now, since validation would be done on additional dataset
svmModelAdd.fit(X,y) 
y_hat = svmModelAdd.predict(X_Final) # get test set precitions

# now let's get the accuracy,precision,recall,auc,F1 and confusion matrix for this iterations of training/testing
tmpDict = dict()
tmpDict['Accuracy'] = mt.accuracy_score(y_Final,y_hat)
tmpDict['AuC'] = mt.roc_auc_score(y_Final,y_hat)
tmpDict['F1 Score'] = mt.f1_score(y_Final,y_hat)
tmpDict['Precision'] = mt.precision_score(y_Final,y_hat)
tmpDict['Recall'] = mt.recall_score(y_Final,y_hat)
    
modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

conf = mt.confusion_matrix(y_Final,y_hat)
  
print("confusion matrix\n",conf)
    
print("\n Model Performnace Metrices")
display(modelPerformanceMetrices)


**In conclusion, the results obtained for classification performance metrices(AuC, accuracy, F1 score etc) on the additional test dataset are very close to those obtained on cross-validation dataset. And final model is performing significantly better(in terms of overall AuC values) than the initial svm model the one without hyperparameters tuning.** 

________________________________________________________________________________________________________
<a id="MODELADV"></a>
<a href="#top">Back to Top</a>
# Model Advantages

In SVM, finding the optimum hyper parameters took very long time. For poly and rbf kernels, it took hours to find the best parameter. 

Once the best parameters were found for each kernels, then individual models ran fast.

Both Support Vector Machine (SVM) and Logistic Regression (LR) try to classify a binary response or maximizing the probability of classifying a response variable.  In these models the team is trying to classify a response variable of subscribing to a long-term deposit.

The SVM model attempts to find the maximum margin in the dimensional space.  This space is defined by the number of features that classify data points.  Points that fall on either side of this plane are classified into one of the binary responses the model is predicting.  To accurately define these margins the model uses support vectors that define the boundaries or the max margin.  These support vectors are points that lie the closest to the calculated boundary of the points.  These support vectors also define the position of the optimized plane.

![alt text](https://github.com/nhendersonkuns/awesomedataminers/raw/master/ReferenceMaterial/SVM_optimal_plane.PNG "SVM Planes")

The LR model will take the output of you model and give you a response that is between 0 and 1 using the logistic function.  In the case that the output is higher or lower than your threshold the value will be give a 1 or 0 respectively.  Within LR the method is to maximize the likelihood that a random data point is classified correctly (maximum likelihood estimation, MLE).  The LR allows the model to optimize this function by use of algorithms such as Newton’s method, conjugate gradients, modifications of Newton’s method using box constraints.  Learning and applying the different methods allow for a more accurate model.

The difference between SVM and LR in terms of the Loss function is that SVM will minimize hinge loss while LR minimizes logistic loss.  What this leads to is logistic loss (LR) diverges faster than hinge loss (SVM).  The LR model is more sensitive to outliers as it tries to find the plane for classifying the points, where as the SVM model is not as sensitive to these outliers.

![alt text](https://github.com/nhendersonkuns/awesomedataminers/raw/master/ReferenceMaterial/Outliers_SVM_vs_LR.PNG "SVM vs LR Outliers")

Within the LR model your values are predicted probabilities between 0 and 1, which at this point you must decide your cut-off to give you the binary response.  The SVM model does produce a final binary response of 0 or 1.

LR will perform better on smaller data sets and SVM performs better with larger data sets.  In terms of small data sets SVM as support vectors may not be a true/good representation of the decision boundaries.  Deciding the performance between SVM and LR, is relative and depends on several factors such as data set size and domain knowledge of the data.  In most cases a simple model (LR) should be attempted first to determine if the output is of desired accuracy.  In tradition LR it is found that the models may fit the training data set to well and may result in a model that is unable to make reasonably good predictions for unknown data points which is referred to as overfitting.  SVM attempts to minimize the classification error on the training set and minimize the complexity of the model.  Over all between the two methods SVM can compute more complex decision boundaries.

**In the case of the bank data set, the team believes that the SVM should provide better results due to the size of the data sets and several outlier points that were discovered during data discovery.   Ultimately, the SVM model performed just as well as the logistic model in our case.  However, the major difference between the two models was the time for the SVM to run and the SVM model's benefit did not outweigh the time that was spent for it to run.**


________________________________________________________________________________________________________
<a id="INTVECT"></a>
<a href="#top">Back to Top</a>
# Interpret Support Vector

In [None]:
svm_model = svmModel.named_steps['svc']

In [None]:
# look at the support vectors
print(svm_model.support_vectors_.shape)
print(svm_model.support_.shape)
print(svm_model.n_support_ )

There are in total 13577 support vectors.

The total number of different support vectors are 13577.The total number postive support vecotrs for those that subscribed are 
11695.The total number of negative support vectors for those that did not subscribe are 1822.

In [None]:
# Now let's do some different analysis with the SVM and look at the instances that were chosen as support vectors

# now lets look at the support for the vectors and see if we they are indicative of anything
# grabe the rows that were selected as support vectors (these are usually instances that are hard to classify)

# make a dataframe of the training data
df_tested_on = bankPromoModel_Df.iloc[train_indices] # saved from above, the indices chosen for training
# now get the support vectors from the trained model
df_support = df_tested_on.iloc[svm_model.support_,:]

df_support['Target'] = y[svm_model.support_] # add back in the 'Survived' Column to the pandas dataframe
bankPromoModel_Df['Target'] = y # also add it back in for the original data
df_support.info()

In [None]:
# now lets see the statistics of these attributes
#from pandas.tools.plotting import boxplot

# group the original data and the support vectors
df_grouped_support = df_support.groupby(['Target'])
df_grouped = bankPromoModel_Df.groupby(['Target'])

# plot KDE of Different variables
vars_to_plot = ['age', 'balance', 'duration','previous']

for v in vars_to_plot:
    plt.figure(figsize=(10,4))
    # plot support vector stats
    plt.subplot(1,2,1)
    ax = df_grouped_support[v].plot.kde() 
    plt.legend(['Not Subscribed','Subscribed'])
    plt.title(v+' (Instances chosen as Support Vectors)')
    
    # plot original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde() 
    plt.legend(['Not Subscribed','Subscribed'])
    plt.title(v+' (Original)')

Based on the visual examination of the plots, the original data and support vectors look very close. Actually the separation is lot greater in the original data than what is shown in the plots. The reason is that the support vector instances are the data points that are only on the edge of the class boundary and classified incorrectly.

________________________________________________________________________________________________________
<a id="ECPWORK"></a>
<a href="#top">Back to Top</a>
# Exceptional Work
## Running Polynomial Logistic Regression 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run logistic regression model

logisticModel = make_pipeline(StandardScaler(), PolynomialFeatures(degree=2), LogisticRegression(penalty='l2', C=1.0, class_weight=None, random_state=999))

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    logisticModel.fit(X_train,y_train)  # train object
    y_hat = logisticModel.predict(X_test) # get test set precitions

    # now let's get the accuracy,precision,recall,auc,F1 and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

As seen from output above, we see that polynomial model(having higher complexity than earlier model) is having much better AuC, Precision and recall scores and at higher accuracy, even without hyper parameters tuning. This points to the fact that our earlier model was underfitting and we need to add complexity to it to achieve better results.
Next we would perform grid search to tune hyper parameters of this polynomial logistic regression model.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.simplefilter('ignore', UserWarning)

# Initialize an Empty Dataframe to store Model performance Stats
modelPerformanceMetrices = pd.DataFrame(columns=['Accuracy', 'AuC', 'F1 Score', 'Precision', 'Recall'])


param_grid = {
     'logisticregression__penalty' : ['l1', 'l2'],
    'logisticregression__C' : np.logspace(-4, 4, 50),
    'logisticregression__solver' : ['liblinear'],
    'logisticregression__class_weight' : [None, 'balanced']}


# Create grid search object
# Trying to find params that lead to maximum F1 Score
clf = GridSearchCV(make_pipeline(StandardScaler(), PolynomialFeatures(degree=2), LogisticRegression(random_state=999)), \
                   param_grid = param_grid, cv = 5, verbose=False, n_jobs=-1, scoring='f1')

# Fit on data

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    clf.fit(X_train,y_train)  # train object
    y_hat = clf.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    tmpDict = dict()
    tmpDict['Accuracy'] = mt.accuracy_score(y_test,y_hat)
    tmpDict['AuC'] = mt.roc_auc_score(y_test,y_hat)
    tmpDict['F1 Score'] = mt.f1_score(y_test,y_hat)
    tmpDict['Precision'] = mt.precision_score(y_test,y_hat)
    tmpDict['Recall'] = mt.recall_score(y_test,y_hat)
    
    modelPerformanceMetrices = modelPerformanceMetrices.append(tmpDict, ignore_index=True)

    conf = mt.confusion_matrix(y_test,y_hat)
  
    print("====Iteration",iter_num," ====")
    print("confusion matrix\n",conf)
    print("Best Estimator Model Parameters\n", clf.best_params_)

display(modelPerformanceMetrices)
print("Average Model Performnace Metrices ")
display(modelPerformanceMetrices.mean())

### Running Recursive Feature Elimination to Reduce Curse of Dimensionality

Run the RFE in CV mode, in 5 Fold CV there would be 5 passes and in each pass features that are not that important to classification task are listed as output.
Remove a feature from the dataset, if the feature is marked for elimination by RFE in 2 passes out of 5.
Recreate the dataset with only non-eliminated features and fit the model again on this dataset.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt
from sklearn.exceptions import DataConversionWarning
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

featureSelectionArr = np.full((len(bankPromoModel_Df.columns)), True)

# Standardize the features first, since standardizing the features could lead to
# gradient desent algo to converge faster and then run logistic regression model
scl_obj = StandardScaler()
lr_clf = LogisticRegression(penalty='l2', C=1.0, class_weight=None)

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
    # Standardize the train dataset and then apply the transform the test data. 
    scl_obj.fit(X_train)
    X_train_scaled = scl_obj.transform(X_train)
    X_test_scaled = scl_obj.transform(X_test)
    
    # train the reusable logisitc regression model on the training data
#    lr_clf.fit(X_train_scaled,y_train)  # train object

    rfe = RFE(lr_clf, 25)
    fit = rfe.fit(X_train_scaled,y_train)  # train object

    featureSelectionArr = np.vstack((featureSelectionArr, fit.support_ ))
    


In [None]:
# remove the first row in feature array as it was dummy row.
featureSelectionArr = featureSelectionArr[1: , :]

In [None]:
# Keep only those feature that appear important in 3 out of 5 passes.
bankPromoModel_Df.columns[np.sum(featureSelectionArr, axis = 0) >= 3]

In [None]:
# Make a copy of original data frame
bankPromoModel_Df_1 = bankPromo_df.copy()
bankPromoModel_Df_1['Target'] = bankPromoModel_Df_1['Subscribed'].apply(lambda resp : 1 if resp == "yes" else 0)
bankPromoModel_Df_1['Target'] = bankPromoModel_Df_1['Target'].astype(np.int)
# Delete the original 'Subscribed' column
del bankPromoModel_Df_1['Subscribed']

In [None]:
# Covert all categorical variables to corresponding indicator variables
categoricalVarsToBeUsed = ['job',
 'marital',
 'education',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

for categoricalVar in categoricalVarsToBeUsed:
    tmpDf = pd.DataFrame()
    # Remove 1st class level to avoid multicollinearity
    tmpDf = pd.get_dummies(bankPromoModel_Df_1[categoricalVar], prefix=categoricalVar, drop_first=True)
    bankPromoModel_Df_1 = pd.concat((bankPromoModel_Df_1, tmpDf), axis=1)

# Now remove the original categorical vars since indicator variables are created from them.
bankPromoModel_Df_1.drop(categoricalVars, inplace=True, axis=1)

# Now remove non useful numerical variables
bankPromoModel_Df_1.drop(['age', 'balance', 'pdays', 'previous'], inplace=True, axis=1)
bankPromoModel_Df_1.info()

In [None]:
# Training and Test Split
from sklearn.model_selection import ShuffleSplit

if 'Target' in bankPromoModel_Df_1:
    y = bankPromoModel_Df_1['Target'].values # get the labels we want
    del bankPromoModel_Df_1['Target']        # get rid of the class label
    X = bankPromoModel_Df_1.values           # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# To use the cross validation object in scikit learn, we need to grab an instance
# of the object and set it up. This object will be able to split our data into 
# training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

In [None]:
# Fitting the model on reduced dataset
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

degrees = np.arange(1, 2)

model = make_pipeline(StandardScaler(),LogisticRegression(penalty='l2', C=1.0, class_weight=None))

for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(X,y)):
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
       
    # train the reusable logisitc regression model on the training data
    model.fit(X_train,y_train)  # train object
    y_hat = model.predict(X_test) # get test set precitions

    # now let's get the accuracy and confusion matrix for this iterations of training/testing
    acc = mt.accuracy_score(y_test,y_hat)
    pre = mt.precision_score(y_test,y_hat)
    rec = mt.recall_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    auc = mt.roc_auc_score(y_test,y_hat)
    print("====Iteration",iter_num," ====")
    print("AuC", auc )
    print("accuracy", acc )
    print("precision", pre)
    print("recall", rec)
    print("confusion matrix\n",conf)

As we can see from above output that even after performing recursive feature elimination(RFE), our model performance metrics are not very different from the that of initial simple logistic regression model. Hence, we would be keeping all features in model and rely upon regularization to prevent our model from overfitting. 