In [1]:
! pip install -U plotly

Collecting plotly
  Downloading plotly-2.0.8.tar.gz (964kB)
[K    100% |████████████████████████████████| 972kB 868kB/s ta 0:00:01    56% |██████████████████              | 542kB 1.4MB/s eta 0:00:01
[?25hCollecting decorator (from plotly)
  Downloading decorator-4.0.11-py2.py3-none-any.whl
Collecting nbformat>=4.2 (from plotly)
  Downloading nbformat-4.3.0-py2.py3-none-any.whl (154kB)
[K    100% |████████████████████████████████| 163kB 1.6MB/s ta 0:00:01
[?25hCollecting pytz (from plotly)
  Downloading pytz-2017.2-py2.py3-none-any.whl (484kB)
[K    100% |████████████████████████████████| 491kB 1.1MB/s ta 0:00:01
[?25hCollecting requests (from plotly)
  Downloading requests-2.16.3-py2.py3-none-any.whl (86kB)
[K    100% |████████████████████████████████| 92kB 750kB/s ta 0:00:011
[?25hRequirement already up-to-date: six in /opt/conda/lib/python3.5/site-packages (from plotly)
Collecting traitlets>=4.1 (from nbformat>=4.2->plotly)
  Downloading traitlets-4.3.2-py2.py3-none-any.whl (

In [2]:
! pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.18.1-cp35-cp35m-manylinux1_x86_64.whl (11.7MB)
[K    100% |████████████████████████████████| 11.7MB 119kB/s ta 0:00:01   43% |██████████████                  | 5.1MB 2.3MB/s eta 0:00:03    62% |███████████████████▉            | 7.2MB 1.5MB/s eta 0:00:04    66% |█████████████████████▏          | 7.7MB 1.5MB/s eta 0:00:03    82% |██████████████████████████▎     | 9.6MB 373kB/s eta 0:00:06
[?25hInstalling collected packages: scikit-learn
  Found existing installation: scikit-learn 0.17.1
    Uninstalling scikit-learn-0.17.1:
      Successfully uninstalled scikit-learn-0.17.1
Successfully installed scikit-learn-0.18.1
[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls



## 1) Data Exploration

In [4]:
# Read the HR employee-attrition CSV (expected in the working directory) and preview the first rows.
att_df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
att_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [5]:
att_df.tail()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8
1469,34,No,Travel_Rarely,628,Research & Development,8,3,Medical,1,2068,...,1,80,0,6,3,4,4,3,1,2


In [6]:
att_df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [7]:
# Looking for NaN
att_df.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

### 1.1) Correlation between features

In [8]:
att_df.corr()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,0.010661,-0.001686,0.208034,,-0.010145,0.010146,0.024287,0.02982,0.509604,...,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
DailyRate,0.010661,1.0,-0.004985,-0.016806,,-0.05099,0.018355,0.023381,0.046135,0.002966,...,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
DistanceFromHome,-0.001686,-0.004985,1.0,0.021042,,0.032916,-0.016075,0.031131,0.008783,0.005303,...,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.016806,0.021042,1.0,,0.04207,-0.027128,0.016775,0.042438,0.101589,...,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EmployeeCount,,,,,,,,,,,...,,,,,,,,,,
EmployeeNumber,-0.010145,-0.05099,0.032916,0.04207,,1.0,0.017621,0.035179,-0.006888,-0.018519,...,-0.069861,,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
EnvironmentSatisfaction,0.010146,0.018355,-0.016075,-0.027128,,0.017621,1.0,-0.049857,-0.008278,0.001212,...,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,0.023381,0.031131,0.016775,,0.035179,-0.049857,1.0,0.042861,-0.027853,...,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
JobInvolvement,0.02982,0.046135,0.008783,0.042438,,-0.006888,-0.008278,0.042861,1.0,-0.01263,...,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
JobLevel,0.509604,0.002966,0.005303,0.101589,,-0.018519,0.001212,-0.027853,-0.01263,1.0,...,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281


In [9]:
# Two columns carry no information and are removed before modelling:
#   * EmployeeCount is 1 for every employee
#   * StandardHours is 80 for every employee
# Their distinct values are captured first so that claim can be inspected.
unique_counts = att_df['EmployeeCount'].unique()
unique_hrs = att_df['StandardHours'].unique()
att_df = att_df.drop(['EmployeeCount', 'StandardHours'], axis=1)

In [10]:
# Numeric columns to include in the correlation heatmap.
numerical = [u'Age', u'DailyRate', u'DistanceFromHome', u'Education', u'EmployeeNumber', u'EnvironmentSatisfaction',
       u'HourlyRate', u'JobInvolvement', u'JobLevel', u'JobSatisfaction',
       u'MonthlyIncome', u'MonthlyRate', u'NumCompaniesWorked',
       u'PercentSalaryHike', u'PerformanceRating', u'RelationshipSatisfaction',
       u'StockOptionLevel', u'TotalWorkingYears',
       u'TrainingTimesLastYear', u'WorkLifeBalance', u'YearsAtCompany',
       u'YearsInCurrentRole', u'YearsSinceLastPromotion',
       u'YearsWithCurrManager']

# Pearson correlation, computed once and reused for the colours, the axis labels and the hover text.
corr_matrix = att_df[numerical].astype(float).corr()

data = [
    go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns.values,
        y=corr_matrix.index.values,
        colorscale='Portland',
        reversescale=False,
        # `text` takes an array shaped like `z`; a bare boolean is not valid hover text.
        text=corr_matrix.round(2).values,
        opacity=1.0
    )
]

layout = go.Layout(
    title = 'Correlation Between Num Features ',
    xaxis = dict(ticks='', nticks=36),
    yaxis = dict(ticks='' ),
    width = 900, height = 700,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='heatmap_corr')

###### What the Correlation Matrix Tells Us
When we perform our analysis, we don't want to include features that are highly correlated with each other. We see that most features are not correlated with each other, but a few of them are highly correlated. For instance, years with current manager and years at the company are understandably correlated: the longer you have been with your current manager, the longer you have necessarily been at the company. Likewise, the higher your job level, the higher your monthly income is likely to be. We will have to analyze further which of these features we want to include in our analysis. With all of these features, we can consider applying principal component analysis (PCA) to reduce the redundancy between correlated features.

## 2) Numerical Encoding/Feature Engineering

In [11]:
# Categorical columns are the ones pandas stored with the generic `object` dtype (strings).
# select_dtypes keeps the original column order and replaces the per-column loop over
# DataFrame.iteritems, which newer pandas releases no longer provide.
categorical = att_df.select_dtypes(include=['object']).columns.tolist()

# Every remaining column is numerical. Index.difference returns the names sorted,
# so the numeric columns end up in alphabetical order.
numerical = att_df.columns.difference(categorical)

In [12]:
# Frame of categorical predictors: every object-typed column except the
# target column `Attrition`.
att_cat_df = att_df[categorical].drop(['Attrition'], axis=1)

In [13]:
# One-hot encode the categorical predictors: each distinct value of each column
# becomes its own indicator column named `<column>_<value>`.
att_cat_df = pd.get_dummies(att_cat_df)
att_cat_df.head(3)

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,0,1,1,0,1
1,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1


In [14]:
# Numerical features selected by the `numerical` column index
att_num_df = att_df[numerical]
# Final feature frame: numerical columns first, then the one-hot encoded categorical columns
# (concatenated column-wise, aligned on the row index)
att_fin_df = pd.concat([att_num_df, att_cat_df], axis=1)

##### Preparing the Target

In [15]:
# Numeric encoding of the target label: 'Yes' -> 1, 'No' -> 0
target_map = dict(Yes=1, No=0)
# Look every Attrition label up in the mapping; a label missing from it raises KeyError
target = att_df["Attrition"].apply(target_map.__getitem__)
target.head(3)

0    1
1    0
2    1
Name: Attrition, dtype: int64

In [16]:
# Class balance of the target: number of employees per Attrition label
attrition_counts = att_df["Attrition"].value_counts()
data = [go.Bar(x=attrition_counts.index.values, y=attrition_counts.values)]

py.iplot(data, filename='basic-bar')

By plotting the values of the target variable, we see that most employees in the dataset have no attrition. Thus, we face a class imbalance in our data set. 

## 3) ML Models

In [17]:
import sklearn.model_selection as ms

# Feature matrix as a plain NumPy array. `.values` is used instead of DataFrame.as_matrix(),
# which pandas deprecated and later removed; both return the same array.
X = att_fin_df.values

y = target

# Hold out 20% of the rows as the test set (there is no separate validation split);
# the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y (doing so would change the reported scores).
X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2, random_state=42)

To account for the under-representation of `Yes` (attrition) cases, we use the SMOTE algorithm to oversample the minority class in the training data. For background, see http://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

In [18]:
# Oversample only the training split; the test split is left untouched.
oversampler=SMOTE(random_state=0)
# imbalanced-learn renamed `fit_sample` to `fit_resample`; call whichever this version provides.
resample = getattr(oversampler, 'fit_resample', None) or oversampler.fit_sample
X_smote_train, y_smote_train = resample(X_train, y_train)

## Classifier Evaluations

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

# Results, as a list of dictionaries
classifier_results = []

In [20]:
# Fit and score the single-model baselines.
#
# Each entry is (label, unfitted estimator, extra fields for the result row). Every model is
# trained on the SMOTE-resampled training data and scored by accuracy on the untouched test split.
# NOTE(review): accuracy is reported on an imbalanced test set — compare the scores against the
# majority-class rate before ranking models.

# Decision trees of depth 1..5
baseline_models = [
    ('DecTree', DecisionTreeClassifier(max_depth=depth, random_state=42), {'Depth': float(depth)})
    for depth in range(1, 6)
]

# Logistic regressions (L1 and L2 penalty) and a default-kernel SVC
baseline_models += [
    ('LogReg-L1', LogisticRegression(solver='liblinear', penalty='l1', random_state=42), {}),
    ('LogReg-L2', LogisticRegression(solver='liblinear', penalty='l2', random_state=42), {}),
    ('SVC', SVC(random_state=42), {}),
]

for label, dt_model, extra_fields in baseline_models:
    dt_model.fit(X_smote_train, y_smote_train)
    y_pred_test = dt_model.predict(X_test)
    # Same value as dt_model.score(X_test, y_test), without predicting a second time.
    test_score = accuracy_score(y_test, y_pred_test)

    result = {'Classifier': label, 'Score': test_score}
    result.update(extra_fields)
    classifier_results.append(result)

In [21]:
classifier_results

[{'Classifier': 'DecTree', 'Depth': 1.0, 'Score': 0.70068027210884354},
 {'Classifier': 'DecTree', 'Depth': 2.0, 'Score': 0.70068027210884354},
 {'Classifier': 'DecTree', 'Depth': 3.0, 'Score': 0.85034013605442171},
 {'Classifier': 'DecTree', 'Depth': 4.0, 'Score': 0.81972789115646261},
 {'Classifier': 'DecTree', 'Depth': 5.0, 'Score': 0.82993197278911568},
 {'Classifier': 'LogReg-L1', 'Score': 0.74829931972789121},
 {'Classifier': 'LogReg-L2', 'Score': 0.75170068027210879},
 {'Classifier': 'SVC', 'Score': 0.86734693877551017}]

In [22]:
# NOTE(review): this cell repeats the SVC fit from the previous evaluation cell (here without
# random_state) and appends a second 'SVC' row to classifier_results; every re-run appends another.
dt_model = SVC()
dt_model.fit(X_smote_train, y_smote_train)
y_pred_test = dt_model.predict(X_test)
test_score = dt_model.score(X_test, y_test)
classifier_results.append({'Classifier': 'SVC', 'Score': test_score})

In [23]:
classifier_results

[{'Classifier': 'DecTree', 'Depth': 1.0, 'Score': 0.70068027210884354},
 {'Classifier': 'DecTree', 'Depth': 2.0, 'Score': 0.70068027210884354},
 {'Classifier': 'DecTree', 'Depth': 3.0, 'Score': 0.85034013605442171},
 {'Classifier': 'DecTree', 'Depth': 4.0, 'Score': 0.81972789115646261},
 {'Classifier': 'DecTree', 'Depth': 5.0, 'Score': 0.82993197278911568},
 {'Classifier': 'LogReg-L1', 'Score': 0.74829931972789121},
 {'Classifier': 'LogReg-L2', 'Score': 0.75170068027210879},
 {'Classifier': 'SVC', 'Score': 0.86734693877551017},
 {'Classifier': 'SVC', 'Score': 0.86734693877551017}]

#### Now with ensembles...

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier

classifier_results = []

In [25]:
# Fit and score the ensemble models: a random forest, plus bagging and AdaBoost wrapped around
# four base learners (decision tree, L1/L2 logistic regression, SVC).
#
# The base learner is passed positionally because scikit-learn renamed that keyword
# (`base_estimator` -> `estimator`); it is the first parameter under either name.
n_members = 31       # estimators per ensemble, also recorded as 'Count'
ensemble_seed = 314  # seed of the ensemble itself
base_seed = 42       # seed of each base learner

ensemble_models = [
    ('RandomForest',
     RandomForestClassifier(n_estimators=n_members, random_state=ensemble_seed)),
    ('Bag-DecTree',
     BaggingClassifier(DecisionTreeClassifier(random_state=base_seed),
                       n_estimators=n_members, random_state=ensemble_seed)),
    ('Bag-LogReg-L1',
     BaggingClassifier(LogisticRegression(solver='liblinear', penalty='l1', random_state=base_seed),
                       n_estimators=n_members, random_state=ensemble_seed)),
    ('Bag-LogReg-L2',
     BaggingClassifier(LogisticRegression(solver='liblinear', penalty='l2', random_state=base_seed),
                       n_estimators=n_members, random_state=ensemble_seed)),
    ('Bag-SVM',
     BaggingClassifier(SVC(random_state=base_seed),
                       n_estimators=n_members, random_state=ensemble_seed)),
    ('Boost-DecTree',
     AdaBoostClassifier(DecisionTreeClassifier(random_state=base_seed),
                        n_estimators=n_members, random_state=ensemble_seed)),
    ('Boost-LogReg-L1',
     AdaBoostClassifier(LogisticRegression(solver='liblinear', penalty='l1', random_state=base_seed),
                        n_estimators=n_members, random_state=ensemble_seed)),
    ('Boost-LogReg-L2',
     AdaBoostClassifier(LogisticRegression(solver='liblinear', penalty='l2', random_state=base_seed),
                        n_estimators=n_members, random_state=ensemble_seed)),
    # The SVC base learner is boosted with the discrete SAMME algorithm.
    ('Boost-SVM',
     AdaBoostClassifier(SVC(random_state=base_seed),
                        n_estimators=n_members, algorithm="SAMME", random_state=ensemble_seed)),
]

for label, dt_model in ensemble_models:
    dt_model.fit(X_smote_train, y_smote_train)
    y_pred_test = dt_model.predict(X_test)
    # Same value as dt_model.score(X_test, y_test), without predicting a second time.
    test_score = accuracy_score(y_test, y_pred_test)

    classifier_results.append({'Classifier': label, 'Score': test_score, 'Count': n_members})

In [26]:
classifier_results

[{'Classifier': 'RandomForest', 'Count': 31, 'Score': 0.8571428571428571},
 {'Classifier': 'Bag-DecTree', 'Count': 31, 'Score': 0.84693877551020413},
 {'Classifier': 'Bag-LogReg-L1', 'Count': 31, 'Score': 0.75510204081632648},
 {'Classifier': 'Bag-LogReg-L2', 'Count': 31, 'Score': 0.74829931972789121},
 {'Classifier': 'Bag-SVM', 'Count': 31, 'Score': 0.86734693877551017},
 {'Classifier': 'Boost-DecTree', 'Count': 31, 'Score': 0.78231292517006801},
 {'Classifier': 'Boost-LogReg-L1', 'Count': 31, 'Score': 0.56802721088435371},
 {'Classifier': 'Boost-LogReg-L2', 'Count': 31, 'Score': 0.76190476190476186},
 {'Classifier': 'Boost-SVM', 'Count': 31, 'Score': 0.86734693877551017}]

In [68]:
# Gradient Boosting Parameters
# 'max_features' is listed exactly once: a repeated key in a dict literal silently keeps
# only its last value, so 'sqrt' is the setting that is in effect.
gb_params ={
    'n_estimators': 500,
    'learning_rate' : 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'subsample': 1,
    'max_features' : 'sqrt',
    'random_state' : 0,
    'verbose': 0
}

# Train on the SMOTE-resampled training data, score by accuracy on the untouched test split.
dt_model = GradientBoostingClassifier(**gb_params)

dt_model.fit(X_smote_train, y_smote_train)
y_pred_test = dt_model.predict(X_test)
test_score = dt_model.score(X_test, y_test)

# 'Count' is read from the parameters so the result row cannot drift from the fitted model.
classifier_results.append({'Classifier': 'GradientBoosting', 'Score': test_score, 'Count': gb_params['n_estimators']})

In [87]:
# Larger, depth-limited random forest. random_state is fixed so the score and the feature
# importances read from this model are reproducible across runs.
dt_model = RandomForestClassifier(n_estimators=850, max_depth = 9, min_samples_leaf = 2, max_features = 'sqrt',
                                  random_state=314)

dt_model.fit(X_smote_train, y_smote_train)
y_pred_test = dt_model.predict(X_test)
test_score = dt_model.score(X_test, y_test)

classifier_results.append({'Classifier': 'RandomForest', 'Score': test_score, 'Count': 850})

In [88]:
classifier_results

[{'Classifier': 'RandomForest', 'Count': 31, 'Score': 0.8571428571428571},
 {'Classifier': 'Bag-DecTree', 'Count': 31, 'Score': 0.84693877551020413},
 {'Classifier': 'Bag-LogReg-L1', 'Count': 31, 'Score': 0.75510204081632648},
 {'Classifier': 'Bag-LogReg-L2', 'Count': 31, 'Score': 0.74829931972789121},
 {'Classifier': 'Bag-SVM', 'Count': 31, 'Score': 0.86734693877551017},
 {'Classifier': 'Boost-DecTree', 'Count': 31, 'Score': 0.78231292517006801},
 {'Classifier': 'Boost-LogReg-L1', 'Count': 31, 'Score': 0.56802721088435371},
 {'Classifier': 'Boost-LogReg-L2', 'Count': 31, 'Score': 0.76190476190476186},
 {'Classifier': 'Boost-SVM', 'Count': 31, 'Score': 0.86734693877551017},
 {'Classifier': 'GradientBoosting',
  'Count': 500,
  'Score': 0.89455782312925169},
 {'Classifier': 'RandomForest', 'Count': 850, 'Score': 0.87755102040816324},
 {'Classifier': 'GradientBoosting',
  'Count': 500,
  'Score': 0.87414965986394555},
 {'Classifier': 'GradientBoosting',
  'Count': 500,
  'Score': 0.88095

#### Random Forest Evaluation
We see that of these models, the SVC, Random Forest and GBM are best. We will look further into them.

In [30]:
# Per-feature importances of whichever model `dt_model` currently refers to; the name is
# reassigned by every model-fitting cell, so the result depends on execution order.
ft_impt = dt_model.feature_importances_
ft_impt

array([ 0.01781667,  0.01426587,  0.01460768,  0.01029761,  0.01292805,
        0.017833  ,  0.01274846,  0.02629046,  0.02545416,  0.03679612,
        0.03538948,  0.01574175,  0.01708542,  0.00982275,  0.00247099,
        0.0119931 ,  0.04865817,  0.0201915 ,  0.01035294,  0.01164491,
        0.02014634,  0.02526478,  0.01201371,  0.02286512,  0.00270042,
        0.01977386,  0.01033842,  0.0015615 ,  0.02668065,  0.02409372,
        0.00078308,  0.00844489,  0.00393884,  0.01461028,  0.00120413,
        0.003715  ,  0.0075379 ,  0.00785594,  0.00287847,  0.0018865 ,
        0.01456926,  0.00092297,  0.00319738,  0.00067708,  0.00609039,
        0.01094711,  0.01184301,  0.02020363,  0.03803719,  0.07589142,
        0.        ,  0.11811335,  0.10882456])

In [31]:
# Column names of the combined (numerical + one-hot) feature frame, as a NumPy array.
x = att_fin_df.columns.values
x

array(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Human Resources',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male',
       'JobRole_Healthcare Representative', 'J

The random forest evaluation also returns feature ranking, so we can see which features are most significant. We can plot this as follows:

In [32]:
# Scatter plot of the model's feature importances: one marker per feature,
# with the marker colour encoding the same importance value.
feature_names = att_fin_df.columns.values
importances = dt_model.feature_importances_

marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=13,
    color=importances,
    colorscale='Portland',
    showscale=True
)
trace = go.Scatter(
    x=feature_names,
    y=importances,
    mode='markers',
    marker=marker_style,
    text=feature_names
)
data = [trace]

x_axis = dict(ticklen=5, showgrid=False, zeroline=False, showline=False)
y_axis = dict(title='Feature Importance', showgrid=False, zeroline=False, ticklen=5, gridwidth=2)
layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')

In [89]:
# Table of feature name vs. importance, sorted ascending (least important features first).
d = {
    'Feature': att_fin_df.columns.values,
    'Feature Importance': dt_model.feature_importances_,
}
ft_impts = pd.DataFrame(data=d).sort_values(by='Feature Importance')
ft_impts

Unnamed: 0,Feature,Feature Importance
50,Over18_Y,0.0
43,JobRole_Research Director,0.000739
30,EducationField_Human Resources,0.000746
41,JobRole_Manager,0.001151
34,EducationField_Other,0.001168
27,Department_Human Resources,0.001662
39,JobRole_Human Resources,0.001867
24,BusinessTravel_Non-Travel,0.002344
38,JobRole_Healthcare Representative,0.00266
14,PerformanceRating,0.002689


#### Most important features
We see here that the most important features are whether the employee works overtime (yes or no), the employee's marital status (single, married or divorced), job satisfaction, monthly income, and stock option level.

These results make sense, as **[TODO: complete this explanation]**.

#### SVC Evaluation

In [85]:
# Fit a linear SVM and read its per-feature weights. random_state is fixed, as for the other
# seeded models in this notebook, so the fitted coefficients are repeatable.
dt_model = LinearSVC(random_state=42)
dt_model.fit(X_smote_train, y_smote_train)

# `coef_` is 2-D; its first (here: only row used) row holds one weight per feature.
# The weights are multiplied by 1000 purely for readability.
coefs = dt_model.coef_[0] * 1000

# Magnitudes only: the sign of a weight gives the direction of its effect, not its strength.
impt_coefs = np.abs(coefs).tolist()

# NOTE(review): the features are not standardised before fitting, so the weight magnitudes
# also reflect each column's units — confirm before reading them as importances.

Using these coefficients, we can plot out the feature importances just like we did for the random forest classifier.

In [76]:
# Scatter plot of the linear SVM weights: one marker per feature, with the marker colour
# encoding the same (signed) value.
feature_names = att_fin_df.columns.values

trace = go.Scatter(
    y = coefs,
    x = feature_names,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        color = coefs,
        colorscale='Portland',
        showscale=True
    ),
    text = feature_names
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'SVC Feature Importance',
    hovermode= 'closest',
    xaxis= dict(
        ticklen= 5,
        showgrid=False,
        zeroline=False,
        showline=False
    ),
    yaxis=dict(
        # The plotted values are signed coefficients scaled by 1000, not importances,
        # so the axis is labelled accordingly.
        title= 'Coefficient (x1000)',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')

In [86]:
# Table of feature name vs. absolute coefficient, sorted ascending (smallest magnitudes first).
d = {
    'Feature': att_fin_df.columns.values,
    'Feature Importance': impt_coefs,
}
ft_impts = pd.DataFrame(data=d).sort_values(by='Feature Importance')
ft_impts

Unnamed: 0,Feature,Feature Importance
43,JobRole_Research Director,0.002032
30,EducationField_Human Resources,0.015671
41,JobRole_Manager,0.017565
1,DailyRate,0.022091
11,MonthlyRate,0.04504
39,JobRole_Human Resources,0.052325
27,Department_Human Resources,0.053059
10,MonthlyIncome,0.065166
28,Department_Research & Development,0.073662
5,EnvironmentSatisfaction,0.085945


From the coefficients, we get a sense of which features are significant in the SVC classifier. We see that, as with the random forest classifier, overtime plays a large role in whether or not the employee leaves. Moreover, we see again that marital status is also important. Nonetheless, many features that were not as significant under the random forest classifier are now much more significant under the SVC classifier; these include age, distance from home, job satisfaction, number of companies worked, percent salary hike, performance rating, stock option level, total working years, years with company/current role/current manager, and years since last promotion. 

[explain what the coef #s mean, why they make sense, negative, zero, positive...]

#### Gradient Booster Classifier Evaluation

In [49]:
# Refit the gradient boosting classifier with the shared gb_params (fit returns the
# estimator itself) and display its mean accuracy on the test split.
dt_model = GradientBoostingClassifier(**gb_params).fit(X_smote_train, y_smote_train)
y_pred_test = dt_model.predict(X_test)
test_score = dt_model.score(X_test, y_test)
test_score

0.89455782312925169

In [36]:
# Scatter plot of the model's feature importances: one marker per feature,
# with the marker colour encoding the same importance value.
feature_names = att_fin_df.columns.values
importances = dt_model.feature_importances_

marker_style = dict(
    sizemode='diameter',
    sizeref=1,
    size=13,
    color=importances,
    colorscale='Portland',
    showscale=True
)
trace = go.Scatter(
    x=feature_names,
    y=importances,
    mode='markers',
    marker=marker_style,
    text=feature_names
)
data = [trace]

x_axis = dict(ticklen=5, showgrid=False, zeroline=False, showline=False)
y_axis = dict(title='Feature Importance', showgrid=False, zeroline=False, ticklen=5, gridwidth=2)
layout = go.Layout(
    autosize=True,
    title='Gradient Boosting Model Feature Importance',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatter')

[discuss most important features, why they make sense as important, how they differ from the other two classifier feature rankings...]

In [74]:
# Table of feature name vs. importance, sorted ascending (least important features first).
d = {
    'Feature': att_fin_df.columns.values,
    'Feature Importance': dt_model.feature_importances_,
}
ft_impts = pd.DataFrame(data=d).sort_values(by='Feature Importance')
ft_impts

Unnamed: 0,Feature,Feature Importance
50,Over18_Y,0.0
41,JobRole_Manager,0.000468
43,JobRole_Research Director,0.000498
30,EducationField_Human Resources,0.000692
27,Department_Human Resources,0.000922
14,PerformanceRating,0.001144
39,JobRole_Human Resources,0.001292
38,JobRole_Healthcare Representative,0.001317
42,JobRole_Manufacturing Director,0.001456
34,EducationField_Other,0.001669


### A further look at reducing features for the GBC

In [47]:
# Fit a fresh gradient boosting classifier for the feature-reduction experiment.
clf = GradientBoostingClassifier(**gb_params)
clf.fit(X_smote_train, y_smote_train)
# Predict with the model fitted in this cell (`clf`), not with whatever `dt_model`
# an earlier cell left behind.
y_pred_test = clf.predict(X_test)