# This notebook is set up to run a single data file through developed pipeline that preprocesses, splits, and train/tests data against ML algorithms. The team developed a the tensor flow boosted tree classifier, hyperparameter search on Random Forest, Gradient Boost and Xtreme Gradient Boost and finally Auto Keras. All ML Outputs have a report that shows the ROC curve, confusion matrix, statistical performance metrics and finally the feature importance visuals.



# Data for this notebook is stored in the Client Data directory of VTSD21's Google Drive and can be accessed when that folder is made a shortcut to the user's personal drive

# Authorization and accessing the data from the Client Data Folder


# Security Access and Import Libraries

In [1]:
# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


In [5]:
!pip install -q -U keras-tuner
!pip install autokeras

[?25l[K     |█████▏                          | 10kB 18.9MB/s eta 0:00:01[K     |██████████▍                     | 20kB 23.2MB/s eta 0:00:01[K     |███████████████▋                | 30kB 17.2MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 10.6MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 8.8MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 9.4MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 4.4MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone
Collecting autokeras
[?25l  Downloading https://files.pythonhosted.org/packages/09/12/cf698586ccc8245f08d1843dcafb65b064a2e9e2923b889dc58e1019f099/autokeras-1.0.12-py3-none-any.whl (164kB)
[K     |████████████████████████████████| 174kB 8.6MB/s 
Installing collected packages: autokeras
Successfully installed autokeras-1.0.12


In [6]:
#libraries
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import autokeras as ak
import kerastuner as kt
import matplotlib.pyplot as plt
%matplotlib inline


# Import statements required for Plotly 
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
tf.random.set_seed(123)

# Reading in the Data based on 7 files in directory

In [7]:
colab_path = '/content/drive/MyDrive/Client Data/'

pathDict = {'K_Survey':'surveywithtarget.csv',
            'K_Survey_Org':'surveywithtargetandorg.csv',
            'Q1_2018':'bianual_survey_1_2018_processed.csv',
            'Q3_2018':'bianual_survey_3_2018_processed.csv',
            'Q1_2019':'bianual_survey_1_2019_processed.csv',
            'Q3_2019':'bianual_survey_3_2019_processed.csv',
            'Q1_2020':'bianual_survey_1_2020_processed.csv'}

#consider training on one quarter and then testing on the next quarter

def setFilePath(filename):
  return pathDict[filename]

# Call the path of the file you want to read in

In [8]:
import ipywidgets as widgets

w = widgets.Dropdown(
    options=pathDict.keys(),
    description='File Name:',
    disabled=False,
)
def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("File set to %s" % change['new'])
        return change

w.observe(on_change)
display(w)

Dropdown(description='File Name:', options=('K_Survey', 'K_Survey_Org', 'Q1_2018', 'Q3_2018', 'Q1_2019', 'Q3_2…

File set to Q3_2018


In [9]:
FileName = w.value
path = setFilePath(FileName)

# Code to handle the nuances of each survey in a general way and format them to fit the ML algos below

In [10]:
q_cols = ['general empl satisfaction',
 'recommend empl unit to friend',
 'motivation of direct colleagues you are working with day to day? *Top 3',
 'My direct colleagues are coping well with the change and transformation in our division/function',
 'People in my team frequently go above and beyond the requirements of the job',
 'I am proud working for my company and gladly tell people about it',
 'I believe strongly in and support the future direction of my company',
 'I trust that my company takes action that balances the best interests of our people, business and clients',
 "My company's handling of this year’s challenges leaves me confident in our future success",
 "My company's future direction leaves me confident about my career opportunities here",
 'In my team we continuously use client feedback to improve our products/services',
 'In my team we have an ongoing dialogue about our clients and their requirements and expectations',
 'Compared to last year, our clients now view us as:',
 'My immediate manager creates an atmosphere of openness and trust',
 'I have confidence in my immediate manager',
 'I have confidence in the global senior management of [division]',
 'I have confidence in the senior management of [FUNCTION: Finance, HR, Technology, Operations, Marketing]',
 'I have confidence in my local/country senior management',
 'As a member of [FUNCTION: Finance, HR, Technology, Operations, Marketing] I am familiar with the overall objectives & strategies of my function',
 'I am familiar with the overall objectives & strategies of [division]',
 'I can clearly see how my own work contributes to the overall objectives & strategies of [division]',
 'In my team we work towards clear objectives',
 'In my team we make decisions rapidly when it is necessary',
 'In my team actions are taken quickly when decisions have been made',
 'Please rate the cooperation between different units within [division]',
 'Please rate the cooperation within [FUNCTION: Finance, HR, Technology, Operations, Marketing]',
 'Please rate the cooperation across the company as a whole',
 'In my team new ideas receive very strong support and encouragement',
 'I feel I can make my own decisions concerning my work',
 "I have seen action taken based on the results of last year's survey",
 'In my company diversity of skills, experiences, background and ways of working are recognized and appreciated',
 'My immediate manager treats all team members fairly, regardless of age, gender identity, sex, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'My immediate manager effectively works with people who are different from them to achieve business results',
 'In my team we treat each other fairly, regardless of age, sex, gender identity, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'I feel included in my team',
 'My company takes an interest in my well-being',
 'My work schedule allows me sufficient flexibility to meet my personal/family needs',
 'Career opportunities always go to the most qualified person regardless of age, gender identity, sex, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'Is there anything else you want to share with us?']

In [11]:
surveyFile = colab_path + path
cat = ['object']
num = ['float']
if FileName in 'K_Survey' or FileName in 'K_Survey_Org':
  surveyPd = pd.read_csv(surveyFile,header=None).fillna(0)
  questions = q_cols
  #questions = surveyPd.columns 
  surveyPd.columns = [str(x) for x in list(surveyPd.columns)]
elif 'Q' in FileName:
  surveyPd = pd.read_csv(surveyFile,header=0,index_col=0)
  x1 = surveyPd.select_dtypes(include=cat).fillna('blank').applymap(str)
  x2 = surveyPd.select_dtypes(include=num).fillna(0).applymap(float)
  x3 = surveyPd[surveyPd.columns[-1]]
  surveyPd = pd.concat([x1,x2,x3],axis=1)
  import re 
  #Removing punctuations in string 
  #Using regex 
  questions = [str(y) for y in surveyPd.select_dtypes(include=num).columns] #changing this so the categorical column names are included in the questions var
  surveyPd.columns = [re.sub(r'[^\w\s]', '',s)  for s in list(surveyPd.columns)]
  l1 = [x.replace(" ","") for x in list(surveyPd.select_dtypes(include=cat).columns)]
  l2 = [str(y) for y in list(range(len(surveyPd.select_dtypes(include=num).columns)))]
  l3 = [surveyPd.columns[-1]]
  surveyPd.columns = l1 + l2 + l3
target = surveyPd[surveyPd.columns[-1]]
if target.dtype != bool:
    raise Exception('Last column is not formatted as the Target Boolean Column'.format(target))
    print("File " + FileName + " is formatted incorrectly for python script")
else:
    print("File " + FileName + " is formatted correctly for python script")
surveyPd

File Q3_2018 is formatted correctly for python script


Unnamed: 0,BusinessUnit,TenureRange,Location,EmployeeType,Gender,Generation,TermType,TermReason,QuarterTermed,DateSurveywasCompleted,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,target
0,GSS,blank,Manila,Contractor,blank,blank,blank,blank,blank,8/12/18 17:03,5.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,4.0,3.0,10.0,False
1,Sales,blank,Manila,Contractor,blank,blank,blank,blank,blank,8/12/18 17:03,4.0,5.0,4.0,4.0,3.0,5.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,5.0,9.0,False
2,Cloud Operations,blank,Manila,Contractor,blank,blank,blank,blank,blank,8/12/18 17:05,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,9.0,False
3,HR,blank,Manila,Contractor,blank,blank,blank,blank,blank,8/12/18 17:10,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,11.0,False
4,GSS,blank,Manila,Contractor,blank,blank,blank,blank,blank,8/12/18 17:10,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,3.0,3.0,6.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3545,Innovation,blank,St. Petersburg,Contractor,blank,blank,blank,blank,blank,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3546,Finance,blank,Manila,Contractor,blank,blank,blank,blank,blank,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3547,Sales,blank,Manila,Contractor,blank,blank,blank,blank,blank,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3548,Innovation,blank,St. Petersburg,Contractor,blank,blank,blank,blank,blank,blank,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [12]:
questions


['Leadership clearly communicates the direction in which the company is moving.',
 'I am confident about the future performance of the company.',
 'I have confidence in the decisions made by the leadership at the company.',
 'I have a clear understanding of the priorities for the company.',
 'Communication between departments at the company is good.',
 "Leadership at my location shares insights and information that makes me feel connected to the company's strategy.",
 'Leadership acts in accordance with the values of the company.',
 'The people I work with cooperate to get the job done.',
 'My direct manager communicates what is expected of me.',
 'My direct manager gives me quarterly feedback on my performance.',
 'The feedback I receive from my direct manager helps me improve my performance.',
 'My direct manager recognizes me for my contributions on the job.',
 'I am comfortable discussing concerns with my direct manager.',
 'My job makes good use of my skills and abilities.',
 'My 

# Code to preprocess columns that are mostly blank by dropping columns if the average non-blanks is less than a specific threshold

In [13]:
threshold = 0.3
droplist = []
for col in list(surveyPd.select_dtypes(include = num)):
  if (surveyPd[col] > 0).mean() < threshold:
    droplist.append(col)
    surveyPd = surveyPd.drop(col,axis=1)

# Columns that were dropped because mostly blank

In [14]:
droplist

[]

In [15]:
#using sklearn to split the data
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.metrics import classification_report, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve



X_train, X_test, y_train, y_test = train_test_split(surveyPd.iloc[:,0:-1], target, test_size=0.2, random_state = 42) #need to change this if we want to only include questions
#right now this is including all of the categorical columns for the datasets from Company #2



# Running Tensor Flow's Gradient Boost Classifier

# Pre Processing Data Using One Hot Encoder

In [None]:
fc = tf.feature_column
cat = ['object']
num = ['float']
CATEGORICAL_COLUMNS = surveyPd.select_dtypes(include=cat).columns
NUMERIC_COLUMNS = surveyPd.select_dtypes(include=num).columns

def one_hot_cat_column(feature_name, vocab):
  return fc.indicator_column(
      fc.categorical_column_with_vocabulary_list(feature_name,
                                                 vocab))
feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  # Need to one-hot encode categorical features.
  vocabulary = X_train[feature_name].unique()
  feature_columns.append(one_hot_cat_column(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(fc.numeric_column(feature_name,
                                           dtype=tf.float32))

# Build the input Pipeline

In [18]:
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  def input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((X.to_dict(orient='list'), y))
    if shuffle:
      dataset = dataset.shuffle(NUM_EXAMPLES)
    # For training, cycle thru dataset as many times as need (n_epochs=None).
    dataset = (dataset
      .repeat(n_epochs)
      .batch(NUM_EXAMPLES))
    return dataset
  return input_fn

# Training and evaluation input functions.
train_input_fn = make_input_fn(X_train, y_train)
eval_input_fn = make_input_fn(X_test, y_test, shuffle=False, n_epochs=1)

# Train the model

In [19]:
params = {
  'n_trees': 50,
  'max_depth': 3,
  'n_batches_per_layer': 1,
  # You must enable center_bias = True to get DFCs. This will force the model to
  # make an initial prediction before using any features (e.g. use the mean of
  # the training labels for regression or log odds for classification when
  # using cross entropy loss).
  'center_bias': True
}

est = tf.estimator.BoostedTreesClassifier(feature_columns, **params)
# Train model.
est.train(train_input_fn, max_steps=100)

# Evaluation.
results = est.evaluate(eval_input_fn)
#clear_output()
pd.Series(results).to_frame()

NameError: ignored

For performance reasons, when your data fits in memory, we recommend use the arg train_in_memory=True in the tf.estimator.BoostedTreesClassifier function. However if training time is not of a concern or if you have a very large dataset and want to do distributed training, use the tf.estimator.BoostedTrees API shown above.

In [None]:
in_memory_params = dict(params)
in_memory_params['n_batches_per_layer'] = 1
# In-memory input_fn does not use batching.
def make_inmemory_train_input_fn(X, y):
  y = np.expand_dims(y, axis=1)
  def input_fn():
    return dict(X), y
  return input_fn
train_input_fn = make_inmemory_train_input_fn(X_train, y_train) #Do we need to restrict to the questions?

# Train the model.
est = tf.estimator.BoostedTreesClassifier(
    feature_columns, 
    train_in_memory=True, 
    **in_memory_params)

est.train(train_input_fn)
# Evaluation.
results = est.evaluate(eval_input_fn)
#clear_output()
pd.Series(results).to_frame()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns_colors = sns.color_palette('colorblind')
pred_dicts = list(est.experimental_predict_with_explanations(eval_input_fn))

# Feature Importance and Model Interpretability

In [None]:
importances = est.experimental_feature_importances(normalize=True)
df_imp = pd.Series(importances)

# Plot Feature Importance

In [None]:
# Scatter plot 
trace = go.Scatter(
    y = df_imp.values,
    x = df_imp.index,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = df_imp.values,
        colorscale='Portland',
        showscale=True
    ),
    text = surveyPd.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Tensor Flow Gradient Boost Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 1,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatterrf1')
fig.show(renderer="colab")

In [None]:
#Here are the most important questions according to the GB algorithm

gb_importances = df_imp.values
gb_index = df_imp.index
new_index = gb_index

featureimportance_gb = pd.DataFrame({'Question Number': new_index, 'Feature Importance': gb_importances}, columns=['Question Number', 'Feature Importance'])
gb_df = featureimportance_gb.sort_values(by=['Feature Importance'], ascending=False)
gb_df

gb_five = gb_df.head(n=5)
print(gb_five)

question_column = gb_five.loc[:,'Question Number']

imp_gb_questions = question_column.values






In [None]:
#returning the 5 most important questions

print("The most important factors/questions are:  \n", imp_gb_questions)

# for i in imp_gb_questions:
#   print(questions[int(i)]) #q_cols is for company #1

# Logit Regressor

In [20]:
#consider normalizing the scores with z-scores- only use training set, not the test 
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train.select_dtypes(include=num), y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
y_pred = logreg.predict(X_test.select_dtypes(include=num))
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test.select_dtypes(include=num), y_test)))

Accuracy of logistic regression classifier on test set: 0.99


In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

sns.heatmap(confusion_matrix, annot=True, yticklabels=label_names, xticklabels=label_names, fmt="g")


[[706   0]
 [  4   0]]


NameError: ignored

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict(X_test.select_dtypes(include=num)))
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test.select_dtypes(include=num))[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

# Machine Learning Benchmarks

# Hyper Parameter Search using Sklearn Grid Search Method for RF,
Does RF only take numeric inputs?

In [23]:
#Try different loss functions check sample weight too
import sklearn
from sklearn.model_selection import GridSearchCV
rf_parameters = {'n_jobs':[1], 
                 'max_depth':[3,5],
                 'min_samples_split':[0.05], 
                 'class_weight':["balanced",'balanced_subsample'],
                 'n_estimators':[50,100,500],
                 'max_features':[0.3,'auto'], 
                 'random_state':[42]
                 }
svc = sklearn.ensemble.RandomForestClassifier()
clf = GridSearchCV(svc, rf_parameters)
clf.fit(X_train.select_dtypes(include=num), y_train)
rf_results = pd.DataFrame.from_dict(clf.cv_results_)

# Random Forest Hyper Parameter Results

In [24]:
newdf = pd.DataFrame.from_dict(clf.cv_results_['params'])
result = pd.concat([newdf,rf_results], axis=1, join='inner')
result.sort_values('rank_test_score', ascending=True)

Unnamed: 0,class_weight,max_depth,max_features,min_samples_split,n_estimators,n_jobs,random_state,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,param_n_jobs,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,balanced,5,auto,0.05,500,1,42,0.935963,0.009053,0.058562,0.000164,balanced,5,auto,0.05,500,1,42,"{'class_weight': 'balanced', 'max_depth': 5, '...",0.955986,0.966549,0.971831,0.97007,0.985915,0.97007,0.009643,1
22,balanced_subsample,5,auto,0.05,100,1,42,0.271062,0.00612,0.013102,0.000207,balanced_subsample,5,auto,0.05,100,1,42,"{'class_weight': 'balanced_subsample', 'max_de...",0.955986,0.959507,0.97007,0.975352,0.989437,0.97007,0.011941,1
7,balanced,5,0.3,0.05,100,1,42,0.208009,0.008184,0.014103,0.001385,balanced,5,0.3,0.05,100,1,42,"{'class_weight': 'balanced', 'max_depth': 5, '...",0.955986,0.959507,0.97007,0.977113,0.987676,0.97007,0.011572,1
23,balanced_subsample,5,auto,0.05,500,1,42,1.332205,0.012672,0.063359,0.003258,balanced_subsample,5,auto,0.05,500,1,42,"{'class_weight': 'balanced_subsample', 'max_de...",0.955986,0.964789,0.971831,0.97007,0.987676,0.97007,0.010386,1
19,balanced_subsample,5,0.3,0.05,100,1,42,0.283162,0.001273,0.014414,0.001603,balanced_subsample,5,0.3,0.05,100,1,42,"{'class_weight': 'balanced_subsample', 'max_de...",0.957746,0.963028,0.966549,0.977113,0.984155,0.969718,0.009604,5
10,balanced,5,auto,0.05,100,1,42,0.19118,0.010025,0.013716,0.001017,balanced,5,auto,0.05,100,1,42,"{'class_weight': 'balanced', 'max_depth': 5, '...",0.955986,0.955986,0.973592,0.973592,0.987676,0.969366,0.012075,6
8,balanced,5,0.3,0.05,500,1,42,1.007239,0.010538,0.062341,0.004164,balanced,5,0.3,0.05,500,1,42,"{'class_weight': 'balanced', 'max_depth': 5, '...",0.948944,0.97007,0.964789,0.977113,0.984155,0.969014,0.011972,7
21,balanced_subsample,5,auto,0.05,50,1,42,0.14169,0.009259,0.007735,0.000381,balanced_subsample,5,auto,0.05,50,1,42,"{'class_weight': 'balanced_subsample', 'max_de...",0.955986,0.959507,0.971831,0.966549,0.987676,0.96831,0.011135,8
20,balanced_subsample,5,0.3,0.05,500,1,42,1.410325,0.016891,0.060058,0.000933,balanced_subsample,5,0.3,0.05,500,1,42,"{'class_weight': 'balanced_subsample', 'max_de...",0.948944,0.966549,0.964789,0.977113,0.984155,0.96831,0.011993,8
9,balanced,5,auto,0.05,50,1,42,0.095465,0.002156,0.007429,0.000108,balanced,5,auto,0.05,50,1,42,"{'class_weight': 'balanced', 'max_depth': 5, '...",0.955986,0.952465,0.97007,0.971831,0.987676,0.967606,0.012578,10


# Using the best results of the Hyper Parameter Search on the Test Data

In [29]:
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

clf1 = RandomForestClassifier(**clf.best_params_) #defining the algorithm
#Fitting the Data
clf1.fit(X_train.select_dtypes(include=num), y_train)


#ROC Curve
RF_roc_auc = roc_auc_score(y_test, clf1.predict(X_test.select_dtypes(include=num)))
fpr, tpr, thresholds = roc_curve(y_test, clf1.predict_proba(X_test.select_dtypes(include=num))[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % RF_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('RF_ROC')
plt.show()

#defining the predicted data and the list for the label names
y_pred = clf1.predict(X_test.select_dtypes(include=num))
label_names=['Stayed','Attrited']

#Creating the confustion matrix
cf_matrix = metrics.confusion_matrix(y_test, y_pred)
sns.heatmap(cf_matrix, annot=True, yticklabels=label_names, xticklabels=label_names, fmt="g")

#Creating confustion matrix for the train set
#cf1_matrix = metrics.confusion_matrix(y_train, clf1.predict(X_train.select_dtypes(include=num)))
#sns.heatmap(cf1_matrix, annot=True, yticklabels=label_names, xticklabels=label_names, fmt="g")

#Creating the full statistical report for the train set   
print(classification_report(y_test, y_pred, target_names=label_names))

# #Classification report for the test set
# print(classification_report(y_train, clf1.predict(X_train.select_dtypes(include=num))))

NameError: ignored

# Scatter Plot of Feature Importance Determined from RF Model

In [None]:
# Scatter plot 
trace = go.Scatter(
    y = clf1.feature_importances_,
    x = surveyPd.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = clf1.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = surveyPd.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Random Forest Feature Importance',
    titlefont=dict(
            size=40),
    hovermode= 'closest',
     xaxis= dict(
         title = 'Question Number',
         titlefont=dict(size=30),
         ticklen= 1,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        titlefont=dict(size=30),
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatterrf1')
fig.show(renderer="colab")

#Ranking of important questions for RF

In [None]:
#Here are the most important questions according to the RF algorithm

rf_importances = clf1.feature_importances_
rf_index = surveyPd.select_dtypes(include=num).columns.values 
#new_index = np.delete(rf_index, -1)

featureimportance_df = pd.DataFrame({'Question Number': rf_index, 'Feature Importance': rf_importances}, columns=['Question Number', 'Feature Importance'])
fi_df = featureimportance_df.sort_values(by=['Feature Importance'], ascending=False)
fi_df

top_five = fi_df.head(n=5)
print(top_five)

question_column = top_five.loc[:,'Question Number']

imp_questions = question_column.values

#returning the 5 most important questions

print("The most important questions are: ")

for i in imp_questions:
  print(questions[int(i)])



## Hyper-Parameter Search for RF (K-Fold Cross Validation)



In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

#scores = cross_val_score(clf, surveyPd.loc[:,0:38], surveyPd[39], cv=5)
scores = cross_val_score(clf1, X_train.select_dtypes(include=num), y_train,cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
# from sklearn.model_selection import cross_val_predict
# predicted = cross_val_predict(clf1, surveyPd.loc[:,0:38], surveyPd[39], cv=10)
# predicted

# Hyper Parameter Search using Sklearn Grid Search Method for Gradient Boost

In [None]:
#Trying out Gradient Boosting
#importing
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn import metrics

#setting the gradient boosting parameter -- come back later if we need to change anything other than n_estimators
gbm = GradientBoostingClassifier()

# Hyper Parameter Search using Sklearn Grid Search Method for Gradient Boost

In [None]:
from sklearn.model_selection import GridSearchCV
gb_parameters = {
    "n_estimators":[50,100], 
    'max_features':['auto',0.3],
    "max_depth":[1,3,5],
    "random_state":[42],
    "learning_rate":[0.1,0.25],
    "loss": ['deviance', 'exponential']
}

gbm1 = GridSearchCV(gbm, gb_parameters)
gbm1.fit(X_train.select_dtypes(include=num),y_train)

# Gradient Boost Hyper Parameter Search Results

In [None]:
gbm_results = pd.DataFrame.from_dict(gbm1.cv_results_)
newdf = pd.DataFrame.from_dict(gbm1.cv_results_['params'])
result = pd.concat([newdf,gbm_results], axis=1, join='inner')
result.sort_values('rank_test_score', ascending=True)

# Fitting the best parameters on test data using Gradient Boost

In [None]:
from sklearn import metrics
#Fitting the model
gbm2 = GradientBoostingClassifier(**gbm1.best_params_)
gbm2.fit(X_train.select_dtypes(include=num), y_train)

#Predicting based on the test set
y_grad_predicted = gbm2.predict(X_test.select_dtypes(include=num))

#printing full metrics report
#print(metrics.classification_report(y_test, y_grad_predicted))
print(classification_report(y_train, gbm2.predict(X_train.select_dtypes(include=num))))     
print(classification_report(y_test, gbm2.predict(X_test.select_dtypes(include=num))))


GB_roc_auc = roc_auc_score(y_test, gbm2.predict(X_test.select_dtypes(include=num)))
fpr, tpr, thresholds = roc_curve(y_test, gbm2.predict_proba(X_test.select_dtypes(include=num))[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Gradient Boost (area = %0.2f)' % GB_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Gradient Boost_ROC')
plt.show()

print('Confusion Matrix - Training Dataset')
print(pd.crosstab(y_test.ravel(), y_grad_predicted, rownames = ['True'], colnames = ['Predicted'], margins = True))

# Feature Importance Visual as Determined by Gradient Boost Model

In [None]:
# Scatter plot 
trace = go.Scatter(
    y = gbm2.feature_importances_,
    x = surveyPd.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = gbm2.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = surveyPd.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'Gradient Boost Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 1,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scattergb')
fig.show(renderer="colab")

In [None]:
#Here are the most important questions according to the GB algorithm

gb_importances = gbm2.feature_importances_
gb_index = surveyPd.select_dtypes(include=num).columns.values 
#new_index = np.delete(rf_index, -1)

featureimportance_df = pd.DataFrame({'Question Number': gb_index, 'Feature Importance': gb_importances}, columns=['Question Number', 'Feature Importance'])
fi_df = featureimportance_df.sort_values(by=['Feature Importance'], ascending=False)
fi_df

top_five = fi_df.head(n=5)
print(top_five)

question_column = top_five.loc[:,'Question Number']

imp_questions = question_column.values

#returning the 5 most important questions

#print(imp_questions)
print('The questions are:')
for i in imp_questions:
  print(questions[int(i)])



# K Fold Cross Validation on the Gradient Boost Model

In [None]:
scores = cross_val_score(gbm2, X_train.select_dtypes(include=num), y_train,cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(gbm2, X_train.select_dtypes(include=num), y_train, cv=10)
predicted

# Hyper Parameter Search using Sklearn Grid Search Method for Xtreme Gradient Boost

In [None]:
from xgboost import XGBClassifier

xgm = XGBClassifier()

#try a more different learning rates and other parameter values
xgm_parameters = {
    "n_estimators":[50,100,200], 
    'max_features':[1, 10,'auto','sqrt'],
    "max_depth":[1,3,5],
    "random_state":[42],
    "learning_rate":[0.05,0.1,0.25],
    
}
xgm1 = GridSearchCV(xgm, xgm_parameters)
xgm1.fit(X_train.select_dtypes(include=num),y_train.values.ravel())

# Hyper Parameter Experiment Results using XGB

In [None]:
xgm_results = pd.DataFrame.from_dict(xgm1.cv_results_)
newdf = pd.DataFrame.from_dict(xgm1.cv_results_['params'])
result = pd.concat([newdf,xgm_results], axis=1, join='inner')
result.sort_values('rank_test_score', ascending=True)

# Testing Best Parameters from HP search on Test Data using XGB Model

In [None]:
#XGboost
from xgboost import XGBClassifier
xgm2 = XGBClassifier(**xgm1.best_params_)

xgm2.fit(X_train.select_dtypes(include=num), y_train)

y_xg_predicted = xgm2.predict(X_test.select_dtypes(include=num))

#print(metrics.classification_report(y_test, y_xg_predicted))
print(classification_report(y_train, xgm2.predict(X_train.select_dtypes(include=num))))     
print(classification_report(y_test, xgm2.predict(X_test.select_dtypes(include=num))))

XGB_roc_auc = roc_auc_score(y_test, xgm2.predict(X_test.select_dtypes(include=num)))
fpr, tpr, thresholds = roc_curve(y_test, xgm2.predict_proba(X_test.select_dtypes(include=num))[:,1])
plt.figure()
plt.plot(fpr, tpr, label='X Gradient Boost (area = %0.2f)' % XGB_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('XGB_ROC')
plt.show()

print('Confusion Matrix - Training Dataset')
print(pd.crosstab(y_test.ravel(), y_xg_predicted, rownames = ['True'], colnames = ['Predicted'], margins = True))

# Feature Importance Visual from XGB Model

In [None]:
# Scatter plot 
trace = go.Scatter(
    y = xgm2.feature_importances_,
    x = surveyPd.columns.values,
    mode='markers',
    marker=dict(
        sizemode = 'diameter',
        sizeref = 1,
        size = 13,
        #size= rf.feature_importances_,
        #color = np.random.randn(500), #set color equal to a variable
        color = xgm2.feature_importances_,
        colorscale='Portland',
        showscale=True
    ),
    text = surveyPd.columns.values
)
data = [trace]

layout= go.Layout(
    autosize= True,
    title= 'XG Boost Feature Importance',
    hovermode= 'closest',
     xaxis= dict(
         ticklen= 1,
         showgrid=False,
        zeroline=False,
        showline=False
     ),
    yaxis=dict(
        title= 'Feature Importance',
        showgrid=False,
        zeroline=False,
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig,filename='scatterxg')
fig.show(renderer="colab")

In [None]:
#Here are the most important questions according to the RF algorithm

xg_importances = xgm2.feature_importances_
xg_index = surveyPd.select_dtypes(include=num).columns.values 
#new_index = np.delete(rf_index, -1)

featureimportance_df = pd.DataFrame({'Question Number': xg_index, 'Feature Importance': xg_importances}, columns=['Question Number', 'Feature Importance'])
fi_df = featureimportance_df.sort_values(by=['Feature Importance'], ascending=False)
fi_df

top_five = fi_df.head(n=5)
print(top_five)

question_column = top_five.loc[:,'Question Number']

imp_questions = question_column.values

#returning the 5 most important questions

print(imp_questions)

for i in imp_questions:
  print(questions[int(i)])



# K Fold Cross Validation on the XGB Model

In [None]:
scores = cross_val_score(xgm2, X_train.select_dtypes(include=num), y_train,cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(xgm2, X_train.select_dtypes(include=num), y_train, cv=10)
predicted

# AUTO KERAS MODELLING

# Using Auto Keras to build a Classification Model


In [None]:

#AutoKeras
#Try this with a different loss function

import logging
tf.get_logger().setLevel(logging.ERROR)

from sklearn.metrics import classification_report, confusion_matrix
import autokeras as ak


#fitting the model
alf = ak.StructuredDataClassifier(max_trials=5, loss="binary_crossentropy")
alf.fit(X_train,y_train, verbose=1, epochs=10)

alf_eval = alf.evaluate(X_train, y_train)
alf_eval


#alf_predict = alf.predict(X_test)
#cm = confusion_matrix(y_test,alf_predict)
#sns.heatmap(cm, annot=True)

model = alf.export_model()
model.summary()


Keras

# Class Weight Experiment with Keras

In [None]:
#checking the shape of our data

print(X_train.shape)
print(y_train.shape)

In [None]:
counts = np.bincount(y_train)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(y_train)
    )
)

#Determining the weight needed for each class
##This could be changed if we wished to weight the attriting class even more strongly
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

print("Weight for neg class", weight_for_0)
print("Weight for attrit class", weight_for_1)

In [None]:
#trying regular keras based on this credit card fraud example- https://keras.io/examples/structured_data/imbalanced_classification/
#class weight serves to make the loss function pay more attention to a miss
from tensorflow import keras

#defining the model
model = keras.Sequential(
    [
        keras.layers.Dense(
            256, activation="relu", input_shape=(X_train.shape[-1],)
        ),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()


In [None]:
from sklearn.utils import class_weight

metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("atrrit_model_at_epoch_{epoch}.h5")]
class_weight = {0:weight_for_0, 1:weight_for_1} #adjust this to determine the penalty for missing

model.fit(X_train,y_train,
    batch_size=2048,
    epochs=10,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_test, y_test),
    class_weight=class_weight

)


In [None]:
# np.set_printoptions(precision=4, suppress=True)
from sklearn import metrics
# eval_results = model.evaluate(X_test, y_test, verbose=0) 

# print("\nLoss, accuracy on test data: ")
# print("%0.4f, %0.2f%%" % (eval_results[0], \
#   eval_results[1]*100))

keras_predicted = model.predict(X_test)


y_pred = np.argmax(keras_predicted, axis=1)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

print('Classification Report')
target_names = ['Stay','Attrit']
print(classification_report(y_test, y_pred, target_names=target_names))

#print(metrics.classification_report(y_test, keras_predicted))


# Appendix



```
# This is formatted as code
```

Questions that were asked to the recipients of Client #1's survey

In [None]:
q_cols = ['general empl satisfaction',
 'recommend empl unit to friend',
 'motivation of direct colleagues you are working with day to day? *Top 3',
 'My direct colleagues are coping well with the change and transformation in our division/function',
 'People in my team frequently go above and beyond the requirements of the job',
 'I am proud working for my company and gladly tell people about it',
 'I believe strongly in and support the future direction of my company',
 'I trust that my company takes action that balances the best interests of our people, business and clients',
 "My company's handling of this year’s challenges leaves me confident in our future success",
 "My company's future direction leaves me confident about my career opportunities here",
 'In my team we continuously use client feedback to improve our products/services',
 'In my team we have an ongoing dialogue about our clients and their requirements and expectations',
 'Compared to last year, our clients now view us as:',
 'My immediate manager creates an atmosphere of openness and trust',
 'I have confidence in my immediate manager',
 'I have confidence in the global senior management of [division]',
 'I have confidence in the senior management of [FUNCTION: Finance, HR, Technology, Operations, Marketing]',
 'I have confidence in my local/country senior management',
 'As a member of [FUNCTION: Finance, HR, Technology, Operations, Marketing] I am familiar with the overall objectives & strategies of my function',
 'I am familiar with the overall objectives & strategies of [division]',
 'I can clearly see how my own work contributes to the overall objectives & strategies of [division]',
 'In my team we work towards clear objectives',
 'In my team we make decisions rapidly when it is necessary',
 'In my team actions are taken quickly when decisions have been made',
 'Please rate the cooperation between different units within [division]',
 'Please rate the cooperation within [FUNCTION: Finance, HR, Technology, Operations, Marketing]',
 'Please rate the cooperation across the company as a whole',
 'In my team new ideas receive very strong support and encouragement',
 'I feel I can make my own decisions concerning my work',
 "I have seen action taken based on the results of last year's survey",
 'In my company diversity of skills, experiences, background and ways of working are recognized and appreciated',
 'My immediate manager treats all team members fairly, regardless of age, gender identity, sex, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'My immediate manager effectively works with people who are different from them to achieve business results',
 'In my team we treat each other fairly, regardless of age, sex, gender identity, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'I feel included in my team',
 'My company takes an interest in my well-being',
 'My work schedule allows me sufficient flexibility to meet my personal/family needs',
 'Career opportunities always go to the most qualified person regardless of age, gender identity, sex, family status, race, national origin, nationality, religion, disability or sexual orientation',
 'Is there anything else you want to share with us?']

#Catboost

In [None]:
#one hot encoding is built into this model. do not need to use it in the preprocessing
from catboost import CatBoostClassifier, Pool
from sklearn import metrics
from catboost.utils import get_confusion_matrix

#train the model

train_dataset = Pool(data=X_train,
                     label=y_train)

eval_dataset = Pool(data=X_test,
                    label=y_test)


cbm = CatBoostClassifier(verbose=200, loss_function='Logloss', eval_metric='AUC',random_seed=42)

cbm.fit(train_dataset, use_best_model=True, eval_set=eval_dataset, plot=True)


cat_predict = cbm.predict(X_test)

print(cat_predict)

cm1 = get_confusion_matrix(cbm, Pool(X_train, y_train))
print(cm1)

cm2 = get_confusion_matrix(cbm, Pool(X_test, y_test))
print(cm2)

print(cbm.get_best_score())
print(cbm.get_best_iteration())

print("Count of trees in model = {}".format(cbm.tree_count_))


#Hyper Parameter Search using Sklearn Grid Search Method for SVM

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
svm_parameters = {'kernel':('linear', 'rbf'), 'C':[1,5]} #added a class weight
svc = svm.SVC()
clf2 = GridSearchCV(svc, svm_parameters)
clf2.fit(X_train, y_train)

# Hyper Parameter Results for SVM

---



In [None]:
svm_results = pd.DataFrame.from_dict(clf2.cv_results_)
newdf = pd.DataFrame.from_dict(clf2.cv_results_['params'])
result = pd.concat([newdf,svm_results], axis=1, join='inner')
result

# Best Parameters Chosen for SVM on Test Data

In [None]:
from sklearn import svm
clf3 = svm.SVC(**clf2.best_params_).fit(X_train, y_train)
clf3.score(X_test, y_test)
print(classification_report(y_train, clf3.predict(X_train)))
      
print(classification_report(y_test, clf3.predict(X_test)))

plot_roc_curve(clf3, X_test, y_test)
print('Confusion Matrix - Training Dataset')
print(pd.crosstab(y_test.ravel(),  clf3.predict(X_test), rownames = ['True'], colnames = ['Predicted'], margins = True))

# K-Fold Cross Validation of SVM Model

In [None]:
scores = cross_val_score(clf3, X_train, y_train,cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
from sklearn.model_selection import cross_val_predict
predicted = cross_val_predict(clf3, X_train, y_train, cv=10)
predicted