# Classification tree

In [65]:
from sklearn import tree
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

# Tune the tree depth (3..20) with 10-fold cross-validation on the training set.
parameters = {'max_depth': list(range(3, 21))}
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=2, cv=10)
clf.fit(X, y)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

# Feature matrix and true labels of the hold-out set, built with the same
# preprocessing as the training data.
Xpred = preprocess_for_sklearn(test.drop(["left"], axis=1))
ypred = test.left

0.9807843417386957 {'max_depth': 20}


In [66]:
# Fit a tree with a hand-picked depth on the full training set; this replaces
# the tree_model taken from the grid search.
# NOTE(review): the grid search printed max_depth=20 as best, 18 is used here - confirm intent.
tree_model = tree.DecisionTreeClassifier(max_depth=18).fit(X, y)

# Hard class predictions and their accuracy on the hold-out set.
predicted_labels = tree_model.predict(Xpred)
print(accuracy_score(ypred, predicted_labels))

# Per-class probabilities (last expression, so the notebook displays them).
tree_model.predict_proba(Xpred)

0.997


array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [15]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score

In [5]:
def preprocess_for_sklearn(data):
    """Turn a raw employee frame into the feature matrix passed to sklearn.

    The input frame is not modified; a new frame is returned.

    Steps:
      * lower-case all column names and the values of the ``sales`` column;
      * one-hot encode ``sales`` (prefix ``dept``) and ``salary``
        (prefix ``salary``);
      * drop the employee ID, the two original categorical columns and one
        dummy per categorical variable (``dept_accounting``, ``salary_high``)
        to avoid perfect multicollinearity;
      * drop the ``name`` column when it is present.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain (case-insensitively) ``emp_id``, ``sales`` and ``salary``
        columns, and the ``accounting`` / ``high`` categories must occur,
        otherwise the mandatory drop below fails.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the dummy columns.
    """
    # rename/assign return new frames, so the caller's object stays untouched.
    data = data.rename(columns=str.lower)
    data = data.assign(sales=data["sales"].str.lower())
    # create dummies for categorical variables
    data = data.join(pd.get_dummies(data["sales"], prefix="dept"))
    data = data.join(pd.get_dummies(data["salary"], prefix="salary"))
    # drop variables that should not be in the X matrix: employee ID,
    # categorical variables that have been converted into dummies and one
    # dummy per categorical variable (to avoid perfect multicollinearity)
    data = data.drop(["emp_id", "salary_high", "dept_accounting",
                      "sales", "salary"], axis=1)
    # the test data has an extra column compared to the train data: the name
    # of the employee. errors="ignore" makes the drop a no-op when the column
    # is absent, independent of which exception type pandas would raise.
    return data.drop(["name"], axis=1, errors="ignore")

In [9]:
import os  # os is not imported by any visible cell; needed on a fresh kernel

# Show which files are available in the data directory.
os.listdir('../Data')

['.DS_Store', 'turnover.csv', 'random-names.csv', 'train.csv']

In [25]:
# Training set: read the CSV and discard the 'Unnamed: 0' column.
data = pd.read_csv('../Data/train.csv').drop(['Unnamed: 0'], axis=1)
print(data.shape)

# Test set: every row gets emp_ID = 1.
# NOTE(review): presumably a placeholder so that preprocess_for_sklearn can
# drop emp_id on the test frame as well - confirm.
test = pd.read_csv('../Data/test.csv')
test['emp_ID'] = 1
print(test.shape)

(13999, 11)
(1000, 11)


In [26]:
# Split the training frame into the target (left) and the feature matrix.
features_raw = data.drop(["left"], axis=1)
X = preprocess_for_sklearn(features_raw)
y = data.left

# Rows x features of the training matrix.
X.shape

(13999, 18)

In [41]:
# Depth-5 decision tree fitted on the full training set (fit returns the estimator).
clf = tree.DecisionTreeClassifier(max_depth=5).fit(X, y)

In [42]:
# Build the test-set feature matrix with the same preprocessing as the
# training data, and keep the true labels for later comparison.
Xpred = preprocess_for_sklearn(test.drop(["left"], axis=1))
ypred = test.left
# NOTE(review): a bare expression in the middle of a cell is only displayed
# when the notebook is configured to echo every expression; the recorded
# output does show the shape, so that setting appears to have been active.
Xpred.shape
# Hard class predictions from the tree fitted as clf.
predicted_labels = clf.predict(Xpred)
# Per-class probabilities (last expression, displayed by the notebook).
clf.predict_proba(Xpred)

(1000, 18)

array([[0.02133517, 0.97866483],
       [0.98824403, 0.01175597],
       [0.98824403, 0.01175597],
       ...,
       [0.96996997, 0.03003003],
       [0.08179724, 0.91820276],
       [0.        , 1.        ]])

# Logistic regression

In [6]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
pd.options.display.max_columns = 25

In [14]:
def read_data(table_name):
    """Load a whole database table and split off the target column if present.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is concatenated into the SQL text
        unescaped, so it must come from trusted code, never from user input.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``(features, left)`` when the table has a ``left`` column.
    pandas.DataFrame
        The table unchanged when it has no ``left`` column.
    """
    engine = create_engine(config.database_config)
    sql = "select * from " + table_name
    try:
        data = pd.read_sql_query(sql, con=engine)
    finally:
        # Release the connection pool; a new engine is created on every call.
        engine.dispose()
    # Explicit column check instead of a bare except, so real errors are not
    # silently turned into the "no target column" result.
    if "left" not in data.columns:
        return data
    y = data["left"]
    return (data.drop(["left"], axis=1), y)

def preprocess_for_sklearn(data):
    """Turn a raw employee frame into the feature matrix passed to sklearn.

    The input frame is not modified; a new frame is returned. The target
    column is NOT handled here - callers remove it before calling.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain (case-insensitively) ``emp_id``, ``sales`` and ``salary``
        columns, and the ``accounting`` / ``high`` categories must occur,
        otherwise the mandatory drop below fails.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the ``dept_*`` and
        ``salary_*`` dummy columns.
    """
    # rename/assign return new frames, so the caller's object stays untouched.
    data = data.rename(columns=str.lower)
    data = data.assign(sales=data["sales"].str.lower())
    # create dummies for categorical variables
    data = data.join(pd.get_dummies(data["sales"], prefix="dept"))
    data = data.join(pd.get_dummies(data["salary"], prefix="salary"))
    # drop variables that should not be in the X matrix: employee ID, the
    # original categorical columns and one dummy per category (to avoid
    # perfect multicollinearity)
    data = data.drop(["emp_id", "salary_high", "dept_accounting", "sales", "salary"], axis=1)
    # "name" is optional: errors="ignore" skips it when absent without
    # swallowing unrelated exceptions the way a bare except would.
    return data.drop(["name"], axis=1, errors="ignore")

In [15]:
# Historical table: read_data returns a (features, target) pair here, so the
# feature frame is element 0; show how many columns it has.
data = read_data('employees_hist_data')
data[0].shape[1]

10

In [16]:
# Evaluation table: read_data returns a single frame here; show its column count.
data = read_data('employees_eval_jan18')
data.shape[1]

11

## Playground

##### The next cell fits and predicts with scikit-learn; its probabilities should match the results below

In [17]:
# Load the historical table and build the feature matrix.
X, y = read_data('employees_hist_data')
X = preprocess_for_sklearn(X)

# Fit a logistic regression with default settings (fit returns the estimator).
logreg = LogisticRegression().fit(X, y)

# .loc slices by label and includes the end label: 0:1 selects two rows,
# 0:10 selects eleven.
y_pred = logreg.predict(X.loc[0:1])
# Column 1 of predict_proba is the probability of the second class.
y_pred_prob = logreg.predict_proba(X.loc[0:10])[:, 1]
print(y_pred)
print(y_pred_prob)

[0 0]
[ 0.1154772   0.01001855  0.35615017  0.60428624  0.06932601  0.60068867
  0.45953769  0.17723004  0.03859902  0.09246403  0.09077333]


In [18]:
# Take the row labelled 2 as a plain list and wrap it into a 1 x n_features
# array, the 2-D shape predict_proba expects.
a = list(X.loc[2])
test = np.array([a])
# Row 0, column 1: probability of the second class for this observation.
y_pred = logreg.predict_proba(test)[0, 1]
y_pred

0.35615016853537901

## Check everything worked nicely with pickle approach

##### Predict with scikit-learn on a single observation

In [19]:
pkl_filename = '../models/logistic.pkl'
# The context manager closes the file even when unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [20]:
# Start from the row labelled 2 and override three entries by position.
# In the row displayed further down, position 6 is promotion_last_5years,
# 16 is salary_low and 17 is salary_medium.
a = list(X.loc[2])
a[6] = 1
a[16] = 0
a[17] = 0

# Score the modified observation with the unpickled model: row 0, column 1
# is the probability of the second class.
test = np.array([a])
y_pred = model.predict_proba(test)[0, 1]
y_pred
#60% when low
#46.8% when medium
#20.8% when high

0.030291716574721447

In [21]:
# Display the feature row labelled 2 (all columns).
X.loc[2]

satisfaction_level         0.41
last_evaluation            0.50
number_project             2.00
average_montly_hours     128.00
time_spend_company         3.00
work_accident              0.00
promotion_last_5years      0.00
dept_hr                    0.00
dept_it                    0.00
dept_management            1.00
dept_marketing             0.00
dept_product_mng           0.00
dept_randd                 0.00
dept_sales                 0.00
dept_support               0.00
dept_technical             0.00
salary_low                 1.00
salary_medium              0.00
Name: 2, dtype: float64

Test the app with this data:
- [0.57, 0.82, 4, 269, 2, False, False, Technical, Medium] ==> 21.30%
- [0.97, 0.61, 4, 262, 3, True, True, Marketing, Medium] ==> 0.52%

In [17]:
# Set element 6 of the feature list `a` back to 0, then display the list.
# NOTE(review): the recorded output has 20 values while the feature row shown
# earlier has 18 entries, and this cell's execution count (17) precedes the
# cells above it - `a` here appears to come from an earlier kernel state.
a[6] = 0
a

[0.96999999999999997,
 0.60999999999999999,
 4.0,
 262.0,
 3.0,
 1.0,
 0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 262.0,
 3.0]