In [1]:
def preprocess_for_sklearn(data):
    """Turn a raw employee table into a numeric feature matrix for scikit-learn.

    Steps, in order:
      1. lower-case every column name and the values of the ``sales`` column
         (the ``salary`` values are used as they are);
      2. one-hot encode ``sales`` (prefix ``dept``) and ``salary``
         (prefix ``salary``);
      3. drop the identifier ``emp_id``, the two raw categorical columns and
         one reference dummy per categorical (``dept_accounting`` and
         ``salary_high``) to avoid perfect multicollinearity;
      4. drop the ``name`` column when it is present (the test data carries
         it, the train data does not).

    The caller's frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``emp_id``, ``sales`` and ``salary`` (column names
        are matched after lower-casing). ``sales`` must contain the value
        ``accounting`` (any case) and ``salary`` the value ``high`` so that
        the two reference dummies exist.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the kept dummy columns.

    Raises
    ------
    Whatever ``DataFrame.drop`` raises when one of the mandatory columns or
    reference dummies listed above is missing.
    """
    # Work on a copy: the original implementation renamed the columns of the
    # frame that was passed in, which silently changed the caller's data.
    data = data.copy()
    data.columns = [column.lower() for column in data.columns]
    data["sales"] = data["sales"].str.lower()

    # create dummies for categorical variables
    data = data.join(pd.get_dummies(data["sales"], prefix="dept"))
    data = data.join(pd.get_dummies(data["salary"], prefix="salary"))

    # drop variables that should not be in the X matrix:
    # the employee ID, the categorical variables that have been converted
    # into dummies, and one dummy per categorical variable
    data = data.drop(["emp_id", "salary_high", "dept_accounting",
                      "sales", "salary"], axis=1)

    # `name` is optional. errors="ignore" replaces the former
    # `except ValueError`, which does not catch the KeyError that current
    # pandas raises for a missing label.
    return data.drop(columns=["name"], errors="ignore")

In [2]:
import pandas as pd
import numpy as np

# Read the training set and the hold-out set from CSV files located
# relative to the notebook, printing each frame's (rows, columns) shape.
train_path = '../Data/train.csv'
test_path = '../Data/test.csv'

data = pd.read_csv(train_path)
print(data.shape)
test = pd.read_csv(test_path)
print(test.shape)

(13999, 11)
(500, 11)


In [3]:
# Training target and features: `left` is the label, everything else goes
# through the shared preprocessing.
y = data["left"]
X = preprocess_for_sklearn(data.drop(columns=["left"]))
print(X.shape)

# Same split for the hold-out set that the models are scored on.
ypred = test["left"]
Xpred = preprocess_for_sklearn(test.drop(columns=["left"]))
print(Xpred.shape)

(13999, 18)
(500, 18)


## Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Fit a seeded random forest on the training data and score it on the
# hold-out set.
clf = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=45)
clf.fit(X, y)

predicted_labels = clf.predict(Xpred)

accuracy = accuracy_score(ypred, predicted_labels)
recall = recall_score(ypred, predicted_labels)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))

# Displayed as the cell result: rows are true labels, columns are predictions.
confusion_matrix(ypred, predicted_labels)

Accuracy: 0.988
Recall: 0.9512195121951219


array([[377,   0],
       [  6, 117]])

In [9]:
# Print the training column names, then display the fitted model's
# feature importances as the cell result.
print(list(X.columns))
clf.feature_importances_

['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'work_accident', 'promotion_last_5years', 'dept_hr', 'dept_it', 'dept_management', 'dept_marketing', 'dept_product_mng', 'dept_randd', 'dept_sales', 'dept_support', 'dept_technical', 'salary_low', 'salary_medium']


array([0.31486599, 0.1165723 , 0.18519465, 0.15299445, 0.19243088,
       0.00913026, 0.00162846, 0.00120084, 0.00145694, 0.00146177,
       0.00092767, 0.00087791, 0.00151864, 0.00278334, 0.00241922,
       0.00306884, 0.00810917, 0.00335868])

In [38]:
# Column 1 of predict_proba, kept as a plain list with one entry per
# hold-out row and displayed in full as the cell result.
positive_class_proba = clf.predict_proba(Xpred)[:, 1]
predictions = list(positive_class_proba)
predictions
# first four values noted by the author: 0.0081, 0.0234, 0.999, 0.0210

[0.008100544128256499,
 0.023456534627407516,
 0.9990197344218668,
 0.021045376238137438,
 0.006014165637143473,
 0.002130667846515974,
 0.0008978126829792089,
 0.0019089629366426002,
 0.9649690280833286,
 0.38003538030481493,
 0.9975055706352188,
 0.002083905385869403,
 0.032414388168441305,
 0.9998491379310345,
 0.011446800094212794,
 0.02057253564281204,
 0.0009451900407798677,
 0.07186700308466848,
 0.013899775990597918,
 0.01042989417989418,
 0.009338454768357657,
 0.0020202919407212917,
 0.9979705037384082,
 0.0004467846840739008,
 0.008599570941262567,
 0.0008567269894048809,
 0.0009302035262482726,
 0.00571038067439078,
 0.0044110465485383155,
 0.005449235629075776,
 0.003305273923969109,
 0.015020851976388245,
 0.034706108890349044,
 0.04418514331420325,
 0.022400884192829738,
 0.03317453295090429,
 0.0004982868404914277,
 0.0018062467475153038,
 0.028321886215422434,
 0.9991217752381932,
 0.00123058291136255,
 0.10534241427722608,
 0.0014065327832014727,
 0.01710421863531836,

In [21]:
# Class-probability table for the hold-out set, built from the classifier
# currently bound to `clf`. The cell used to rely on a `pred_proba` that is
# only created further down the notebook, so it failed with a NameError on a
# fresh top-to-bottom run.
pred_proba = pd.DataFrame(clf.predict_proba(Xpred))

# Look at one hold-out row: its predicted label (printed) and the
# probability in column 1 of the table (displayed as the cell result).
n = 285
print(predicted_labels[n])
pred_proba.loc[n, 1]

1


0.9987496361344554

In [23]:
# Score hold-out row 285 on its own, wrapped into a one-row 2-D array,
# and display the probability in column 1.
row_285 = Xpred.loc[285]
test_obs = np.array([row_285])
clf.predict_proba(test_obs)[0, 1]

0.9987496361344554

In [32]:
# Hand-built what-if observation (1 row x 18 features). The positions follow
# the feature list printed in the random-forest section.
#
# Starting point noted by the author:
#   [0.11, 0.8, 6.0, 285.0, 4.0, 0.0, 0.0, 0.0, ..., 0.0, 1.0]
# Compared with it, this vector has satisfaction 0.11 -> 0.21, hours
# 285 -> 250, promotion 0 -> 1 and the last salary dummy 1 -> 0.
x = np.array([[
    0.21,   # satisfaction_level
    0.8,    # last_evaluation
    6.0,    # number_project
    250.0,  # average_montly_hours
    4.0,    # time_spend_company
    0.0,    # work_accident
    1.0,    # promotion_last_5years
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,  # nine dept_* dummies
    0.0, 0.0,                                     # salary_low, salary_medium
]])

In [33]:
# Probability in column 1 for the hand-built observation `x`.
clf.predict_proba(x)[0, 1]

0.035365913818268854

## Classification tree

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Shallow decision tree, fitted on the training data and scored on the
# hold-out set.
# NOTE: this rebinds `clf`, which up to here referred to the random forest;
# the cells below read the tree through that name.
# random_state is fixed (same seed as the forest cell) so that ties between
# equally good splits are broken the same way on every run.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=45)
clf = clf.fit(X, y)

predicted_labels = clf.predict(Xpred)

print("Accuracy: " + str(accuracy_score(ypred, predicted_labels)))
print("Recall: " + str(recall_score(ypred, predicted_labels)))

# Displayed as the cell result: rows are true labels, columns are predictions.
confusion_matrix(ypred, predicted_labels)

In [None]:
# Print the training column names, then display the importances of the
# model currently bound to `clf`.
print(list(X.columns))
clf.feature_importances_

In [None]:
# One row per hold-out observation, one column per class returned by
# predict_proba.
class_proba = clf.predict_proba(Xpred)
pred_proba = pd.DataFrame(class_proba)

In [None]:
# Look at one hold-out row: print its predicted label and display the
# probability stored in column 1 of `pred_proba`.
n = 329
print(predicted_labels[n])
pred_proba.at[n, 1]

In [None]:
# Score hold-out row 329 on its own, wrapped into a one-row 2-D array,
# and display the probability in column 1.
row_329 = Xpred.loc[329]
test_obs = np.array([row_329])
clf.predict_proba(test_obs)[0, 1]

In [None]:
# Hand-built observation (1 row x 18 features). The positions follow the
# feature list printed in the random-forest section.
x = np.array([[
    0.12,   # satisfaction_level
    0.6,    # last_evaluation
    4.0,    # number_project
    400.0,  # average_montly_hours
    5.0,    # time_spend_company
    0.0,    # work_accident
    0.0,    # promotion_last_5years
    0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,  # dept_hr ... dept_sales
    1.0,    # dept_support
    0.0,    # dept_technical
    1.0,    # salary_low
    0.0,    # salary_medium
]])

In [None]:
# Display the per-class probabilities that the model bound to `clf`
# returns for the hand-built observation `x`.
clf.predict_proba(x)

In [None]:
# Disabled experiment: 10-fold grid search over max_depth 3..20 for the
# decision tree.
# NOTE(review): `grid_search` is not imported anywhere in the visible
# notebook; in current scikit-learn GridSearchCV is provided by
# sklearn.model_selection -- confirm before re-enabling. Running it would
# also rebind `clf` to the search object.
#parameters = {'max_depth': list(range(3,21))}
#clf = grid_search.GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs=2, cv = 10)
#clf.fit(X, y)
#tree_model = clf.best_estimator_
#print(clf.best_score_, clf.best_params_)

## Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

# Logistic regression with default settings, fitted on the training data
# and scored on the hold-out set.
logreg = LogisticRegression()
logreg.fit(X, y)

predicted_labels = logreg.predict(Xpred)

accuracy = accuracy_score(ypred, predicted_labels)
recall = recall_score(ypred, predicted_labels)
print("Accuracy: " + str(accuracy))
print("Recall: " + str(recall))

# Displayed as the cell result: rows are true labels, columns are predictions.
confusion_matrix(ypred, predicted_labels)

## Playground: scratch experiments and leftovers

In [None]:
def read_data(table_name):
    """Load a whole database table and split off the ``left`` target if present.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is concatenated into the SQL text
        as-is, so only pass trusted, hard-coded names.

    Returns
    -------
    (pandas.DataFrame, pandas.Series) or pandas.DataFrame
        ``(features, left)`` when the table has a ``left`` column, where
        ``features`` is the table without that column; otherwise the table
        itself.
    """
    engine = create_engine(config.database_config)
    sql = "select * from " + table_name
    try:
        data = pd.read_sql_query(sql, con=engine)
    finally:
        # A new engine is created on every call; release its connections
        # instead of leaving them to the garbage collector.
        engine.dispose()

    # Explicit column check instead of a bare `except:`, which also hid
    # unrelated errors (typos, interrupted kernels, ...).
    if "left" not in data.columns:
        return data
    y = data["left"]
    data = data.drop(columns=["left"])
    return (data, y)

def preprocess_for_sklearn(data):
    """Turn a raw employee table into a numeric feature matrix for scikit-learn.

    NOTE: this re-defines the function of the same name from the first cell
    of the notebook and replaces it once this cell runs; keep the two in sync
    or remove one of them.

    Steps, in order:
      1. lower-case every column name and the values of the ``sales`` column
         (the ``salary`` values are used as they are);
      2. one-hot encode ``sales`` (prefix ``dept``) and ``salary``
         (prefix ``salary``);
      3. drop the identifier ``emp_id``, the two raw categorical columns and
         one reference dummy per categorical (``dept_accounting`` and
         ``salary_high``) to avoid perfect multicollinearity;
      4. drop the ``name`` column when it is present.

    The caller's frame is not modified; a new frame is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``emp_id``, ``sales`` and ``salary`` (column names
        are matched after lower-casing). ``sales`` must contain the value
        ``accounting`` (any case) and ``salary`` the value ``high`` so that
        the two reference dummies exist.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the kept dummy columns.

    Raises
    ------
    Whatever ``DataFrame.drop`` raises when one of the mandatory columns or
    reference dummies listed above is missing.
    """
    # Work on a copy: the original implementation renamed the columns of the
    # frame that was passed in, which silently changed the caller's data.
    data = data.copy()
    data.columns = [column.lower() for column in data.columns]
    data["sales"] = data["sales"].str.lower()

    # create dummies for categorical variables
    data = data.join(pd.get_dummies(data["sales"], prefix="dept"))
    data = data.join(pd.get_dummies(data["salary"], prefix="salary"))

    # drop variables that should not be in the X matrix: the employee ID,
    # the raw categorical columns and one dummy per category
    data = data.drop(["emp_id", "salary_high", "dept_accounting", "sales", "salary"], axis=1)

    # `name` is optional; errors="ignore" replaces the former bare `except:`,
    # which swallowed every kind of error, not just the missing column.
    return data.drop(columns=["name"], errors="ignore")

In [None]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
# Show up to 25 columns when a DataFrame is displayed.
pd.set_option("display.max_columns", 25)

##### The next cell predicts with scikit-learn; its results should match the pickled-model results below

In [None]:
# Rebuild the training matrix from the database table and fit a
# default logistic regression on it.
X, y = read_data('employees_hist_data')
X = preprocess_for_sklearn(X)
logreg = LogisticRegression().fit(X, y)

# .loc slices are label-based and include both ends: labels 0..1 for the
# predicted classes, labels 0..10 for the column-1 probabilities.
y_pred = logreg.predict(X.loc[0:1])
y_pred_prob = logreg.predict_proba(X.loc[0:10])[:, 1]
print(y_pred)
print(y_pred_prob)

In [None]:
# Score the row labelled 2 on its own with the freshly fitted model.
a = list(X.loc[2, :])
# Named `single_obs` rather than `test`: the old name overwrote the hold-out
# DataFrame loaded at the top of the notebook, so the cell that builds
# Xpred / ypred broke when re-run afterwards.
single_obs = np.array([a])
y_pred = logreg.predict_proba(single_obs)[:, 1][0]
y_pred

## Check everything worked nicely with pickle approach

##### Predict with scikit-learn on a single observation

In [None]:
pkl_filename = '../models/logistic.pkl'
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced by this project.
# The context manager closes the file even when unpickling fails.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [None]:
# What-if on the row labelled 2, scored with the un-pickled model.
a = list(X.loc[2, :])
# presumably positions 6 / 16 / 17 are promotion_last_5years / salary_low /
# salary_medium, going by the feature list printed in the random-forest
# section -- confirm against X.columns
a[6] = 1
a[16] = 0
a[17] = 0
# Named `single_obs` rather than `test`: the old name overwrote the hold-out
# DataFrame loaded at the top of the notebook, so the cell that builds
# Xpred / ypred broke when re-run afterwards.
single_obs = np.array([a])
y_pred = model.predict_proba(single_obs)[:, 1][0]
y_pred
# results noted by the author:
#60% when low
#46.8% when medium
#20.8% when high

In [None]:
# Display the feature row labelled 2 for reference.
X.loc[2]

Test the app with this data:
- [0.57, 0.82, 4, 269, 2, False, False, Technical, Medium] ==> 21.30%
- [0.97, 0.61, 4, 262, 3, True, True, Marketing, Medium] ==> 0.52%

In [None]:
# Set position 6 of the what-if list back to 0 and display the list.
# presumably position 6 is promotion_last_5years -- confirm against X.columns
a[6] = 0
a