In [22]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
pd.options.display.max_columns = 25

In [45]:
def read_data(table_name):
    """Return the full contents of a database table as a DataFrame.

    An SQLAlchemy engine is built from ``config.database_config`` and the
    statement ``select * from <table_name>`` is run through
    ``pd.read_sql_query``.

    Parameters
    ----------
    table_name : str
        Name of the table to read. It is concatenated straight into the SQL
        text, so it must come from a trusted source.

    Returns
    -------
    pandas.DataFrame
        The rows returned by the query.
    """
    db_engine = create_engine(config.database_config)
    query = "select * from " + table_name
    return pd.read_sql_query(query, con=db_engine)

def preprocess_for_sklearn(data):
    """Build the feature matrix and target vector used for model fitting.

    Steps performed, in order:

    1. The target is read from the ``left`` column of ``data`` as passed in
       (i.e. before column names are lower-cased).
    2. Column names are lower-cased and the values of ``sales`` are
       lower-cased.
    3. ``sales`` and ``salary`` are one-hot encoded with the prefixes
       ``dept`` and ``salary``.
    4. ``left``, ``emp_id``, the raw ``sales``/``salary`` columns and one
       dummy per encoded variable (``salary_high``, ``dept_accounting``) are
       dropped; a ``name`` column is dropped as well when present.

    The input frame is left untouched: all work happens on new frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Must contain ``left`` and, case-insensitively,
        ``emp_id``, ``sales`` and ``salary``; the encoded values must
        produce ``salary_high`` and ``dept_accounting`` dummies.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is the encoded DataFrame and
        ``y`` is the ``left`` Series.

    Raises
    ------
    KeyError
        If a required column (or one of the dummies listed above) is missing.
    """
    y = data['left']

    # rename/assign return new frames, so the caller's DataFrame is not
    # modified as a side effect of preprocessing.
    frame = data.rename(columns=str.lower)
    frame = frame.assign(sales=frame["sales"].str.lower())

    frame = frame.join(pd.get_dummies(frame["sales"], prefix="dept"))
    frame = frame.join(pd.get_dummies(frame["salary"], prefix="salary"))
    frame = frame.drop(
        ["left", "emp_id", "salary_high", "dept_accounting", "sales", "salary"],
        axis=1,
    )

    # "name" is optional; ignore only its absence instead of swallowing
    # every possible exception.
    frame = frame.drop(columns=["name"], errors="ignore")

    # Candidate interaction features, currently disabled:
    #frame["interaction_promotion_eval"] = frame.promotion_last_5years * frame.last_evaluation
    #frame["interaction_promotion_tenure"] = frame.promotion_last_5years * frame.time_spend_company
    return (frame, y)

## Playground

##### The next cell fits and predicts with scikit-learn; its results should match the results below

In [76]:
# Fit a logistic regression on the whole historical table, then score a
# handful of the training rows as a quick sanity check.
data = read_data('employees_hist_data')
X, y = preprocess_for_sklearn(data)
logreg = LogisticRegression().fit(X, y)

# .loc slices by label and includes both ends: 2 rows for the class
# predictions, 11 rows for the class-1 probabilities.
y_pred = logreg.predict(X.loc[0:1])
y_pred_prob = logreg.predict_proba(X.loc[0:10])[:, 1]
print(y_pred, y_pred_prob, sep="\n")

[0 0]
[ 0.1154772   0.01001855  0.35615017  0.60428624  0.06932601  0.60068867
  0.45953769  0.17723004  0.03859902  0.09246403  0.09077333]


In [78]:
# Score the row labelled 2 on its own, passed as a 1 x n_features array.
a = list(X.loc[2])
test = np.array([a])
# Row 0, column 1 of predict_proba: probability of the second class.
y_pred = logreg.predict_proba(test)[0, 1]
y_pred

0.35615016853537901

## Check that everything works with the pickle approach

##### Predict using scikit-learn on a single observation

In [50]:
# Load the previously pickled model from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load model files that come from a trusted source.
pkl_filename = '../models/logistic.pkl'
# The context manager closes the file even if unpickling raises.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [82]:
# Build a what-if observation from the row labelled 2 and score it with the
# unpickled model. Positions follow the column order shown by X.loc[2,:].
a = list(X.loc[2])
a[6] = 1            # position 6: promotion_last_5years
a[16] = a[17] = 0   # positions 16/17: salary_low, salary_medium (both 0 -> neither dummy set)
test = np.array([a])
y_pred = model.predict_proba(test)[0, 1]
y_pred
# Results noted from earlier runs (presumably per salary level; confirm settings):
#60% when low
#46.8% when medium
#20.8% when high

0.030291716574721447

In [80]:
# Show the feature values (and their column order) for the row labelled 2.
X.loc[2]

satisfaction_level         0.41
last_evaluation            0.50
number_project             2.00
average_montly_hours     128.00
time_spend_company         3.00
work_accident              0.00
promotion_last_5years      0.00
dept_hr                    0.00
dept_it                    0.00
dept_management            1.00
dept_marketing             0.00
dept_product_mng           0.00
dept_randd                 0.00
dept_sales                 0.00
dept_support               0.00
dept_technical             0.00
salary_low                 1.00
salary_medium              0.00
Name: 2, dtype: float64

Test the app with this data:
- [0.57, 0.82, 4, 269, 2, False, False, Technical, Medium] ==> 21.30%
- [0.97, 0.61, 4, 262, 3, True, True, Marketing, Medium] ==> 0.52%

In [17]:
# Set position 6 of the feature list back to 0 and display the list.
# NOTE(review): this cell's execution count (In [17]) is older than the cells
# above, so the output shown presumably comes from an earlier kernel state;
# re-run after the cells above to confirm.
a[6] = 0
a

[0.96999999999999997,
 0.60999999999999999,
 4.0,
 262.0,
 3.0,
 1.0,
 0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 262.0,
 3.0]