In [1]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
pd.options.display.max_columns = 25

In [2]:
def read_data(table_name):
    """Load every row of a database table into a DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to select from. It is concatenated directly into
        the SQL text, so it must come from trusted code, never from user
        input.

    Returns
    -------
    pandas.DataFrame
        Result of ``select * from <table_name>``.
    """
    engine = create_engine(config.database_config)
    sql = "select * from " + table_name
    try:
        return pd.read_sql_query(sql, con=engine)
    finally:
        # Every call builds its own engine (and connection pool); dispose of
        # it so pooled connections are closed instead of lingering per call.
        engine.dispose()

def preprocess_for_sklearn(data, feature_columns=None):
    """Turn a raw employee table into a feature matrix and a target series.

    Column names are matched case-insensitively (they are lowercased first).
    The caller's DataFrame is left untouched; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``left``, ``emp_id``, ``sales``, ``salary``,
        ``promotion_last_5years``, ``average_montly_hours`` and
        ``time_spend_company``. A ``name`` column is dropped when present.
    feature_columns : sequence of str, optional
        When given, the returned feature matrix is reindexed to exactly these
        columns in this order. Columns missing from ``data`` (for example a
        department that does not occur in it) are filled with 0 and columns
        not listed are discarded. Pass the training matrix's columns here to
        keep prediction features aligned with the fitted model.

    Returns
    -------
    tuple
        ``(features, y)`` where ``y`` is the ``left`` column and ``features``
        holds the remaining columns, one-hot encoded ``sales`` (prefix
        ``dept``) and ``salary`` (prefix ``salary``) columns, and two
        promotion interaction terms.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy first so neither the renamed columns nor the lowercased "sales"
    # values leak back into the caller's frame.
    frame = data.copy()
    frame.columns = [str(col).lower() for col in frame.columns]

    # Read the target only after lowercasing so "Left"/"LEFT" also work.
    y = frame["left"]

    frame["sales"] = frame["sales"].str.lower()
    dept_dummies = pd.get_dummies(frame["sales"], prefix="dept")
    salary_dummies = pd.get_dummies(frame["salary"], prefix="salary")
    frame = frame.join(dept_dummies).join(salary_dummies)

    # Required columns: a missing one is a real error and should raise.
    frame = frame.drop(columns=["left", "emp_id", "sales", "salary"])
    # Baseline dummy levels and the optional "name" column may legitimately be
    # absent (e.g. a small batch with no accounting staff), so tolerate that.
    frame = frame.drop(
        columns=["salary_high", "dept_accounting", "name"], errors="ignore"
    )

    frame["interaction_promotion_hours"] = (
        frame["promotion_last_5years"] * frame["average_montly_hours"]
    )
    frame["interaction_promotion_tenure"] = (
        frame["promotion_last_5years"] * frame["time_spend_company"]
    )

    if feature_columns is not None:
        frame = frame.reindex(columns=list(feature_columns), fill_value=0)

    return (frame, y)

In [3]:
if __name__ == "__main__":
    # Placeholder entry point: the model-fitting call below is disabled, so
    # this cell only prints a marker showing that it ran.
    # NOTE(review): fit_model_pickle is not defined in the visible cells, and
    # read_data requires a table_name argument -- fix both before re-enabling.
    #fit_model_pickle(read_data())
    print("a")

a


## Playground

##### Make predictions

In [3]:
# Pull the historical employees table and turn it into the training
# feature matrix (trainX) and target (trainY).
employees_hist = read_data("employees_hist_data")
trainX, trainY = preprocess_for_sklearn(employees_hist)
trainX.shape

(13999, 20)

In [15]:
# Load the employees to score and build their feature matrix.
X = read_data("employees_new_data")
testX, testY = preprocess_for_sklearn(X)
# The model is fitted on trainX, so the prediction features must have the same
# columns in the same order. Enforce it instead of eyeballing the shapes.
assert list(testX.columns) == list(trainX.columns), "testX columns differ from trainX"
print(X.shape)
print(testX.shape)

(1000, 12)
(1000, 20)


In [16]:
# Fit a default logistic regression on the training features; fit() returns
# the estimator itself, so it can be bound in one step.
logreg = LogisticRegression().fit(trainX, trainY)
# Keep only column 1 of the class-probability matrix for each new employee
# (presumably the probability of left == 1 -- confirm via logreg.classes_).
y_pred_prob = logreg.predict_proba(testX)[:, 1]
y_pred_prob

array([ 0.05498832,  0.46337309,  0.03728143,  0.03362248,  0.17921642,
        0.15289821,  0.06380409,  0.01936548,  0.48862227,  0.11757197,
        0.01407611,  0.20986292,  0.16373877,  0.49802901,  0.01804983,
        0.03430359,  0.73060034,  0.27521771,  0.70054536,  0.41576207,
        0.19707009,  0.09141941,  0.46593762,  0.02440954,  0.34640849,
        0.68246706,  0.29395368,  0.11854759,  0.45024026,  0.00410469,
        0.16254216,  0.00459151,  0.24868572,  0.03782671,  0.80140354,
        0.01112243,  0.12760658,  0.2106953 ,  0.13936907,  0.33487866,
        0.01643811,  0.2154735 ,  0.11922153,  0.38095633,  0.62213785,
        0.34315901,  0.16409795,  0.72384333,  0.15474204,  0.08310823,
        0.12247275,  0.09676831,  0.28944259,  0.18697845,  0.05786593,
        0.08154415,  0.57827353,  0.09195722,  0.08504352,  0.06097026,
        0.04297368,  0.17064764,  0.0523079 ,  0.12100497,  0.28039716,
        0.11612801,  0.45963285,  0.34916696,  0.47287211,  0.28

##### Check that the pickled model produces the same predictions

In [17]:
pkl_filename = '../models/logistic.pkl'
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [18]:
# Score the new employees with the unpickled model and keep column 1 of the
# class-probability matrix as a single-column frame named "phat".
y_pred = pd.DataFrame(model.predict_proba(testX)[:, 1], columns=["phat"])
X.head(1)

Unnamed: 0,emp_id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,sales,salary,name
0,14013,0.9,0.56,3,151,3,0,0,0,sales,medium,Adeline Powell


In [19]:
# Attach the predicted probabilities by index. Dropping any earlier "phat"
# first keeps the cell re-runnable: a second plain join would fail because
# the column names overlap.
X = X.drop(columns=["phat"], errors="ignore").join(y_pred)

In [27]:
# Rank employees from highest to lowest predicted probability.
X = X.sort_values("phat", ascending=False)

In [28]:
# Show only the ten highest-scored employees instead of dumping the whole frame.
X.head(10)

Unnamed: 0,emp_id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,sales,salary,name,phat
170,1418,0.11,0.96,6,301,5,0,1,0,technical,low,Angela Moyer,0.818898
291,798,0.11,0.91,5,291,4,0,1,0,technical,low,Benjamin Pierce,0.813738
583,4684,0.12,0.70,4,276,4,0,0,0,sales,low,Lauren Keller,0.808982
34,11197,0.22,0.70,2,274,10,0,0,0,sales,high,Cesar Yoder,0.801404
859,876,0.11,0.94,6,277,5,0,1,0,technical,low,Viviana Allison,0.800949
440,2500,0.13,0.67,3,181,4,0,0,0,technical,low,Shelby Walsh,0.799015
347,3537,0.13,0.91,2,149,5,0,0,0,sales,medium,Keira Cardenas,0.794271
616,9040,0.15,0.40,3,236,5,0,0,0,hr,medium,Jacob Leach,0.780778
261,1806,0.10,0.87,6,254,5,0,1,0,support,low,Leland Ortiz,0.777769
485,12158,0.11,0.92,6,305,4,0,1,0,technical,low,Gianni Dougherty,0.773995


In [5]:
# Summary statistics for every column of the training feature matrix.
trainX.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,promotion_last_5years,dept_hr,dept_it,dept_management,dept_marketing,dept_product_mng,dept_randd,dept_sales,dept_support,dept_technical,salary_low,salary_medium,interaction_promotion_hours,interaction_promotion_tenure
count,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0,13999.0
mean,0.612619,0.716351,3.800843,201.06479,3.505322,0.143939,0.02143,0.049575,0.082006,0.042289,0.058576,0.059861,0.052289,0.27702,0.146868,0.180727,0.490392,0.426816,4.286163,0.090078
std,0.248617,0.171347,1.231543,49.977392,1.463207,0.35104,0.144818,0.217073,0.274383,0.201254,0.234837,0.237238,0.222618,0.447542,0.353987,0.384806,0.499926,0.494633,29.812827,0.687769
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,292.0,10.0
