In [1]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
pd.options.display.max_columns = 25

## Reproduce some functions that we will need

In [2]:
def read_data(table_name):
    """Read every row of a database table into a DataFrame.

    Parameters
    ----------
    table_name : str
        Name of the table to read, optionally schema-qualified
        (``schema.table``). Only word characters and dots are accepted
        because the name is interpolated directly into the SQL text.

    Returns
    -------
    pandas.DataFrame
        The result of ``select * from <table_name>``.

    Raises
    ------
    ValueError
        If ``table_name`` is not a plain (optionally dotted) identifier.
    """
    import re

    # Table names cannot be passed as bound parameters, so reject anything
    # that could smuggle extra SQL into the concatenated statement.
    if not re.fullmatch(r"\w+(\.\w+)*", table_name):
        raise ValueError("invalid table name: %r" % (table_name,))

    engine = create_engine(config.database_config)
    try:
        sql = "select * from " + table_name
        emp = pd.read_sql_query(sql, con = engine)
    finally:
        # A new engine is created on every call; release its connection pool.
        engine.dispose()
    return emp

def preprocess_for_sklearn(data):
    """Turn a raw employee table into a feature matrix and target.

    The input frame is left untouched; all work happens on a copy.

    Steps: lower-case the column names, lower-case the ``sales`` values,
    one-hot encode ``sales`` (prefix ``dept``) and ``salary`` (prefix
    ``salary``), drop the identifier/target/raw categorical columns plus
    one dummy level per categorical (``dept_accounting`` and
    ``salary_high``), drop ``name`` if present, and add two interaction
    terms with ``promotion_last_5years``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain (case-insensitively) the columns ``left``, ``emp_id``,
        ``sales``, ``salary``, ``promotion_last_5years``,
        ``average_montly_hours`` and ``time_spend_company``.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is the processed DataFrame and
        ``y`` is the ``left`` column.

    Raises
    ------
    KeyError
        If a required column, or one of the dropped dummy levels, is absent.
    """
    # Work on a copy so the caller's frame is not renamed or modified.
    data = data.copy()
    # Normalise names before looking up 'left' so mixed-case headers work.
    data.columns = [str(col).lower() for col in data.columns]
    y = data["left"]

    data["sales"] = data["sales"].str.lower()
    data = data.join(pd.get_dummies(data["sales"], prefix="dept"))
    data = data.join(pd.get_dummies(data["salary"], prefix="salary"))
    data = data.drop(["left", "emp_id", "salary_high", "dept_accounting", "sales", "salary"], axis = 1)
    # 'name' is optional; skip it quietly when absent without hiding other errors.
    data = data.drop(columns=["name"], errors="ignore")

    data["interaction_promotion_hours"] = data.promotion_last_5years * data.average_montly_hours
    data["interaction_promotion_tenure"] = data.promotion_last_5years * data.time_spend_company
    return (data, y)

## Make predictions

In [10]:
# Load the employees to score and build the matching feature matrix.
X = read_data("employees_new_data")
features_and_target = preprocess_for_sklearn(X)
testX = features_and_target[0]
# Raw shape first, then feature shape (should be 20 columns).
print(X.shape, testX.shape, sep="\n")

(1000, 12)
(1000, 20)


In [11]:
# Load the pickled model from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at model files from a trusted source.
pkl_filename = '../models/logistic.pkl'
# The context manager closes the file even if unpickling fails.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [12]:
# Column 1 of predict_proba is the probability of the model's second class
# (presumably left == 1 -- confirm against model.classes_).
# Index the predictions like testX so the join aligns row-for-row.
y_pred = pd.DataFrame({"phat" : model.predict_proba(testX)[:, 1]}, index=testX.index)
# Drop any 'phat' left over from a previous run so the cell can be re-executed
# without a column-overlap error from join.
X = X.drop(columns=["phat"], errors="ignore").join(y_pred)

In [13]:
# Highest predicted probability first.
X = X.sort_values("phat", ascending=False)

In [15]:
# Show the single highest-scored row.
X.iloc[:1]

Unnamed: 0,emp_id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,sales,salary,name,phat
170,1418,0.11,0.96,6,301,5,0,1,0,technical,low,Angela Moyer,0.818898


In [16]:
# Render the two highest-scored rows as an HTML table string.
X.iloc[:2].to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>emp_id</th>\n      <th>satisfaction_level</th>\n      <th>last_evaluation</th>\n      <th>number_project</th>\n      <th>average_montly_hours</th>\n      <th>time_spend_company</th>\n      <th>work_accident</th>\n      <th>left</th>\n      <th>promotion_last_5years</th>\n      <th>sales</th>\n      <th>salary</th>\n      <th>name</th>\n      <th>phat</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>170</th>\n      <td>1418</td>\n      <td>0.11</td>\n      <td>0.96</td>\n      <td>6</td>\n      <td>301</td>\n      <td>5</td>\n      <td>0</td>\n      <td>1</td>\n      <td>0</td>\n      <td>technical</td>\n      <td>low</td>\n      <td>Angela Moyer</td>\n      <td>0.818898</td>\n    </tr>\n    <tr>\n      <th>291</th>\n      <td>798</td>\n      <td>0.11</td>\n      <td>0.91</td>\n      <td>5</td>\n      <td>291</td>\n      <td>4</td>\n      <td>0</td>\n      <td>1</

#### Earlier check that predictions are the same with or without the pickled model

In [None]:
# Build the training features and target from the historical table.
hist_raw = read_data("employees_hist_data")
trainX, trainY = preprocess_for_sklearn(hist_raw)
trainX.shape

In [None]:
# Fit a fresh model on the historical data and score the new-data features,
# then peek at the first five probabilities.
logreg = LogisticRegression().fit(trainX, trainY)
y_pred_prob = logreg.predict_proba(testX)[:, 1]
y_pred_prob[:5]