In [43]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
# Show up to 25 columns when a DataFrame is rendered in the notebook.
pd.set_option("display.max_columns", 25)

In [44]:
def read_data():
    """Load the historical employee table and build the model inputs.

    Reads every row of ``employees_hist_data`` through an SQLAlchemy engine
    created from ``config.database_config`` and derives the feature frame.

    Returns
    -------
    tuple
        ``(features, target)``. ``target`` is the ``left`` column.
        ``features`` is the remaining data with the lower-cased ``sales``
        column and the ``salary`` column one-hot encoded (``dept_*`` and
        ``salary_*`` columns), plus two interaction columns built from
        ``promotion_last_5years``.
    """
    db_engine = create_engine(config.database_config)
    employees = pd.read_sql_query("select * from employees_hist_data", con=db_engine)
    target = employees['left']

    # Lower-case the labels first so the generated dummy column names
    # (e.g. "dept_accounting", dropped below) are lower-case as well.
    employees["sales"] = employees["sales"].str.lower()
    for source_col, dummy_prefix in (("sales", "dept"), ("salary", "salary")):
        dummies = pd.get_dummies(employees[source_col], prefix=dummy_prefix)
        employees = employees.join(dummies)

    # Remove the target, the identifier, the raw categorical columns and one
    # dummy per encoded variable (presumably the reference level -- confirm).
    unused_columns = ["left", "emp_ID", "salary_high", "dept_accounting", "sales", "salary"]
    features = employees.drop(columns=unused_columns)

    promoted = features["promotion_last_5years"]
    features["interaction_promotion_hours"] = promoted * features["average_montly_hours"]
    features["interaction_promotion_tenure"] = promoted * features["time_spend_company"]
    return (features, target)

In [45]:
def fit_model_pickle(data, pkl_filename='../models/logistic.pkl'):
    """Fit a logistic regression on ``data`` and pickle the fitted model.

    Parameters
    ----------
    data : sequence
        Two-element sequence ``(X, y)``: ``data[0]`` is the feature matrix
        and ``data[1]`` the target, in the layout returned by ``read_data``.
    pkl_filename : str, optional
        Path the fitted model is written to (opened in binary write mode, so
        an existing file is overwritten). Defaults to
        ``'../models/logistic.pkl'``.

    Returns
    -------
    None
    """
    X = data[0]
    y = data[1]
    logreg = LogisticRegression()
    logreg.fit(X, y)
    # The context manager closes the file even when pickling raises, so the
    # handle is never leaked.
    with open(pkl_filename, 'wb') as model_pkl:
        pickle.dump(logreg, model_pkl)

In [35]:
if __name__ == "__main__":
    # Build the training inputs, then fit the model and write it to disk.
    training_data = read_data()
    fit_model_pickle(training_data)

## Playground

##### The next block makes predictions with scikit-learn directly; they should match the results below

In [46]:
# Refit the model in memory and score the rows labelled 0 and 1.
X, y = read_data()
logreg = LogisticRegression().fit(X, y)

first_rows = X.loc[0:1]  # .loc slices by label, so both end points are included
y_pred = logreg.predict(first_rows)
# Column 1 of predict_proba is the probability of the second class in logreg.classes_.
y_pred_prob = logreg.predict_proba(first_rows)[:, 1]
print(y_pred)
print(y_pred_prob)

[0 0]
[ 0.11570227  0.00842564]


## Check that everything worked with the pickle approach

##### Predict using scikit-learn on a single observation

In [47]:
# Load the model written by fit_model_pickle.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
pkl_filename = '../models/logistic.pkl'
# The context manager closes the file even if unpickling raises.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [57]:
# Row 61 has a positive interaction term, so it exercises the interaction features.
a = list(X.loc[61])
# predict_proba expects a 2-D input, so wrap the single row in an outer list.
test = np.array([a])
# First (only) row, column 1 of the probability matrix.
y_pred = model.predict_proba(test)[0, 1]
y_pred

0.0051892727692853698