In [67]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math
import pickle
# Let pandas display up to 25 columns before it starts truncating wide frames.
pd.options.display.max_columns = 25

In [63]:
def read_data():
    """Load the ``employees`` table and turn it into model inputs.

    Every row of ``employees`` is read from the database described by
    ``config.database_config``. The ``left`` column becomes the target; the
    ``sales`` column (lower-cased first) and the ``salary`` column are one-hot
    encoded with the prefixes ``dept`` and ``salary``.

    Returns:
        tuple: ``(emp, y)`` where ``emp`` is the feature DataFrame and ``y``
        is the ``left`` column of the original table.

    Raises:
        KeyError: if any of the columns referenced below is missing
            (for example when the table has no ``accounting`` department or
            no ``high`` salary level).
    """
    engine = create_engine(config.database_config)
    try:
        emp = pd.read_sql_query("select * from employees", con = engine)
    finally:
        # The engine is created per call, so release its connection pool here.
        engine.dispose()

    y = emp['left']
    emp = emp.drop('left', axis=1)
    emp["sales"] = emp["sales"].str.lower()
    emp = emp.join(pd.get_dummies(emp["sales"], prefix="dept"))
    emp = emp.join(pd.get_dummies(emp["salary"], prefix="salary"))
    # Drop the identifier, the raw categorical columns, and one dummy per
    # encoded column (presumably the reference level -- confirm with the
    # modelling notes).
    emp = emp.drop(["empID", "salary_high", "dept_accounting", "sales", "salary"], axis = 1)
    return (emp, y)

In [64]:
def fit_model_db(data):
    """Fit a logistic regression and store its coefficients in the database.

    Args:
        data: indexable pair whose element 0 is the feature matrix and
            element 1 is the target, as returned by ``read_data``.

    Side effects:
        Replaces the ``betas`` table in the database described by
        ``config.database_config`` with a single column ``b_vect`` holding the
        intercept followed by the fitted coefficients.
    """
    X = data[0]
    y = data[1]
    logreg = LogisticRegression()
    logreg.fit(X, y)
    # Intercept first, then the coefficients of the first (only, for a binary
    # target) row of coef_.
    b = [logreg.intercept_[0], *logreg.coef_[0]]
    beta_df = pd.DataFrame({"b_vect" : b})
    engine = create_engine(config.database_config)
    try:
        # NOTE(review): the table stores no explicit position column, so any
        # reader relies on rows coming back in insertion order -- confirm this
        # holds for the target database.
        beta_df.to_sql(name = 'betas', con = engine, if_exists='replace', index=False)
    finally:
        # The engine is created per call, so release its connection pool here.
        engine.dispose()

In [68]:
def fit_model_pickle(data):
    """Fit a logistic regression and pickle the fitted estimator to disk.

    Args:
        data: indexable pair whose element 0 is the feature matrix and
            element 1 is the target, as returned by ``read_data``.

    Side effects:
        Writes the fitted model to ``../models/logistic.pkl`` (relative to the
        current working directory), overwriting any existing file.
    """
    X = data[0]
    y = data[1]
    logreg = LogisticRegression()
    logreg.fit(X, y)
    #pickle the model
    pkl_filename = '../models/logistic.pkl'
    # The context manager closes the file even if pickle.dump raises.
    with open(pkl_filename, 'wb') as model_pkl:
        pickle.dump(logreg, model_pkl)

In [69]:
if __name__ == "__main__":
    # Query the database once and fit both persistence variants on the same
    # data instead of re-reading the employees table for each of them.
    data = read_data()
    fit_model_db(data)
    fit_model_pickle(data)

## Check everything worked nicely with DB approach

In [70]:
# Reload the feature matrix (X) and target (y) used by the checks below.
X, y = read_data()

In [71]:
# Read the stored coefficient vector back from the `betas` table.
engine = create_engine(config.database_config)
betas_df = pd.read_sql_query("select * from betas", con = engine)
# Keep `b` as a plain list: first entry is the intercept, the rest are the
# coefficients, in the order fit_model_db wrote them.
b = list(betas_df["b_vect"])

In [72]:
# Prepend a constant 1 so the intercept in `b` lines up with the first
# observation's feature values.
a = [1] + list(X.loc[0, :])
logodds = np.inner(a, b)
# Logistic transform of the log-odds: odds / (1 + odds).
odds = math.exp(logodds)
odds / (1 + odds)

0.5267692106771844

##### The next cell makes the same prediction with scikit-learn; the probability for the first row should match the manual result above

In [78]:
# Refit on the same data and score the rows labelled 0 and 1
# (`.loc` slicing is label-based and includes both endpoints).
logreg = LogisticRegression().fit(X, y)
first_rows = X.loc[0:1, :]
y_pred = logreg.predict(first_rows)
# Column 1 of predict_proba is the probability of the second class.
y_pred_prob = logreg.predict_proba(first_rows)[:, 1]
print(y_pred, y_pred_prob, sep="\n")

[1 0]
[ 0.52676921  0.16806627]


## Check everything worked nicely with pickle approach

##### Predict using scikit-learn on a single observation

In [81]:
# Load the estimator that fit_model_pickle wrote to disk.
pkl_filename = '../models/logistic.pkl'
# NOTE: unpickling can execute arbitrary code -- only load files this project
# produced. The context manager closes the file even if pickle.load raises.
with open(pkl_filename, 'rb') as model_pkl:
    model = pickle.load(model_pkl)

In [82]:
# `a` is the first observation with a leading 1 for the intercept; the pickled
# model adds its own intercept, so pass every value after that leading 1.
# Slicing with `a[1:]` avoids hard-coding the number of feature columns.
test = np.array([a[1:]])
# Row 0, column 1: probability of the second class for the single observation.
y_pred = model.predict_proba(test)[0, 1]
y_pred

0.5267692106771843