In [1]:
import config
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.linear_model import LogisticRegression
import math

In [2]:
def read_data():
    """Load the ``employees`` table and split it into model inputs and target.

    The ``left`` column is returned as the target. The categorical columns
    ``sales`` and ``salary`` are replaced by indicator (dummy) columns named
    ``<column>_<value>``; afterwards ``empID``, ``salary_high`` and
    ``sales_accounting`` are removed from the feature frame.

    Returns:
        tuple: ``(features, target)`` -- the encoded feature DataFrame and the
        ``left`` Series.
    """
    db = create_engine(config.database_config)
    employees = pd.read_sql_query("select * from employees", con=db)

    target = employees['left']
    features = employees.drop('left', axis=1)

    # Swap each categorical column for its indicator columns. The indicators
    # are appended on the right, in the order the variables are listed here.
    for column in ('sales', 'salary'):
        indicators = pd.get_dummies(features[column], prefix=column)
        features = features.drop(column, axis=1).join(indicators)

    # NOTE(review): salary_high / sales_accounting look like the reference
    # level of each categorical (dropped so the dummies are not redundant
    # with the intercept) -- confirm.
    features = features.drop(["empID", "salary_high", "sales_accounting"], axis=1)
    return (features, target)

In [3]:
def fit_model(data):
    """Fit a logistic regression and store its parameters in the ``betas`` table.

    Args:
        data: An indexable pair whose first item is the feature matrix and
            whose second item is the target, e.g. the tuple returned by
            ``read_data``.

    The fitted intercept followed by the coefficients (in feature-column
    order) is written as the single column ``b_vect``. An existing ``betas``
    table is replaced. Nothing is returned.
    """
    features, target = data[0], data[1]

    model = LogisticRegression()
    model.fit(features, target)

    # Intercept goes first, so entry i + 1 lines up with feature column i.
    parameters = [model.intercept_[0]] + list(model.coef_[0])
    beta_df = pd.DataFrame({"b_vect": parameters})

    db = create_engine(config.database_config)
    beta_df.to_sql(name='betas', con=db, if_exists='replace', index=False)

In [4]:
if __name__ == "__main__":
    # Load the training data, fit the model, and write its parameters to the database.
    fit_model(read_data())

## Check that the stored coefficients reproduce scikit-learn's predictions

In [5]:
# Reload the features and target so the stored coefficients can be checked against them.
X, y = read_data()

In [6]:
# Read back the parameter vector saved in the `betas` table (intercept first,
# then one coefficient per feature column).
# NOTE(review): the query has no ORDER BY, so this relies on the rows coming
# back in insertion order -- confirm for the target database.
engine = create_engine(config.database_config)
b = list(pd.read_sql_query("select * from betas", con=engine)["b_vect"])

In [7]:
# Manual prediction for the row with index label 0: prepend 1 for the
# intercept term, take the inner product with the stored parameters to get
# the log-odds, then apply the logistic function to obtain a probability.
a = [1] + list(X.loc[0, :])
logodds = np.inner(a, b)
math.exp(logodds) / (1 + math.exp(logodds))

0.5267692106766013

##### The next cell makes the same prediction using scikit-learn

In [8]:
# Refit the same model in-process and let scikit-learn produce the
# predictions; the first probability should match the hand-computed value
# from the previous cell.
logreg = LogisticRegression().fit(X, y)
y_pred = logreg.predict(X)
# Column 1 of predict_proba is the probability of the second class in logreg.classes_.
y_pred_prob = logreg.predict_proba(X)[:, 1]
print(y_pred)
print(y_pred_prob)

[1 0 1 ..., 1 1 1]
[ 0.52676921  0.16806627  0.533359   ...,  0.5403611   0.75754036
  0.5544593 ]
