In [None]:
#Logistic regression model using a composite proxy target variable

#Required packages
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

#Model data
# Raw string keeps the Windows backslashes literal; in a normal string "\D" and "\d"
# are invalid escape sequences and trigger a warning (same resulting path either way).
df = pd.read_csv(r'T:\Data Path\df_ml2.csv')
# Keep every column except 'id': the ID column is not required for regression analysis.
df = df.loc[:, df.columns != 'id']

In [None]:
#Logistic regression
# Predictor columns fed to the model; the response is the 'proxy_target' column.
feature_cols = [
    'med_age', 'age_45p', 'age_65p', 'female_p',
    'avg_mincome', 'avg_fincome', 'medu_yrs', 'fedu_yrs',
    'hh_size', 'poverty_p', 'unemp_rate',
]
X = df[feature_cols]
y = df['proxy_target']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit with scikit-learn's default settings (fit() returns the fitted estimator),
# then predict class labels for the held-out rows.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
#Confusion matrix
# Use the `confusion_matrix` function imported at the top of the notebook; the
# `metrics` module itself is never imported, so `metrics.confusion_matrix` would
# raise NameError on a fresh kernel.
# scikit-learn lays the matrix out with actual classes on rows and predicted
# classes on columns.
cnf_matrix = confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
#Confusion matrix and heatmap
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

class_names = [0,1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

sns.heatmap(pd.DataFrame(cnf_matrix), cmap = "YlGnBu", annot=True, fmt = 'g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.ylabel('Actual Class')
plt.xlabel('Predicted Class')

In [None]:
#Model evaluation
# model.score() returns the mean accuracy on the given split; format() with no
# format spec turns each value into its plain string form for printing.
train_score, test_score = (
    format(model.score(features, labels))
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print('Accuracy of logistic regression on training set:', train_score)
print('Accuracy of logistic regression on test set:', test_score)

In [None]:
# Test-set accuracy, precision and recall for the fitted model.
# The metric functions are imported here because the notebook never imports the
# `metrics` module, so `metrics.accuracy_score(...)` would raise NameError.
from sklearn.metrics import accuracy_score, precision_score, recall_score

print("Accuracy of logistic model:",accuracy_score(y_test, y_pred))
print("Precision of logistic model:",precision_score(y_test, y_pred))
print("Recall of logistic model:",recall_score(y_test, y_pred))

In [None]:
#ROC curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Score the test rows once using column 1 of predict_proba, and use the same
# scores for both the AUC and the curve. Computing the AUC from model.predict()
# (hard class labels) gives the area of a single-threshold curve, which does not
# match the probability-based curve drawn below.
y_score = model.predict_proba(X_test)[:,1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
# Save before show(): the figure is written to 'Log_ROC' in the working directory.
plt.savefig('Log_ROC')
plt.show()

In [None]:
#Coefficients of logistic model
# model.coef_ holds one row of coefficients; transposing gives one row per
# feature so each coefficient can be labelled with its column name.
model_coef = model.coef_
# Pass the column labels directly as the index. Wrapping them in a list
# ([X.columns.values]) makes pandas build a one-level MultiIndex instead of a
# plain index of feature names.
print(pd.DataFrame(model_coef.T,
             index = X.columns,
             columns = ['Coefficient']))

In [None]:
#Odds ratio from logistic model
# Exponentiating the coefficients converts them to odds ratios.
ods_ratio = np.exp(model.coef_)
# Reuse the array computed above instead of exponentiating a second time, and
# label rows with a plain index of feature names (a list-wrapped array would
# become a one-level MultiIndex).
print(pd.DataFrame(ods_ratio.T,
             index = X.columns,
             columns = ['Odds Ratio']))
print('Note: In logistic models, odds ratios are used instead of model coefficients to explain relationships and magnitudes.')