<div style="font-size: 14pt;">Prof. Krzysztof Rybinski</div><br/><br/>
<div style="font-size: 22pt;"><b>Artificial Intelligence course</b></div><br/><br/>
<div style="font-size: 18pt;">LAB 3</div><br/>
<div style="font-size: 18pt;">- Classification, logistic regression</div><br/>
<div style="font-size: 18pt;">- Train set, test set</div><br/>
<div style="font-size: 18pt;">- Confusion matrix</div><br/>
<div style="font-size: 18pt;">- Accuracy, other model quality metrics</div><br/>
<div style="font-size: 18pt;">- ROC curve, threshold selection</div><br/>


In [None]:
#load necessary packages
import os
import pandas as pd
import statsmodels.api as sm
import statsmodels.graphics.api as smg
import matplotlib.pyplot as plt 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
import seaborn as sns

In [None]:
# switch the working directory to the sibling "Python" folder (resolved relative
# to the current working directory) and display the resulting path to confirm it;
# the data file loaded in the next cell is addressed relative to this folder
os.chdir("../Python")
os.getcwd()

In [None]:
#load and inspect KPI data
# the path is relative to the current working directory
kpi = pd.read_csv("data/KPI_data_for_logit_model.csv")
# list the available columns before selecting a subset
kpi.columns

In [None]:
#keep only the five explanatory columns and the binary target X502010_1
kpi = kpi[[
    'KPI_assessment_1',
    'Number.of.questionnaires_1',
    'Average_grade_1',
    'NPS_1',
    'Additional_achievement_1',
    'X502010_1',
]]
kpi.columns

In [None]:
# display the data frame restricted to the selected columns
kpi

<div style="font-size: 14pt;">Exercise 1</div>
Estimate logistic regression model from lecture 3

In [None]:
#check for multicollinearity
# pairwise correlations between all selected columns (the target is included)
corr = kpi.corr()
# two heat maps of the same matrix: first with the colour range taken from the
# data, then with normcolor=True, which fixes the colour range to (-1, 1)
smg.plot_corr(corr, xnames=corr.columns.values)
smg.plot_corr(corr, xnames=corr.columns.values, normcolor=True)
plt.show()

In [None]:
#logit of X502010_1 on all remaining columns plus an intercept,
#estimated with robust (HC0) standard errors
y502010 = kpi[['X502010_1']]
X = sm.add_constant(kpi.drop(columns=['X502010_1']))
model_kpi_sm = sm.Logit(y502010, X)
res_kpi_sm = model_kpi_sm.fit(method='newton', cov_type="hc0")
res_kpi_sm.summary()

<div style="font-size: 14pt;">Exercise 2</div>
Implement ML approach to classification problem
Create train and test sets, train the model, make predictions on the test set, calculate accuracy

In [None]:
#now we implement machine learning approach to classification
#split the data into train and test set
# The statsmodels constant column is not needed for sklearn (LogisticRegression
# fits its own intercept). errors='ignore' keeps this cell re-runnable: without it
# a second execution raises KeyError because 'const' has already been dropped.
X = X.drop(columns=['const'], errors='ignore')
# stratify on the target so train and test keep the same share of 1s;
# random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, y502010, test_size=0.2, random_state=4, stratify=y502010)

#check proportions on 1s
y_train.mean(), y_test.mean()

#what is the base model accuracy?

In [None]:
# inspect the held-out target values
y_test

In [None]:
#fit the model to the train data
# NOTE: y_train is still a one-column DataFrame at this point (it originates from
# kpi[['X502010_1']]); later cells flatten it to 1-D and refit the model
model_kpi = LogisticRegression(random_state=100)
model_kpi.fit(x_train, y_train)

In [None]:
# shape of the training target before it is flattened
y_train.shape

In [None]:
# Flatten the one-column target frames into 1-D arrays for sklearn.
# .to_numpy() replaces Series.ravel(), which pandas has deprecated, and the
# isinstance guard keeps the cell re-runnable: once the targets are already
# arrays, indexing them by column name would raise an error.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['X502010_1'].to_numpy()
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['X502010_1'].to_numpy()

In [None]:
# shape of the training target after flattening
y_train.shape

In [None]:
# refit on the flattened target, then show the fitted slopes and intercept
model_kpi.fit(x_train, y_train)
model_kpi.coef_, model_kpi.intercept_

In [None]:
# feature names of X, the frame that x_train and x_test were split from
X.columns

In [None]:
#calculate predictions
# predicted class labels for the test observations
y_pred=model_kpi.predict(x_test)
print(y_pred)

In [None]:
#compute and plot confusion matrix
cfm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
# Force a full 2x2 layout (classes are assumed to be coded 0/1): if the model
# predicts only one class, crosstab omits the missing column and the positional
# diagonal lookups used to compute accuracy would fail.
cfm = cfm.reindex(index=pd.Index([0, 1], name='Actual'),
                  columns=pd.Index([0, 1], name='Predicted'),
                  fill_value=0)
# express each cell as a share of all test observations
cfm = cfm / cfm.sum().sum()
sns.heatmap(cfm, annot=True)

In [None]:
#calculate accuracy
# accuracy = sum of the diagonal shares of the normalised confusion matrix;
# the positional lookup assumes cfm is 2x2 (both classes present on both axes)
accuracy = cfm.iloc[0,0] + cfm.iloc[1,1]
accuracy

In [None]:
#print sklearn classification report
# display names for the two classes, passed to the report as target_names
target_names = ['no raise', 'wage raise']
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
#improve the model by using ROC curve to calculate threshold
#keep only the second predict_proba column (the probability of the second class)
y_prob = model_kpi.predict_proba(x_test)[:, 1]
print(y_pred, '\n', y_prob)

In [None]:
# compare the shapes of the hard predictions and the predicted probabilities
y_pred.shape, y_prob.shape

In [None]:
#get tpr, fpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
# thresholds rounded to 2 decimals and stored as strings; they are used as
# text labels on the ROC plot
str_thresholds = [str(round(x,2)) for x in thresholds]
print(fpr, '\n', tpr, '\n', str_thresholds)

In [None]:
#replace the labels at even positions (0, 2, 4, ...) with a dot
sel = list(range(0, len(str_thresholds), 2))
for ind in sel:
    str_thresholds[ind] = '.'
print(str_thresholds)

In [None]:
# plot the roc curve for the model
plt.plot(fpr, tpr, marker='.', label='Logistic')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()

# annotate every ROC point with its label from str_thresholds
# (the rounded threshold, or '.' where the label was blanked out)
for i in range(len(thresholds)): 
    plt.text(fpr[i], tpr[i], str_thresholds[i], fontsize=10)

plt.show()

In [None]:
#calculate ROC AUC (area under curve)
# computed from the predicted probabilities, not from the hard class predictions
roc_auc_score(y_test, y_prob)

In [None]:
#inspect confusion matrix for different threshold values

############################################################
def calculate_confusion_matrix(y_test, y_prob, threshold):
    """Build, plot and score the confusion matrix for a given probability cut-off.

    Observations with ``y_prob >= threshold`` are classified as 1, all others
    as 0. The confusion matrix is expressed in shares of all observations, drawn
    as an annotated seaborn heat map, and the accuracy is printed.

    Parameters
    ----------
    y_test : array-like
        Actual class labels, coded 0/1.
    y_prob : numpy.ndarray or pandas.Series
        Predicted probability of class 1 for each observation.
    threshold : float
        Probability cut-off at or above which an observation is predicted as 1.

    Returns
    -------
    cfm : pandas.DataFrame
        2x2 matrix of shares (rows 'Actual', columns 'Predicted', labels 0 and 1).
    accuracy : float
        Sum of the diagonal shares, i.e. the fraction of correct predictions.
    """
    y_pred_t = 1*(y_prob >= threshold)
    cfm = pd.crosstab(y_test, y_pred_t, rownames=['Actual'], colnames=['Predicted'])
    # Extreme thresholds put every prediction into a single class, so crosstab
    # returns fewer than two columns (or rows). Reindex to the full 0/1 layout so
    # the positional diagonal lookup below stays valid instead of raising IndexError.
    cfm = cfm.reindex(index=pd.Index([0, 1], name='Actual'),
                      columns=pd.Index([0, 1], name='Predicted'),
                      fill_value=0)
    # convert counts to shares of all observations
    cfm = cfm / cfm.sum().sum()
    sns.heatmap(cfm, annot=True)
    accuracy = cfm.iloc[0,0] + cfm.iloc[1,1]
    print("Accuracy =  {:6.4f}".format(accuracy))
    return cfm, accuracy
############################################################

In [None]:
#try different thresholds, comment on changes in confusion matrix
# note: this overwrites the cfm and accuracy variables computed earlier from y_pred
cfm , accuracy = calculate_confusion_matrix(y_test, y_prob, 0.3)

In [None]:
# using sklearn function to compute confusion matrix
# raw counts (rows = actual class, columns = predicted class) for y_pred
cfm2 = confusion_matrix(y_test, y_pred)
cfm2

In [None]:
# convert the counts into shares of all test observations
cfm2/cfm2.sum().sum()