# <center> Logistic Regression

## <center> Outcomes

You should be able to:
- Implement logistic regression
- Understand confusion matrices
- Understand AUC and ROC curves
- Select the best classifier model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.datasets import make_classification
import seaborn as sns
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})
%matplotlib inline

In [None]:
## generate data
# Synthetic binary-classification data with a single informative feature,
# shifted by +3 before being tabulated alongside the target.
X, y = make_classification(
    n_samples=100,
    n_features=1,
    n_informative=1,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=4,
)
X = X + 3
df = pd.DataFrame(X.tolist(), columns=['X1']).assign(target=y)
df.head()

In [None]:
## visualize data
# Scatter of the single feature against the 0/1 target, labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(X, y)
ax.set_xlabel('X1')
ax.set_ylabel('target')
ax.set_title('Target vs. X1');

In [None]:
## simple linear regression
# Ordinary least squares on the binary target; add_constant supplies the
# intercept column that sm.OLS does not add by itself.
X_design = sm.add_constant(X)
model = sm.OLS(y, X_design).fit()
model.summary()

In [None]:
## regression line
# Overlay the OLS fit on the data, evaluated on an evenly spaced grid.
X_lin = np.linspace(1, 5, 100)
fig, ax = plt.subplots()
ax.scatter(X, y, label='data')
ax.plot(X_lin, model.predict(sm.add_constant(X_lin)), color='orange', label='OLS fit')
ax.set_xlabel('X1')
ax.set_ylabel('target')
ax.set_title('Linear regression on a binary target')
ax.legend();

## <center>Sigmoid Function

In [None]:
def sigmoid(x):
    """Logistic sigmoid, 1 / (1 + exp(-x)), applied element-wise.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s). Lists and tuples are converted to NumPy arrays.

    Returns
    -------
    NumPy float or ndarray with values in [0, 1], matching the shape of ``x``.
    """
    # asarray lets plain Python sequences be negated and exponentiated.
    x = np.asarray(x, dtype=float)
    return 1 / (1 + np.exp(-x))

In [None]:
# Plot the sigmoid over a wide symmetric range to show its S shape.
X_lin = np.linspace(-20, 20, 100)
fig, ax = plt.subplots()
ax.plot(X_lin, sigmoid(X_lin))
ax.set_xlabel('x')
ax.set_ylabel('sigmoid(x)')
ax.set_title('Sigmoid function');

In [None]:
## comparing different sigmoid functions
# One figure per slope i; within a figure each curve uses a different offset j.
# Every curve carries its own label, so the legend is built once per figure
# from the lines that actually exist instead of a hard-coded label list.
X_lin = np.linspace(-20, 20, 100)
for i in range(1, 5):
    fig, ax = plt.subplots()
    for j in range(-4, 5, 2):
        ax.plot(X_lin, sigmoid(X_lin * i + j), label=str(j))
    ax.set_title('i=' + str(i))
    ax.legend(title='j')
    plt.show()

## <center>Logistic Regression

- Based on probability and maximum likelihood estimation (MLE)
- Instead of predicting continuous values, predicts binary class labels
- Performs classification by predicting either a 0 or 1 for each data point

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
## fit logistic model
# Train on the single-feature data, then overlay the hard 0/1 predictions
# made on an evenly spaced grid (a step function).
model = LogisticRegression()
model.fit(X, y)
X_lin = np.linspace(1, 5, 100)
grid_labels = model.predict(X_lin.reshape(-1, 1))
plt.scatter(X, y)
plt.plot(X_lin, grid_labels, color='orange')

In [None]:
# Learned weight(s) and intercept of the logistic model fitted above.
model.coef_, model.intercept_

In [None]:
# Draw the fitted probability curve: sigmoid(w * x + b) using the learned
# single weight and intercept.
slope = model.coef_[0, 0]
offset = model.intercept_[0]
X_lin = np.linspace(0, 6, 100)
plt.scatter(X, y)
plt.plot(X_lin, sigmoid(X_lin * slope + offset))

### With two features...

In [None]:
# Two informative features, well separated classes; shifted by +3 as before.
X, y = make_classification(
    class_sep=1.5,
    n_samples=100,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=222,
)
X = X + 3
df = pd.DataFrame(X.tolist(), columns=['X1', 'X2']).assign(target=y)
df.head()

In [None]:
# Plot the two features against each other, colouring each point by its target class.
df.plot.scatter(x='X1',y='X2',c='target',cmap='winter')

In [None]:
# Fit on both features and report the learned parameters.
model = LogisticRegression(solver='lbfgs').fit(X, y)
print('Coefficients:', model.coef_[0])
print('Intercept:', model.intercept_)  # label was misspelled 'Intecept'

In [None]:
# The decision boundary is where w0*x1 + w1*x2 + b = 0,
# i.e. x2 = -(w0*x1 + b) / w1. Draw that line over the data.
w = model.coef_[0]
b = model.intercept_
X_lin = np.linspace(-1, 7, 100)
y_lin = -(w[0] * X_lin + b) / w[1]
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='winter')
plt.plot(X_lin, y_lin, color='orange')
plt.ylim(-2, 8)
plt.show()

### Comparing logistic models...

In [None]:
# Harder problem: four features (two informative), weak class separation,
# and a quarter of the labels assigned at random via flip_y.
X, y = make_classification(
    flip_y=.25,
    n_clusters_per_class=1,
    class_sep=0.5,
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_classes=2,
    random_state=44477,
)
X = X + 3
df = pd.DataFrame(X.tolist(), columns=['X1', 'X2', 'X3', 'X4']).assign(target=y)
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
)

In [None]:
## base model
# Logistic regression on all features; report accuracy on both splits.
base_model = LogisticRegression()
base_model.fit(X_train, y_train)
for split_name, split_X, split_y in (('Train', X_train, y_train), ('Test', X_test, y_test)):
    print(split_name + ' accuracy:', accuracy_score(split_y, base_model.predict(split_X)))

In [None]:
## RFE to find best n features
from sklearn.feature_selection import RFE
# Fit one RFE-wrapped logistic model per candidate feature count
# (1 .. n_features-1) and record train/test accuracy. The fitted selectors
# are kept in `models` so later cells can inspect them.
i_, train_acc, test_acc, models = [], [], [], []
for i in range(1, X.shape[1]):
    i_.append(i)
    # n_features_to_select must be passed by keyword; current scikit-learn
    # releases reject it as a positional argument.
    selector = RFE(LogisticRegression(solver='lbfgs'), n_features_to_select=i)
    selector.fit(X_train, y_train)
    train_acc.append(accuracy_score(y_train, selector.predict(X_train)))
    test_acc.append(accuracy_score(y_test, selector.predict(X_test)))
    models.append(selector)

In [None]:
# Accuracy on each split as a function of the number of features kept by RFE.
plt.plot(i_, train_acc, label='Train')
plt.plot(i_, test_acc, label='Test')
plt.legend()
plt.title('Model Accuracy')
plt.xlabel('Features')

In [None]:
# Two separate statements: joining the prints with a comma builds a tuple,
# which the notebook then displays as a stray `(None, None)`.
print(train_acc)
print(test_acc)

<center><img src='confusion_matrix.png' height=1000 width=1000>

In [None]:
from sklearn.metrics import confusion_matrix
# One test-set confusion matrix per RFE model, side by side.
# fmt='g' prints raw counts; seaborn's default format would show counts
# of 100 or more in scientific notation.
fig, axes = plt.subplots(1, 3)
fig.set_size_inches(20, 5)
panel_titles = ['1 feature', '2 features', '3 features']
for ax, fitted, panel_title in zip(axes, models, panel_titles):
    sns.heatmap(confusion_matrix(y_test, fitted.predict(X_test)), ax=ax, annot=True,
                annot_kws={"size": 16}, cmap='coolwarm', fmt='g')
    ax.set_title(panel_title)

In [None]:
## finding best feature
# Keep a single feature; support_ is a boolean mask marking which one survived.
# n_features_to_select is keyword-only in current scikit-learn releases.
model = RFE(LogisticRegression(solver='lbfgs'), n_features_to_select=1).fit(X_train, y_train)
model.support_

## <center> Sensitivity and Specificity

<center> Sensitivity = True Positives / Total Actual Positives = TP / (TP + FN) <br>
Specificity = True Negatives / Total Actual Negatives = TN / (TN + FP)

<center> The two typically trade off against each other - when we increase one, we usually decrease the other

### <center> What's more important to minimize - false positives or false negatives?

In [None]:
# Test-set confusion matrix for the current `model`, annotated with raw counts (fmt='g').
sns.heatmap(confusion_matrix(y_test, model.predict(X_test)), annot=True,annot_kws={"size": 16},cmap='coolwarm', fmt='g')

In [None]:
# Sensitivity (true positive rate) = TP / (TP + FN).
# ravel() on a 2x2 scikit-learn confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
sensitivity = tp / (tp + fn)
print(sensitivity)

In [None]:
# Specificity (true negative rate) = TN / (TN + FP).
# ravel() on a 2x2 scikit-learn confusion matrix yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
specificity = tn / (tn + fp)
print(specificity)

In [None]:
## get class probabilities for each test prediction
# One row per test sample, one column per class; preview the first ten rows.
probs = model.predict_proba(X_test)
probs[:10]

In [None]:
## get probabilities for predicting 1
# The second column of `probs` is the probability assigned to class 1.
positive_probs = probs[:, 1]
positive_probs[:10]

In [None]:
## setting custom threshold for positive predictions
# Predict 1 only when the positive-class probability is strictly above the threshold.
threshold = 0.50
decisions = [int(p > threshold) for p in positive_probs]
decisions

In [None]:
# Sweep several decision thresholds and tabulate the sensitivity and
# specificity obtained on the test set at each one.
thresholds = [0.15, 0.25, 0.50, 0.75, 0.85]
sens, spec = [], []
for t in thresholds:
    decisions = [int(p > t) for p in positive_probs]
    tn, fp, fn, tp = confusion_matrix(y_test, decisions).ravel()
    sens.append(tp / (tp + fn))
    spec.append(tn / (tn + fp))
results = pd.DataFrame({
    'Threshold': thresholds,
    'Sensitivity': sens,
    'Specificity': spec,
})
results

## <center> AUC and ROC

<center> ROC curves show the tradeoff between specificity and sensitivity.
<center> Sensitivity (the true positive rate) is on the y-axis while 1-specificity (the false positive rate) is on the x-axis.

<center> AUC is the area under the ROC curve. <br>
A model has a perfect fit if it has an AUC of 1.0.<br>
A naive model (random guessing) will have an AUC of 0.5.

In [None]:
from sklearn.metrics import roc_curve, auc
def plot_roc(x,y,model):
    """Draw the ROC curve of a fitted classifier and print its AUC.

    Parameters
    ----------
    x : feature matrix passed to ``model.predict_proba``.
    y : true binary labels for the rows of ``x``.
    model : fitted estimator exposing ``predict_proba``; the second
        probability column is used as the positive-class score.

    Returns
    -------
    None. The curve is drawn on a new 10x8 figure, next to a dashed
    diagonal reference line, and the AUC is printed.
    """
    plt.figure(figsize=(10, 8))
    positive_scores = model.predict_proba(x)[:, 1]
    fpr, tpr, _ = roc_curve(y, positive_scores)
    sns.set_style('darkgrid', {'axes.facecolor': '0.9'})
    print('AUC:', auc(fpr, tpr))

    # Model curve plus the diagonal reference line.
    plt.plot(fpr, tpr, color='orangered', lw=2, label='ROC')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    # Both axes get ticks every 0.05.
    tick_positions = [step / 20.0 for step in range(21)]
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks(tick_positions)
    plt.xticks(tick_positions)

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve')
    plt.legend(loc='lower right')

In [None]:
# ROC curve and AUC of the current `model` on the held-out test split.
plot_roc(X_test,y_test,model)

## <center> Class Imbalance

In [None]:
# Number of rows in each target class of the current `df`.
df['target'].value_counts()

In [None]:
# Imbalanced problem: `weights=[0.02]` makes the first class a small minority.
X, y = make_classification(
    weights=[0.02],
    n_clusters_per_class=1,
    class_sep=2,
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_classes=2,
    random_state=44,
)
X = X + 3
df = pd.DataFrame(X.tolist(), columns=['X1', 'X2', 'X3', 'X4']).assign(target=y)
print(df['target'].value_counts())
df.head()

In [None]:
# Re-split the imbalanced data: 80% train, 20% test, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=14,
)

In [None]:
# Baseline on the imbalanced data, trained without any resampling.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)
print('Test Accuracy:', accuracy_score(y_test, test_predictions))

In [None]:
# Confusion matrix of the baseline model on the test split, shown as raw counts.
cm = confusion_matrix(y_test, model.predict(X_test))
heatmap_options = dict(annot=True, annot_kws={"size": 16}, cmap='coolwarm', fmt='g')
sns.heatmap(cm, **heatmap_options)

In [None]:
# scikit-learn lays the confusion matrix out as rows = actual class,
# columns = predicted class, so ravel() yields tn, fp, fn, tp.
# The previous formulas divided by column totals, which gives precision and
# negative predictive value rather than sensitivity and specificity.
base_tn, base_fp, base_fn, base_tp = cm.ravel()
base_sensitivity = base_tp / (base_tp + base_fn)
base_specificity = base_tn / (base_tn + base_fp)
print('Sensitivity:', base_sensitivity)
print('Specificity:', base_specificity)

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class in the TRAINING split only, then refit.
smote = SMOTE(random_state=47)
# fit_resample replaces fit_sample, which has been removed from imbalanced-learn.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Train on the resampled data; fitting on the original X_train/y_train would
# make the "after SMOTE" model identical to the baseline.
model = LogisticRegression(solver='lbfgs').fit(X_train_resampled, y_train_resampled)

In [None]:
# Class counts in the original training labels.
pd.Series(y_train).value_counts()

In [None]:
# Class counts in the SMOTE-resampled training labels, for comparison with the original.
pd.Series(y_train_resampled).value_counts()

In [None]:
# Compare the baseline metrics with those of the current `model` on the same test split.
tn, fp, fn, tp = confusion_matrix(y_test, model.predict(X_test)).ravel()
divider = '-------------'

print('Before SMOTE')
print(divider)
print('Sensitivity:', base_sensitivity)
print('Specificity:', base_specificity)
print()

print('After SMOTE')
print(divider)
print('Sensitivity:', tp / (tp + fn))
print('Specificity:', tn / (tn + fp))

## <center> Activity
Using the data found in <i>loan_data.csv</i>, build a logistic regression model to predict whether future loan applicants will default on their loan or not. <br>
Then, use your best model to make predictions on new customers whose data can be found in <i>new_customers.csv</i>.<br><br>
<b>Hints</b>
- Categorical variables can be included in logistic regression; just make sure you transform them first (e.g., one-hot encode them)
- Remember to always scale your data if your features are on different scales