# Exploring the Response Variable and Concluding the Initial Exploration

Import packages

In [None]:
import numpy as np #numerical computation
import pandas as pd #data wrangling
import matplotlib.pyplot as plt #plotting package
#Next line helps with rendering plots
%matplotlib inline
import matplotlib as mpl #add'l plotting functionality
mpl.rcParams['figure.dpi'] = 400 #high res figures

In [None]:
# Load the cleaned data set (the file name indicates it was saved in Chapter 1)
df = pd.read_csv('../Data/Chapter_1_cleaned_data.csv')

In [None]:
# Mean of the response column. If the column is coded 0/1 (as the ==0 / ==1
# comparisons later in this notebook suggest), this is the positive-class fraction.
df['default payment next month'].mean()

In [None]:
# Number of non-null 'ID' entries for each distinct value of the response
df.groupby('default payment next month')['ID'].count()

# Introduction to Scikit-Learn

In [None]:
# Import the logistic regression estimator class
from sklearn.linear_model import LogisticRegression

In [None]:
# Create an estimator object without passing any arguments
my_lr = LogisticRegression()

In [None]:
# Show the parameters of the new object
my_lr.get_params()

In [None]:
# Ask scikit-learn to print every parameter in estimator reprs rather than
# only the ones that were changed
from sklearn import set_config
set_config(print_changed_only=False)

In [None]:
# Examine it
my_lr

In [None]:
#Instantiate while specifying keyword arguments
my_new_lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
my_new_lr.C = 0.1
my_new_lr.solver = 'liblinear'
my_new_lr.multi_class='auto'

In [None]:
my_new_lr

In [None]:
X = df['EDUCATION'][0:10].values.reshape(-1,1)
X

In [None]:
y = df['default payment next month'][0:10].values
y

In [None]:
my_new_lr.fit(X, y)

In [None]:
new_X = df['EDUCATION'][10:20].values.reshape(-1,1)
new_X

In [None]:
my_new_lr.predict(new_X)

In [None]:
df['default payment next month'][10:20].values

# Generating Synthetic Data

In [None]:
# Seed NumPy's global random state so the draw below is reproducible
np.random.seed(1)
# 1000 values drawn uniformly between 0 and 10, as a 1-D array
X = np.random.uniform(0.0, 10.0, 1000)
X[:10]

In [None]:
# Display the type of the object currently bound to X
type(X)

# Data for a Linear Regression

In [None]:
np.random.seed(seed=1)
slope = 0.25
intercept = -1.25
y = slope * X + np.random.normal(loc=0.0, scale=1.0, size=(1000,)) + intercept

In [None]:
mpl.rcParams['figure.dpi'] = 400

In [None]:
plt.scatter(X,y,s=1)

# Exercise 8: Linear regression in Scikit-Learn

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lin_reg = LinearRegression()

In [None]:
lin_reg

In [None]:
lin_reg.fit(X.reshape(-1,1), y)

In [None]:
print(lin_reg.intercept_)
print(lin_reg.coef_)

In [None]:
y_pred = lin_reg.predict(X.reshape(-1,1))

In [None]:
plt.scatter(X,y,s=1)
plt.plot(X,y_pred,'r')

# Model performance metrics for binary classification
## Splitting the data: training and testing sets

Split the data into training (80%) and testing (20%) sets. The metrics below are introduced using scores computed on the testing set.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
df['EDUCATION'].values.reshape(-1,1), df['default payment next month'].values,
test_size=0.2, random_state=24)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
print(y_train, y_test)

In [None]:
np.mean(y_train)

In [None]:
np.mean(y_test)

## Classification accuracy

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
example_lr = LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [None]:
example_lr.fit(X_train, y_train)

In [None]:
y_pred = example_lr.predict(X_test)

Accuracy

In [None]:
is_correct = y_pred == y_test

In [None]:
np.mean(is_correct)

In [None]:
example_lr.score(X_test, y_test)

In [None]:
from sklearn import metrics

In [None]:
metrics.accuracy_score(y_test, y_pred)

# Exercise 9: Calculating the True and False Positive and Negative Rates and Confusion Matrix in Python

In [None]:
P = sum(y_test)
P

In [None]:
TP = sum( (y_test==1) & (y_pred==1) )
TP

In [None]:
TPR = TP/P
TPR

In [None]:
FN = sum( (y_test==1) & (y_pred==0) )
FN

In [None]:
FNR = FN/P
FNR

In [None]:
N = sum(y_test==0)
N

In [None]:
TN = sum( (y_test==0) & (y_pred==0))
TN

In [None]:
FP = sum( (y_test==0) & (y_pred==1))
FP

In [None]:
TNR = TN/N
FPR = FP/N
print('The true negative rate is {} and the false positive rate is {}'.format(TNR, FPR))

In [None]:
metrics.confusion_matrix(y_test, y_pred)

# Exercise 10: Obtaining Predicted Probabilities from a Trained Logistic Regression Model

In [None]:
y_pred_proba = example_lr.predict_proba(X_test)
y_pred_proba

In [None]:
prob_sum = np.sum(y_pred_proba,1)
prob_sum

In [None]:
prob_sum.shape

In [None]:
np.unique(prob_sum)

In [None]:
pos_proba = y_pred_proba[:,1]
pos_proba

In [None]:
plt.hist(pos_proba)

In [None]:
mpl.rcParams['font.size'] = 12
plt.hist(pos_proba)
plt.xlabel('Predicted probability of positive class for testing data')
plt.ylabel('Number of samples')

In [None]:
pos_sample_pos_proba = pos_proba[y_test==1]
neg_sample_pos_proba = pos_proba[y_test==0]

In [None]:
plt.hist([pos_sample_pos_proba, neg_sample_pos_proba], histtype='barstacked')
plt.legend(['Positive samples', 'Negative samples'])
plt.xlabel('Predicted probability of positive class')
plt.ylabel('Number of samples')

# The Receiver Operating Characteristic (ROC) curve

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, pos_proba)

In [None]:
plt.plot(fpr, tpr, '*-')
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(['Logistic regression', 'Random chance'])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve')

In [None]:
thresholds

In [None]:
metrics.roc_auc_score(y_test, pos_proba)

# Activity 2: Performing Logistic Regression with a New Feature and Creating a Precision-Recall Curve

In [None]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
df['LIMIT_BAL'].values.reshape(-1,1), df['default payment next month'].values,
test_size=0.2, random_state=24)

In [None]:
example_lr.fit(X_train_2, y_train_2)

In [None]:
y_test_2_pred_proba = example_lr.predict_proba(X_test_2)

In [None]:
metrics.roc_auc_score(y_test_2, y_test_2_pred_proba[:,1])

In [None]:
fpr_2, tpr_2, thresholds_2 = metrics.roc_curve(y_test_2, y_test_2_pred_proba[:,1])

In [None]:
plt.plot(fpr_2, tpr_2, '*-')
plt.plot([0, 1], [0, 1], 'r--')
plt.legend(['Logistic regression', 'Random chance'])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve for logistic regression with LIMIT_BAL feature')

In [None]:
precision, recall, thresh_3 = \
metrics.precision_recall_curve(y_test_2, y_test_2_pred_proba[:,1])

In [None]:
plt.plot(recall, precision, '-x')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision and recall for the logistic regression with LIMIT_BAL')
plt.xlim([0, 1])
plt.ylim([0, 1])

In [None]:
metrics.auc(recall, precision)

In [None]:
y_train_2_pred_proba = example_lr.predict_proba(X_train_2)

In [None]:
metrics.roc_auc_score(y_train_2, y_train_2_pred_proba[:,1])