***
# ISOM3360 Data Mining for Business Analytics
## Group 23 Project Code - Credit Card Defaultee Analysis
### Part 2.2 - Logistic Regression Classifier
***

Name: LAM, Ho Chit  
ITSC: hclamao   
SID: 20607878 

Name: LEE, Ho Wan Owen  
ITSC: hwolee  
SID: 20604852

Name: LEE, Wai Chung  
ITSC: wcleeaj  
SID: 20702733

### Workflow of this notebook (TBC)

1. Explore features and characteristics of dataset
2. Drop columns of low data quality (e.g. large amounts of empty values)
3. Determine $k$ columns to keep in the dataset (feature selection)
4. Perform one-hot encoding
5. Split into training and testing sets
6. Perform data cleaning
   - Dealing with missing values
7. Perform data standardization / normalization
8. Export preprocessed data to .csv files at `./data_preprocessed/`

### Logistic Regression
We will use all of the training data (891 examples) to fit and evaluate the logistic regression model.

In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# import evaluation tools
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the cleaned training set, using the PassengerId column as the row index.
train = pd.read_csv("train_clean.csv", index_col="PassengerId")

In [None]:
# Predictor columns fed to the model, and the label column to predict.
features = [
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
    'Sex_male', 'Embarked_Q', 'Embarked_S',
]
target = ['Survived']

# `target` is a list, so `y` is a single-column DataFrame rather than a Series.
X = train[features]
y = train[target]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## ROC curve and decision threshold

In [None]:
from sklearn import preprocessing
# Fit the z-score scaler on the training rows only (X_train from the earlier
# split), so the held-out test rows do not leak into the mean / standard
# deviation used for standardization.
zscore_scaler = preprocessing.StandardScaler().fit(X_train)

# Standardize every row with the training statistics. Keeping X's index (rather
# than letting pandas assign a fresh 0..n-1 index) keeps X_lr label-aligned with y.
X_lr = pd.DataFrame(zscore_scaler.transform(X), columns=X.columns, index=X.index)

In [None]:
# Repeat the 80/20 split on the standardized features, with the same test_size
# and random_state as the split of the raw features. Note that y_train / y_test
# are reassigned here.
X_train_lr, X_test_lr, y_train, y_test = train_test_split(
    X_lr, y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# import LogisticRegression
from sklearn.linear_model import LogisticRegression

In [None]:
# Parameters for the logistic regression model: L1-penalized, fitted with liblinear.
# C (float, default=1.0) is the inverse of the regularization strength; it must be
# a positive float, and smaller values specify stronger regularization.
# random_state is pinned because the liblinear solver shuffles the data, so
# without a seed repeated runs can give slightly different fits.
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
logistic_model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42)

In [None]:
# import evaluation tools
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Flatten the single-column label frame into a 1-D NumPy array of labels.
y_array_train = y_train.to_numpy().ravel()

In [None]:
# 10-fold cross-validated accuracy of the logistic model on the training split.
# The standardized features (X_train_lr) are used instead of the raw X_train:
# the L1 penalty treats all coefficients alike, so unscaled columns would be
# penalized unevenly, and the grid search later in the notebook is also run on
# X_train_lr.
logistic_score = cross_val_score(logistic_model, X_train_lr, y_array_train, cv=10)
print(logistic_score)         # per-fold accuracy
print(logistic_score.mean())  # mean accuracy over the 10 folds

In [None]:
# Out-of-fold class predictions from 10-fold cross-validation: each training row
# is predicted by a model that was fitted without it.
# The standardized features (X_train_lr) are used so the L1-penalized model is
# evaluated on the same inputs as the grid search later in the notebook.
logistic_preds = cross_val_predict(logistic_model, X_train_lr, y_array_train, cv=10)

In [None]:
# Show the cross-validated class predictions.
print('This is prediction for logistic regression: {}'.format(logistic_preds))

In [None]:
# Accuracy (fraction of correct predictions, the default) and confusion matrix
# of the cross-validated predictions against the true training labels.
print(accuracy_score(y_array_train, logistic_preds))
print(confusion_matrix(y_array_train, logistic_preds))

In [None]:
# Choose 'predict_proba' (which gives class probabilities) for the method parameter.
# This is important because by default cross_val_predict uses the 'predict' method,
# which returns hard class labels.
# The standardized features (X_train_lr) are used so the L1-penalized model is
# evaluated on the same inputs as the grid search later in the notebook.
logistic_preds_prob = cross_val_predict(
    logistic_model, X_train_lr, y_array_train, cv=10, method="predict_proba"
)

In [None]:
# Preview the first rows only (one row per sample, one column per class)
# instead of dumping the whole probability array into the notebook output.
logistic_preds_prob[:10]

In [None]:
# Probability of the positive class (column 1). Reuse the out-of-fold
# probabilities already computed rather than repeating the 10-fold fit, which
# also guarantees both arrays come from the very same fitted models.
logistic_preds_prob_1 = logistic_preds_prob[:, 1]

In [None]:
# Preview the first few positive-class probabilities rather than printing all of them.
logistic_preds_prob_1[:10]

In [None]:
# Raise the decision threshold for class 1 to 0.6: a sample is labelled 1 only
# when its positive-class probability is strictly above 0.6, otherwise 0.
logistic_preds_prob_1_ = np.where(logistic_preds_prob_1 > 0.6, 1, 0).tolist()

In [None]:
# Confusion matrix and accuracy (fraction correct, the default) after applying
# the adjusted 0.6 threshold.
print(confusion_matrix(y_array_train, logistic_preds_prob_1_))
print(accuracy_score(y_array_train, logistic_preds_prob_1_))

In [None]:
# False positive rate and true positive rate of the logistic model at every
# candidate threshold. The 1-D label array is passed (not the single-column
# y_train DataFrame) to match the other metric calls in this notebook.
logistic_fpr, logistic_tpr, thresholds = roc_curve(y_array_train, logistic_preds_prob_1)

In [None]:
# Draw the ROC curve of the logistic model: TPR on the y-axis against FPR on the x-axis.
plt.figure(figsize=(12, 10))
plt.plot(logistic_fpr, logistic_tpr, label="logistic")
plt.gca().set(xlabel="False Positive Rate", ylabel="True Positive Rate")
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve, integrated from the (FPR, TPR) points.
logistic_auc = auc(logistic_fpr, logistic_tpr)
# One-row summary table; as the cell's last expression it is rendered as output.
pd.DataFrame([{"Model": "Logistic Regression", "AUC": logistic_auc}])

One can also use AUC as the performance measure of a grid search by including the parameter `scoring="roc_auc"`.

In [None]:
from sklearn.model_selection import GridSearchCV
# DecisionTreeClassifier is not imported in any cell shown above; without this
# import the cell raises NameError on a fresh kernel.
from sklearn.tree import DecisionTreeClassifier

# Search for the best tree complexity (min_samples_leaf) when optimizing for AUC.
grid = {"min_samples_leaf": [10, 25, 50, 100, 150, 200]}
# A fixed random_state keeps tie-breaking between equally good splits repeatable.
searcher = GridSearchCV(DecisionTreeClassifier(random_state=42), grid, cv=10, scoring="roc_auc")
# Fit on the 1-D label array; the single-column y_train DataFrame is a column
# vector, which scikit-learn estimators warn about.
searcher.fit(X_train, y_array_train)

# Report the best setting and its mean cross-validated AUC
print(f"Best tree min_samples_leaf: {searcher.best_params_}")
print(f"Best AUC: {searcher.best_score_}")

In [None]:
# Tune the regularization strength C of the logistic model for best AUC.
# Candidate values are 10**e for exponents e from -1 up to (but excluding) 2 in steps of 0.1.
l_grid = {"C": [10**exponent for exponent in np.arange(-1, 2, 0.1)]}
l_searcher = GridSearchCV(
    logistic_model,
    l_grid,
    cv=10,
    scoring="roc_auc",
)
l_searcher.fit(X_train_lr, y_array_train)

# Report the winning C and its mean cross-validated AUC
print("Best C for logsitic regression: {}".format(l_searcher.best_params_))
print("Best AUC: {}".format(l_searcher.best_score_))