The deadline for this homework is on **08.11.2023 08:59** (right before the practice session). After completing the exercises, you should

1. Download this file into your computer (`File` $\to$ `Download .ipynb`)

2. Name the file in the following way *HWx_NameSurname* (for example `HW5_NshanPotikyan.ipynb`)

4. Send the file to this email address `nshan.potikyan@gmail.com` with subject **ML5**

**Note**

* if you do not follow any of the above conditions, your homework will not be graded.

* you do not need to send any dataset files or helper scripts that I provide with your homework (since I already have them).

* you need to write the code for the exercises yourself; you can use ``built-in functions``, ``numpy``, ``pandas``, ``sklearn``
and ``matplotlib``.

**Problem.** During the practice session we experimented with the Gradient Boosting algorithm and evaluated its performance in terms of AUC/ROC.

* In this homework, you need to take the Greman Credit dataset that we worked with during the practice session but this time you need to

 * perform feature engineering (remove unnecessary features, combine several features into one etc.)
 * experiment with different classification methods (the ones that we have discussed so far including Logistic Regression) and compare their performance in terms of AUC
 * then select the best method according to the previous step and find the optimal value of the confidence threshold, so that the pre-defined cost for this dataset is minimal.

 Fix the random seed for algorithms that have random components (``random_seed=42``).

# Credit Dataset

In [74]:
import pandas as pd
import numpy as np
import time
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [75]:
data = fetch_openml(name='credit-g')

  warn(
  warn(


In [76]:
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [77]:
y1 = pd.Series(LabelEncoder().fit_transform(y))

In [78]:
X.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes


In [79]:
encoder = OneHotEncoder(sparse=False)
X_encoded = encoder.fit_transform(X)
X_encoded = pd.DataFrame(X_encoded, columns=encoder.get_feature_names_out(X.columns))



In [80]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [81]:
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled,
                                                    y1, test_size=0.3,
                                                    random_state=0)

X_val, X_test, y_val, y_test = train_test_split(X_temp,
                                                y_temp, test_size=0.5,
                                                random_state=0)

In [82]:
def cost(conf_matrix):
    fp = conf_matrix[0][1]
    fn = conf_matrix[1][0]
    return 5 * fp + fn

def best_threshold(y_probs, y_val):
    thresholds = np.unique(y_probs[:, 1])
    costs = []

    for threshold in thresholds:
        y_predictions = (y_probs[:, 1] > threshold).astype(int)
        conf_matrix = confusion_matrix(y_val, y_predictions)
        costs.append(cost(conf_matrix))

    min_cost_idx = np.argmin(costs)
    threshold_optimal = thresholds[min_cost_idx]

    y_predictions = (y_probs[:, 1] > threshold_optimal).astype(int)
    print("Conf matrix:", confusion_matrix(y_val, y_predictions))

    AUC = roc_auc_score(y_val, y_probs[:, 1])
    print("AUC: ", AUC)

    accuracy = accuracy_score(y_val, y_predictions)
    print("Accuracy: ", accuracy)

    return threshold_optimal

# Gradient Boosting

In [83]:
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_features': ['sqrt', 'log2']
    }


model = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_model = GradientBoostingClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

y_probs = best_model.predict_proba(X_val)
y_preds = best_model.predict(X_val)

threshold = best_threshold(y_probs, y_val)

Conf matrix: [[43  1]
 [82 24]]
AUC:  0.7006861063464837
Accuracy:  0.44666666666666666


# Logistic Regression

In [85]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1],
    'solver' : ['sag', 'saga', 'newton-cg', 'liblinear']
    }

model = LogisticRegression(max_iter=1000, random_state=42)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_model = LogisticRegression(**best_params, max_iter=1000)
best_model.fit(X_train, y_train)

y_probs = best_model.predict_proba(X_val)
y_preds = best_model.predict(X_val)

threshold = best_threshold(y_probs, y_val)

Conf matrix: [[38  6]
 [51 55]]
AUC:  0.7450686106346484
Accuracy:  0.62


# SVC

In [86]:
param_grid = {
    'C': [0.001, 0.1, 0.5, 1],
    'kernel': ['linear', 'poly', 'sigmoid'],
}

model = SVC(probability=True)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_model = SVC(**best_params, random_state=42, probability=True)
best_model.fit(X_train, y_train)

y_probs = best_model.predict_proba(X_val)
y_preds = best_model.predict(X_val)

threshold = best_threshold(y_probs, y_val)

Conf matrix: [[37  7]
 [48 58]]
AUC:  0.7347770154373927
Accuracy:  0.6333333333333333


# Random Forest

In [87]:
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion' : ['entropy', 'gini'],
    'max_features' : [None, 'sqrt', 'log2']
}

model = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

best_model = RandomForestClassifier(**best_params)
best_model.fit(X_train, y_train)

y_probs = best_model.predict_proba(X_val)
y_preds = best_model.predict(X_val)

threshold = best_threshold(y_probs, y_val)

Conf matrix: [[43  1]
 [77 29]]
AUC:  0.7210548885077187
Accuracy:  0.48
