In [None]:
!pip install optuna

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
import optuna
import warnings
from sklearn.datasets import load_wine
warnings.filterwarnings('ignore')

df = pd.read_csv('./loan_data.csv')
df

Unnamed: 0,ID,Age,Experience,ZIP Code,Family,Education,Mortgage,Securities Account,CD Account,Online,CreditCard,Personal Loan
0,3510,38,12,91330,3,3,0,0,0,0,0,0
1,1129,30,5,94025,2,2,0,0,0,0,0,1
2,1637,65,39,92122,4,3,0,0,0,0,1,0
3,3165,28,4,95136,4,1,0,0,0,1,1,0
4,3563,32,8,94596,1,3,272,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3598,56,26,92028,3,3,0,0,0,1,0,0
3996,4671,52,26,94305,1,1,0,0,0,1,0,0
3997,989,63,39,94998,1,2,100,0,0,0,0,0
3998,2038,35,8,95616,2,2,0,0,0,0,1,0


In [3]:
df1 = df.copy()
df1.drop(['ID','ZIP Code'], axis = 1, inplace = True)
df1

Unnamed: 0,Age,Experience,Family,Education,Mortgage,Securities Account,CD Account,Online,CreditCard,Personal Loan
0,38,12,3,3,0,0,0,0,0,0
1,30,5,2,2,0,0,0,0,0,1
2,65,39,4,3,0,0,0,0,1,0
3,28,4,4,1,0,0,0,1,1,0
4,32,8,1,3,272,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
3995,56,26,3,3,0,0,0,1,0,0
3996,52,26,1,1,0,0,0,1,0,0
3997,63,39,1,2,100,0,0,0,0,0
3998,35,8,2,2,0,0,0,0,1,0


In [4]:
y_target = df1['Personal Loan']
X_features = df1.drop(['Personal Loan'],axis=1,inplace=False)

X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2800, 9), (1200, 9), (2800,), (1200,))

In [6]:
from sklearn.model_selection import cross_val_score

def objective1(trial):
    tree_1 = trial.suggest_int('max_depth', 1, 30)
    tree_2 = trial.suggest_int('min_samples_leaf', 1, 30)
    tree_3 = trial.suggest_int('min_samples_split', 1, 30)
    #tree_4= trial.suggest_int('max_features', 10, 10)
    tree_5 = trial.suggest_int('max_leaf_nodes', 1, 50)

    classifier_obj = DecisionTreeClassifier(
        max_depth= tree_1, 
        min_samples_leaf= tree_2,
        min_samples_split= tree_3,
        #max_features= tree_4,
        max_leaf_nodes= tree_5,
        random_state=42
    )

    score = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    accuracy = score.mean()
    return accuracy

tree_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
tree_study.optimize(objective1, n_trials=50) 

print("Best score:", tree_study.best_value)
print("Best parameters:", tree_study.best_params)

model1 = DecisionTreeClassifier(**tree_study.best_trial.params)
model1.fit(X_train, y_train)

ROC_AUC = roc_auc_score(y_test, model1.predict_proba(X_test)[:,1])
print('ROC_AUC_score :', ROC_AUC)

[32m[I 2023-04-07 22:23:29,956][0m A new study created in memory with name: no-name-8be71bd6-c8de-40d9-a8d4-53f72bde7b2d[0m
[32m[I 2023-04-07 22:23:32,367][0m Trial 0 finished with value: 0.7687536604425749 and parameters: {'max_depth': 17, 'min_samples_leaf': 9, 'min_samples_split': 13, 'max_leaf_nodes': 43}. Best is trial 0 with value: 0.7687536604425749.[0m
[32m[I 2023-04-07 22:23:33,330][0m Trial 1 finished with value: 0.6231070607060705 and parameters: {'max_depth': 1, 'min_samples_leaf': 4, 'min_samples_split': 21, 'max_leaf_nodes': 42}. Best is trial 0 with value: 0.7687536604425749.[0m
[32m[I 2023-04-07 22:23:34,157][0m Trial 2 finished with value: 0.7712768275552044 and parameters: {'max_depth': 5, 'min_samples_leaf': 18, 'min_samples_split': 27, 'max_leaf_nodes': 11}. Best is trial 2 with value: 0.7712768275552044.[0m
[32m[I 2023-04-07 22:23:34,173][0m Trial 3 finished with value: 0.7668668517361941 and parameters: {'max_depth': 6, 'min_samples_leaf': 4, 'min_sa

[32m[I 2023-04-07 22:23:34,832][0m Trial 34 finished with value: 0.7749139809389103 and parameters: {'max_depth': 26, 'min_samples_leaf': 15, 'min_samples_split': 17, 'max_leaf_nodes': 31}. Best is trial 20 with value: 0.7835522284371295.[0m
[32m[I 2023-04-07 22:23:34,855][0m Trial 35 finished with value: 0.7844908411759544 and parameters: {'max_depth': 18, 'min_samples_leaf': 11, 'min_samples_split': 20, 'max_leaf_nodes': 18}. Best is trial 35 with value: 0.7844908411759544.[0m
[32m[I 2023-04-07 22:23:34,879][0m Trial 36 finished with value: 0.7624629612195912 and parameters: {'max_depth': 18, 'min_samples_leaf': 8, 'min_samples_split': 20, 'max_leaf_nodes': 14}. Best is trial 35 with value: 0.7844908411759544.[0m
[32m[I 2023-04-07 22:23:34,902][0m Trial 37 finished with value: 0.762439696137981 and parameters: {'max_depth': 13, 'min_samples_leaf': 10, 'min_samples_split': 23, 'max_leaf_nodes': 11}. Best is trial 35 with value: 0.7844908411759544.[0m
[32m[I 2023-04-07 22:

Best score: 0.7869871055983149
Best parameters: {'max_depth': 24, 'min_samples_leaf': 10, 'min_samples_split': 11, 'max_leaf_nodes': 28}
ROC_AUC_score : 0.8274233482566814


In [None]:
def objective2(trial):
    knn_n_neighbors= trial.suggest_int('n_neighbors', 1, 300)
    knn_weights= trial.suggest_categorical('weights', ['uniform','distance'])
    knn_algorithm= trial.suggest_categorical('algorithm', ['auto','ball_tree', 'kd_tree', 'brute'])
    knn_p= trial.suggest_int('p', 1, 2)
    knn_leaf_size= trial.suggest_int('leaf_size', 3, 5)
    
    classifier_obj= KNeighborsClassifier(
        n_neighbors= knn_n_neighbors, 
        weights= knn_weights,
        algorithm= knn_algorithm,
        p= knn_p,
        leaf_size= knn_leaf_size
    )
    
    score = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    accuracy = score.mean()
    return accuracy

knn_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
knn_study.optimize(objective2, n_trials=12) 

print("Best score:", knn_study.best_value)
print("Best parameters:", knn_study.best_params)

model2 = KNeighborsClassifier(**knn_study.best_params)
model2.fit(X_train, y_train)

ROC_AUC = roc_auc_score(y_test, model2.predict_proba(X_test)[:,1])
print('ROC_AUC_score :', ROC_AUC)

In [None]:
def objective3(trial):
    logit_penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none'])
    logit_C = trial.suggest_float('C', 0.01, 105)
    logit_class_weight = trial.suggest_categorical('class_weight', ['balanced', 'None'])
    logit_solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
    
    classifier_obj = LogisticRegression(
        penalty = logit_penalty, 
        C = logit_C,
        class_weight = logit_class_weight,
        solver = logit_solver
    )

    score = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
    accuracy = score.mean()
    return accuracy

logit_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
logit_study.optimize(objective3, n_trials=12)

print("Best score:", logit_study.best_value)
print("Best parameters:", logit_study.best_params)

model3 = LogisticRegression(**logit_study.best_params)
model3.fit(X_train, y_train)

ROC_AUC = roc_auc_score(y_test, model3.predict_proba(X_test)[:,1])
print('ROC_AUC_score :', ROC_AUC)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8,6))

for i in {model1, model2, model3}:
    RocCurveDisplay.from_estimator(i, X_test, y_test, ax=ax)

ax.set_title('ROC curve')
plt.show()