In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier


: 

In [None]:
# Load the whitespace-separated German credit data.
# Raw strings keep the Windows path and the regex separator free of invalid
# escape sequences (\A, \T, \g, \s) while leaving their values unchanged.
# header=None: the file appears to carry no header row (column names are
# assigned in a later cell), so the first record must stay in the data
# instead of being consumed as column labels.
df = pd.read_csv(
    r'D:\Artificial Intelligence\AAI-501\Team Project AAI 501\german.data',
    sep=r'\s+',
    header=None,
)
df.head(2)

: 

In [None]:
# Labels for the 20 attributes plus the target, in file column order.
column_names = [
    'checking_account',
    'duration_month',
    'credit_history',
    'credit_purpose',
    'credit_amount',
    'savings_account',
    'present_employment',
    'disposable_income_percent',
    'status_sex',
    'debtors',
    'residence_since',
    'property',
    'age',
    'other_installments',
    'housing',
    'credits_at_current_bank',
    'job',
    'dependants',
    'telephone',
    'foreign_worker',
    'class',
]

: 

In [None]:
# Replace the column labels with the descriptive names, then preview two rows.
df.columns = column_names
df.head(2)

: 

In [None]:
# Column dtypes and non-null counts.
df.info()

: 

In [None]:
# Summary statistics of the numeric columns.
df.describe()

: 

In [None]:
# For every int64 column draw a class-split histogram next to a boxplot.
int_columns = [name for name in df.columns if df[name].dtype == 'int64']

for feature in int_columns:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(17, 3))

    sns.histplot(data=df, x=feature, bins=10, ax=hist_ax, hue='class')
    hist_ax.set_title(f'{feature} Histogram')

    sns.boxplot(data=df, x=feature, ax=box_ax)
    box_ax.set_title(f'{feature} Boxplot')

    plt.tight_layout()
    plt.show();

: 

In [None]:
# Bar chart of category frequencies for every object-typed column, two per row.
object_columns = df.select_dtypes(include='object').columns

# Ceiling division: an odd number of columns still gets a final row
num_rows = (len(object_columns) + 1) // 2

fig, axes = plt.subplots(num_rows, 2, figsize=(12, 3*num_rows))
axes = axes.flatten()

for panel, col in zip(axes, object_columns):
    df[col].value_counts().plot(kind='bar', ax=panel)
    panel.set_title(col)

# Hide any panel left without a column
for spare in axes[len(object_columns):]:
    spare.axis('off')

plt.tight_layout()
plt.show()

: 

In [None]:
# Correlation heatmap of the numeric features.
# numeric_only=True skips the categorical (object) columns, which would make
# DataFrame.corr raise on pandas >= 2.0.
corr_matr = df.corr(numeric_only=True)
plt.figure(figsize=(8,8))
sns.heatmap(corr_matr, cmap='cool', annot=True)
plt.title('Correlation Between Numeric Features')
plt.show();

: 

### Analyzing Good vs Bad Credit

In [None]:
# Split the applicants by target label: class 2 -> bad_credit, class 1 -> good_credit.
bad_credit = df.loc[df['class'].eq(2)]
good_credit = df.loc[df['class'].eq(1)]

: 

In [None]:
# Mean of each numeric feature per number of credits, bad-credit applicants only.
# numeric_only=True: the object columns cannot be averaged and raise on pandas >= 2.0.
bad_credit.groupby('credits_at_current_bank').mean(numeric_only=True)

: 

In [None]:
# Mean of each numeric feature per number of credits, good-credit applicants only.
# numeric_only=True: the object columns cannot be averaged and raise on pandas >= 2.0.
good_credit.groupby('credits_at_current_bank').mean(numeric_only=True)

: 

***Observations:***
- The **average credit amount** of people with bad credit tends to be higher in general than that of people with good credit. In other words, people with bad credit have more accounts open and ask for more money.
- The **average age** of people with 4 or more credit accounts at the current bank is 53 years old with good credit and 34.5 years old with bad credit 
- We can drop the dependants and residence_since. These features are very similar between people with good and bad credit. 


In [None]:
# Remove the two features judged uninformative in the observations above.
df = df.drop(labels=['dependants', 'residence_since'], axis='columns')

: 

**Observation:**

I am dropping `dependants` and `residence_since` because they show no strong correlation with the other variables.

**Savings account/bonds**
 
 A61 :          ... <  100 DM
 
 A62 :   100 <= ... <  500 DM
 
 A63 :   500 <= ... < 1000 DM
 
 A64 :          .. >= 1000 DM

A65 :   unknown/ no savings account

**Status of existing checking account**

A11 :      ... <    0 DM

A12 : 0 <= ... <  200 DM

A13 :      ... >= 200 DM /

salary assignments for at least 1 year

A14 : no checking account

In [None]:
# Category counts for the savings account, then the checking account;
# good-credit applicants are printed before bad-credit applicants each time.
for account in ('savings_account', 'checking_account'):
    print(f'Good credit:\n{good_credit[account].value_counts()}')
    print(f'Bad Credit:\n{bad_credit[account].value_counts()}')

: 

***Observations:***
- The majority of people with good credit do not have a checking account at the current bank, while people with bad credit have more accounts open but hold less than 200 DM.
- Among the people who do have an account open, those with good credit have more than 200 DM in their account.

***Maybe We should consider using one-hot encoding on checking and savings account***

In [None]:
# Applicants per credit purpose, split by class.
ax = sns.countplot(data=df, x='credit_purpose', hue='class')
ax.set_title('Credit purpose per class')
plt.show();

: 

Attribute 4:  (qualitative)

Purpose

A40 : car (new)

A41 : car (used)

A42 : furniture/equipment

A43 : radio/television

A44 : domestic appliances

A45 : repairs

A46 : education

A47 : (vacation - does not exist?)

A48 : retraining

A49 : business

A410 : others


***Observations:***
- Class 1(good) use their credit for radio and televisions, cars and furniture/equipment, while people with bad credit tend to use their credit mostly for purchasing a car

***We should use one-hot encoding for credit purpose***

In [None]:
# Start the encoded frame: one-hot encode credit_purpose from the cleaned df.
df_encoded = pd.get_dummies(df, columns=['credit_purpose'], prefix= 'credit_purpose')


: 

In [None]:
# Credit amount per foreign-worker status (seaborn's default bar estimate), split by class.
ax = sns.barplot(data=df, x='foreign_worker', y='credit_amount', hue='class')
ax.set_title('Foreign Workers vs Credit Amount Requested')
plt.show();

: 

In [None]:
# Number of applicants per foreign-worker status, split by class.
ax = sns.countplot(x=df['foreign_worker'], hue=df['class'])
ax.set_title('Population of Foreign Workers')
plt.show();

: 

In [None]:
# Total number of rows with a foreign_worker value, then each category's share.
print(df['foreign_worker'].value_counts().sum())
print('---------------------------------')
# normalize=True divides by the actual row count instead of a hard-coded 999
print(df['foreign_worker'].value_counts(normalize=True))

: 

foreign worker:

A201 : yes

A202 : no

***Observations:***
- Foreign workers tend to ask for higher amounts of credit and make up a population with a higher share of bad credit.

- **Since foreign workers tend to have bad credit, and because they make up 96% of our dataset, we should encode this feature as well.**

In [None]:
# One-hot encode foreign_worker. drop_first=True omits the first category's
# indicator column to avoid multicollinearity between the dummy columns.
df_encoded = pd.get_dummies(df_encoded, columns=['foreign_worker'], drop_first=True, prefix='Foreign') 

: 

In [None]:
# Applicants per credit-history category, split by class.
ax = sns.countplot(data=df, x='credit_history', hue='class')
ax.set_title('Credit History')
plt.show();

: 

Credit history


A30 : no credits taken/

	all credits paid back duly

          

A31 : all credits at this bank paid back duly


A32 : existing credits paid back duly till now

          

A33 : delay in paying off in the past

      

A34 : critical account/
		other credits existing (not at this bank)

***Observations:***
- Although the majority of people with good and bad credit tend to pay off all their debt. There is a significant amount of people with bad debt that have accounts classified as critical in other banks. This could mean that they are looking to get more credit with bad history in other banks


In [None]:
# One-hot encode the credit_history categories.
df_encoded = pd.get_dummies(df_encoded, columns=['credit_history'], prefix='credit_history')

: 

In [None]:
# Applicants per debtor/guarantor category, split by class.
ax = sns.countplot(x='debtors', hue='class', data=df)
ax.set_title('Guarantors/Debtors')
plt.show()

: 

Other debtors / guarantors

	      A101 : none

	      A102 : co-applicant

	      A103 : guarantor

***We can eliminate this column***

In [None]:
# debtors is excluded from the model features.
df_encoded = df_encoded.drop(labels=['debtors'], axis='columns')

: 

In [None]:
# One count plot per account feature, each split by class and shown separately.
account_plots = (
    ('savings_account', 'Savings Account Categories'),
    ('checking_account', 'Checking Account Categories'),
)
for column, heading in account_plots:
    sns.countplot(data=df, x=df[column], hue='class')
    plt.title(heading)
    plt.show();

: 

		checking account
		
        A11 :      ... <    0 DM

	    A12 : 0 <= ... <  200 DM

	    A13 :      ... >= 200 DM /
		salary assignments for at least 1 year
		
        A14 : no checking account

  Savings account/bonds

	      A61 :          ... <  100 DM

	      A62 :   100 <= ... <  500 DM

	      A63 :   500 <= ... < 1000 DM

	      A64 :          .. >= 1000 DM
		  
              A65 :   unknown/ no savings account

**We can include checking and savings account feature since it is important for financial institutions to evaluate how much available capital they have**

In [None]:
# One-hot encode both account features in a single pass; the checking dummies
# are appended first and the savings dummies second, as with two separate calls.
account_features = ['checking_account', 'savings_account']
df_encoded = pd.get_dummies(df_encoded, columns=account_features, prefix=account_features)

: 

***Dropping telephone since it is not a determinant factor for credit risk worthiness***

In [None]:
# telephone is excluded from the model features.
df_encoded = df_encoded.drop(labels=['telephone'], axis='columns')

: 

In [None]:
# Number of good (class 1) and bad (class 2) applicants per count of credits
# at this bank. Summing the class codes would blend the two labels into one
# total that is not a count of either group, so tabulate them instead.
pd.crosstab(df['credits_at_current_bank'], df['class'])

: 

***Double of the amount of people with good credit have only one account open at the current bank in comparison to people with bad credit***

In [None]:
# One-hot encode the number of credits held at this bank.
df_encoded = pd.get_dummies(df_encoded, columns=['credits_at_current_bank'], prefix='current_bank_credit') 

: 

In [None]:
# Applicants per housing category, split by class.
ax = sns.countplot(x='housing', hue='class', data=df_encoded)
ax.set_title('Class Housing')
plt.show();

: 

	      Housing
	      A151 : rent
	      A152 : own
	      A153 : for free

***Owning a home is an important factor for credit approval, and homeowners usually need to have 'decent' credit to own a home***

In [None]:
# One-hot encode the housing categories.
df_encoded = pd.get_dummies(df_encoded, columns=['housing'], prefix='housing') 

: 

In [None]:
# Applicants per job category, split by class.
ax = sns.countplot(x='job', hue='class', data=df_encoded)
ax.set_title('Job Category per Class')
plt.show();


: 

   			Job
	      A171 : unemployed/ unskilled  - non-resident
	      A172 : unskilled - resident
	      A173 : skilled employee / official
	      A174 : management/ self-employed/
		     highly qualified employee/ officer

***The distribution for each class and employment seems proportional, we can omit this category***

In [None]:
# One-hot encode the job categories.
# NOTE(review): the markdown cell just above says this category can be omitted,
# yet the feature is kept and encoded here — confirm which one is intended.
df_encoded = pd.get_dummies(df_encoded, columns=['job'], prefix='job') 

: 

In [None]:
# Applicants per property category, split by class.
ax = sns.countplot(x='property', hue='class', data=df_encoded)
ax.set_title('Count of Properties per class')
plt.show();

: 

Property

A121 : real estate

A122 : if not A121 : building society savings agreement/life insurance

A123 : if not A121/A122 : car or other, not in attribute 6

A124 : unknown / no property

***Encoding this feature because banks tend to look at collaterals when applying for credit lines***

In [None]:
# One-hot encode the property (collateral) categories.
df_encoded = pd.get_dummies(df_encoded, columns=['property'], prefix='property') 

: 

In [None]:
# Applicants per personal-status/sex category, split by class.
sns.countplot(data=df_encoded, x='status_sex', hue='class')
# Title typo fixed ("Se/Marital" -> "Sex/Marital")
plt.title('Sex/Marital Status per Class')
plt.show();

: 

	      Personal status and sex
	      A91 : male   : divorced/separated
	      A92 : female : divorced/separated/married
              A93 : male   : single
	      A94 : male   : married/widowed
	      A95 : female : single

***Encoding this feature because there is a visible difference in good/bad credit between males and females***

In [None]:
# One-hot encode the personal-status/sex categories.
df_encoded = pd.get_dummies(df_encoded, columns=['status_sex'], prefix='status_sex') 

: 

In [None]:
# Applicants per employment-duration category, split by class.
# Title and plt.show() added so the figure matches the other plot cells and
# the Axes repr is not echoed as cell output.
sns.countplot(data=df_encoded, x='present_employment', hue='class')
plt.title('Present Employment per Class')
plt.show();

: 

      A71 : unemployed
	      A72 :       ... < 1 year
	      A73 : 1  <= ... < 4 years  
	      A74 : 4  <= ... < 7 years
	      A75 :       .. >= 7 years

***Encoding this feature because a stable working history is important; it could determine your eligibility for a credit line (domain knowledge)***

In [None]:
# One-hot encode the employment-duration categories.
# Note: the prefix contains a space, so the resulting column names do as well.
df_encoded = pd.get_dummies(df_encoded, columns=['present_employment'], prefix='years_in employment') 

: 

In [None]:
# Applicants per other-installment-plan category, split by class.
# Title and plt.show() added so the figure matches the other plot cells and
# the Axes repr is not echoed as cell output.
sns.countplot(data=df_encoded, x='other_installments', hue='class')
plt.title('Other Installment Plans per Class')
plt.show();

: 

     Other installment plans 
	      A141 : bank
	      A142 : stores
	      A143 : none

***Having other installments does not seem to affect whether you have good or bad credit, since the population results look proportional***

In [None]:
# other_installments is dropped rather than one-hot encoded: the plot above
# shows similar proportions of good and bad credit across its categories.
df_encoded = df_encoded.drop(columns='other_installments', axis= 1)

: 

In [None]:
# Show every column when displaying frames (this is a global pandas option
# and stays in effect for the rest of the session).
pd.set_option('display.max_columns', None)

# Preview the first row of the fully encoded frame.
df_encoded.head(1)

: 

## Machine Learning

In [None]:
# Recode the target: 1 = good credit, 0 = bad credit (originally coded 2).
mapping_dict = {1: 1, 2: 0}
# replace() leaves values outside the mapping untouched, so re-running this cell
# is harmless; map() would turn the already-recoded 0s into NaN on a second run.
df_encoded['class'] = df_encoded['class'].replace(mapping_dict)
assert df_encoded['class'].isin([0, 1]).all(), 'unexpected class label'

# Separating features from target
x = df_encoded.drop(columns='class')
y = df_encoded['class']

: 

## Naive Bayes Algorithm With Standardization

In [None]:
# Gaussian Naive Bayes on standardized features: 60/40 split, 10-fold grid
# search over var_smoothing, then hold-out evaluation.

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=42)

# Standardizing: the scaler is fit on the training split only and reused on the test split
scaler = StandardScaler()
xtrain_scale = scaler.fit_transform(x_train)
xtest = scaler.transform(x_test)

# Hyperparameter Tuning
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(xtrain_scale, y_train)

# GridSearchCV refits the best estimator on the whole training split by default
# (refit=True), so a second fit is not needed.
n_b_best = grid_search.best_estimator_
predictions = n_b_best.predict(xtest)

# Performance
scores = n_b_best.score(xtest, y_test)
conf_mtrx = confusion_matrix(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mtrx)
# Row 0 holds the actual-negative (class 0) samples: [TN, FP]
false_positive_rate = conf_mtrx[0][1] / (conf_mtrx[0][1] + conf_mtrx[0][0])

# ROC curve from the predicted probability of the positive class (1)
y_pred_proba = n_b_best.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

# Results
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve(Standardized) (area = {round(roc_auc,3)})')
plt.fill_between(fpr, tpr, color='gray', alpha=0.3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show();
print('Confusion Matrix')
disp.plot(cmap='Blues', include_values=True)
plt.show();

print('---------------------------')
print('Performance Measures')
print('---------------------------')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {scores}')
print('-----------------------------')
print('Hyperparameter Tuning Results')
print('----------------------------')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'False Positive rate: {false_positive_rate}')

: 

***Observation***

- The precision of this model is very good but a bit "off-balance" with the recall score. We are trying to minimize the false positives, and this model tends to do that as well. The problem may arise from the imbalanced data.

### Simulating Best Score

In [None]:
# Repeat the split / scale / tune / score cycle on 200 different splits to see
# how stable the accuracy is. Each split is seeded with its iteration index so
# that re-running the cell reproduces the same distribution.
n_b_train_score = []
n_b_test_score = []
diff = []

# The search space does not change between iterations
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

for i in range(200):

    # Splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=i)

    # Standardizing
    scaler = StandardScaler()
    xtrain_scale = scaler.fit_transform(x_train)
    xtest = scaler.transform(x_test)

    # Hyperparameter Tuning
    grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(xtrain_scale, y_train)

    # best_estimator_ is already refit on the whole training split (refit=True)
    n_b_best = grid_search.best_estimator_

    # Performance
    train_score = n_b_best.score(xtrain_scale, y_train)
    test_score = n_b_best.score(xtest, y_test)
    n_b_train_score.append(train_score)
    n_b_test_score.append(test_score)
    diff.append(train_score - test_score)


sns.histplot(data=n_b_test_score, color='red', label='Test')
sns.histplot(data=n_b_train_score, color='blue', label='Training')
plt.title('Naive-Bayes Scores Distribution(Standardized)')
plt.legend()
plt.show();
print(f'Mean Train Score: {np.mean(n_b_train_score)}')
print(f'Mean Test Score: {np.mean(n_b_test_score)}')
print(f'Mean Diff score: {np.mean(diff)}')


: 

In [None]:
# Distribution of (train accuracy - test accuracy) over the simulated splits.
ax = sns.histplot(data=diff, color='red')
ax.set_title('Training and Test Score Difference')
plt.show();


: 

***Observation***
- From the simulations we can see that our score has a very large spread and is skewed to the left. The mean difference of our scores is very close to zero, indicating that our model is performing well, but, as mentioned before, it could be that the model is not learning well due to the imbalanced dataset. The spread is quite high as well, and a negative score difference means that the model is performing better on the test set than on the training set, which is preferable.

## Naive Bayes Algorithm with Balanced Target and Standardized

In [None]:
# Gaussian Naive Bayes on a SMOTE-balanced, standardized training split:
# 60/40 split, 10-fold grid search over var_smoothing, hold-out evaluation.

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=42)

# Balancing: only the training split is oversampled. SMOTE is seeded like the
# split so the whole cell is reproducible.
smote = SMOTE(random_state=42)
x_train_, y_train_ = smote.fit_resample(x_train, y_train)

# Standardizing: fit on the resampled training data, reuse on the untouched test split
scaler = StandardScaler()
xtrain_scale = scaler.fit_transform(x_train_)
xtest = scaler.transform(x_test)

# Hyperparameter Tuning
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(xtrain_scale, y_train_)

# GridSearchCV refits the best estimator on the whole training data by default
# (refit=True), so a second fit is not needed.
n_b_best = grid_search.best_estimator_
predictions = n_b_best.predict(xtest)

# Performance
scores = n_b_best.score(xtest, y_test)
conf_mtrx = confusion_matrix(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mtrx)
# Row 0 holds the actual-negative (class 0) samples: [TN, FP]
false_positive_rate = conf_mtrx[0][1] / (conf_mtrx[0][1] + conf_mtrx[0][0])

# ROC curve from the predicted probability of the positive class (1)
y_pred_proba = n_b_best.predict_proba(xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

# Results
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve(Standardized) (area = {round(roc_auc,3)})')
plt.fill_between(fpr, tpr, color='gray', alpha=0.3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show();
print('Confusion Matrix')
disp.plot(cmap='Blues', include_values=True)
plt.show();

print('---------------------------')
print('Performance Measures')
print('---------------------------')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {scores}')
print('-----------------------------')
print('Hyperparameter Tuning Results')
print('----------------------------')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'False Positive rate: {false_positive_rate}')

: 

***Observations***

- Precision has dropped but recall has increased; this means that our model captures a large proportion of true positives while also making correct predictions. Our False Positive Rate has increased, but it is balanced with the recall score. The AUC is near the 50/50 threshold, which could potentially cause issues if our model is slightly "guessing" the predictions.

### Simulating Best Score

In [None]:
# Repeat the split / balance / scale / tune / score cycle on 200 different
# splits. The split and SMOTE are seeded with the iteration index so that
# re-running the cell reproduces the same distribution.
n_b_train_score = []
n_b_test_score = []
diff = []

# The search space does not change between iterations
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

for i in range(200):

    # Splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=i)

    # Balancing (training split only)
    smote = SMOTE(random_state=i)
    x_train_, y_train_ = smote.fit_resample(x_train, y_train)

    # Standardizing
    scaler = StandardScaler()
    xtrain_scale = scaler.fit_transform(x_train_)
    xtest = scaler.transform(x_test)

    # Hyperparameter Tuning
    grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(xtrain_scale, y_train_)

    # best_estimator_ is already refit on the whole training data (refit=True)
    n_b_best = grid_search.best_estimator_

    # Performance (train score is measured on the resampled training data)
    train_score = n_b_best.score(xtrain_scale, y_train_)
    test_score = n_b_best.score(xtest, y_test)
    n_b_train_score.append(train_score)
    n_b_test_score.append(test_score)
    diff.append(train_score - test_score)


sns.histplot(data=n_b_test_score, color='red', label='Test')
sns.histplot(data=n_b_train_score, color='blue', label='Training')
plt.title('Naive-Bayes Score Distribution(Balanced/Standardized)')
plt.legend()
# Vertical line marks the mean test accuracy
plt.axvline(np.mean(n_b_test_score))
plt.show();

print(f'Mean Train Score: {np.mean(n_b_train_score)}')
print(f'Mean Test Score: {np.mean(n_b_test_score)}')
print(f'Mean Diff score: {np.mean(diff)}')


: 

In [None]:
# Distribution of (train accuracy - test accuracy) over the simulated splits.
ax = sns.histplot(data=diff, color='coral')
ax.set_title('Training and Test Score Difference')
plt.show();

: 

***Observation***
- In comparison to the previous simulation, this model is doing better on the training data, which could also be a sign of overfitting. 

## Without modifications to the dataset

In [None]:
# Gaussian Naive Bayes on the raw encoded features (no scaling, no balancing):
# 60/40 split, 10-fold grid search over var_smoothing, hold-out evaluation.

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=42)

# Hyperparameter Tuning
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(x_train, y_train)

# GridSearchCV refits the best estimator on the whole training split by default
# (refit=True), so a second fit is not needed.
n_b_best = grid_search.best_estimator_
predictions_ = n_b_best.predict(x_test)

# Performance
scores = n_b_best.score(x_test, y_test)
conf_mtrx = confusion_matrix(y_test, predictions_)
precision = precision_score(y_test, predictions_)
recall = recall_score(y_test, predictions_)
f1 = f1_score(y_test, predictions_)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mtrx)
# Row 0 holds the actual-negative (class 0) samples: [TN, FP]
false_positive_rate = conf_mtrx[0][1] / (conf_mtrx[0][1] + conf_mtrx[0][0])

# Results
print('Confusion Matrix')
disp.plot(cmap='Blues', include_values=True)
plt.show();

# ROC curve from the predicted probability of the positive class (1)
y_pred_proba = n_b_best.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve(Conventional) (area = {round(roc_auc,3)})')
plt.fill_between(fpr, tpr, color='gray', alpha=0.3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show();

print('---------------------------')
print('Performance Measures')
print('---------------------------')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {scores}')
print('-----------------------------')
print('Hyperparameter Tuning Results')
print('----------------------------')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'False Positive rate: {false_positive_rate}')

: 

***Observations***

- We got higher results by leaving the model intact, with no standardization and no dataset balancing. Our AUC is close to .80, a value that can be categorized as a good-performing model. The F1 score (the balance of precision and recall) is above .80, which means that our model is good at predicting a high number of true positives without sacrificing its accuracy. Overall, this model can be a candidate.

### Simulating Best Score

In [None]:
# Repeat the split / tune / score cycle on 200 different splits using the raw
# encoded features. Each split is seeded with its iteration index so that
# re-running the cell reproduces the same distribution.
n_b_train_score = []
n_b_test_score = []
diff = []

# The search space does not change between iterations
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

for i in range(200):

    # Splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=i)

    # Hyperparameter Tuning
    grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(x_train, y_train)

    # best_estimator_ is already refit on the whole training split (refit=True)
    n_b_best = grid_search.best_estimator_

    # Performance
    train_score = n_b_best.score(x_train, y_train)
    test_score = n_b_best.score(x_test, y_test)
    n_b_train_score.append(train_score)
    n_b_test_score.append(test_score)
    diff.append(train_score - test_score)


sns.histplot(data=n_b_test_score, bins=15, label='Test Score')
sns.histplot(data=n_b_train_score, bins=15, color='coral', label='Training')
plt.title('Naive-Bayes Score Distribution(No Balanced/Standardized)')
plt.legend()
# Vertical line marks the mean test accuracy
plt.axvline(np.mean(n_b_test_score))
plt.show();

print(f'Mean Train Score: {np.mean(n_b_train_score)}')
print(f'Mean Test Score: {np.mean(n_b_test_score)}')
print(f'Mean Diff score: {np.mean(diff)}')


: 

In [None]:
# Distribution of (train accuracy - test accuracy) over the simulated splits.
ax = sns.histplot(data=diff, color='coral')
ax.set_title('Training and Test Score Difference')
plt.show();

: 

The difference in scores has a range with negative numbers, again, it is an observation that our models is performing better on our test set rather than our training set. 

## With Balanced Target And No Standardization

In [None]:

# Gaussian Naive Bayes on a SMOTE-balanced training split without scaling:
# 60/40 split, 10-fold grid search over var_smoothing, hold-out evaluation.

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=42)

# Balancing: only the training split is oversampled. SMOTE is seeded like the
# split so the whole cell is reproducible.
smote = SMOTE(random_state=42)
x_train_, y_train_ = smote.fit_resample(x_train, y_train)

# Hyperparameter Tuning
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}
grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
grid_search.fit(x_train_, y_train_)

# GridSearchCV refits the best estimator on the whole training data by default
# (refit=True), so a second fit is not needed.
n_b_best = grid_search.best_estimator_
predictions_ = n_b_best.predict(x_test)

# Performance
scores = n_b_best.score(x_test, y_test)
conf_mtrx = confusion_matrix(y_test, predictions_)
precision = precision_score(y_test, predictions_)
recall = recall_score(y_test, predictions_)
f1 = f1_score(y_test, predictions_)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mtrx)
# Row 0 holds the actual-negative (class 0) samples: [TN, FP]
false_positive_rate = conf_mtrx[0][1] / (conf_mtrx[0][1] + conf_mtrx[0][0])

# Results
print('Confusion Matrix')
disp.plot(cmap='Blues', include_values=True)
plt.show();

# ROC curve from the predicted probability of the positive class (1)
y_pred_proba = n_b_best.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve(Balanced) (area = {round(roc_auc,3)})')
plt.fill_between(fpr, tpr, color='gray', alpha=0.3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show();

print('---------------------------')
print('Performance Measures')
print('---------------------------')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {scores}')
print('-----------------------------')
print('Hyperparameter Tuning Results')
print('----------------------------')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'False Positive rate: {false_positive_rate}')

: 

This model performed very similarly to the conventional algorithm, except that the scores were lower while the ratio stayed consistent. Given the results from the previous model, we can discard this one.

### Simulating Best Score

In [None]:
# Repeat the split / balance / tune / score cycle on 200 different splits
# without scaling. The split and SMOTE are seeded with the iteration index so
# that re-running the cell reproduces the same distribution.
n_b_train_score = []
n_b_test_score = []
diff = []

# The search space does not change between iterations
param_grid = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]}

for i in range(200):

    # Splitting
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=.60, random_state=i)

    # Balancing (training split only)
    smote = SMOTE(random_state=i)
    x_train_, y_train_ = smote.fit_resample(x_train, y_train)

    # Hyperparameter Tuning
    grid_search = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=10, scoring='accuracy')
    grid_search.fit(x_train_, y_train_)

    # best_estimator_ is already refit on the whole training data (refit=True)
    n_b_best = grid_search.best_estimator_

    # Performance
    # NOTE(review): the train score is measured on the original (unbalanced)
    # training split, while the balanced/standardized simulation scores the
    # resampled data — confirm which comparison is intended.
    train_score = n_b_best.score(x_train, y_train)
    test_score = n_b_best.score(x_test, y_test)
    n_b_train_score.append(train_score)
    n_b_test_score.append(test_score)
    diff.append(train_score - test_score)


sns.histplot(data=n_b_test_score, bins=15, label='Test Score')
sns.histplot(data=n_b_train_score, bins=15, color='coral', label='Training')
plt.title('Naive-Bayes Score Distribution(Balanced)')
plt.legend()
# Vertical line marks the mean test accuracy
plt.axvline(np.mean(n_b_test_score))
plt.show();

print(f'Mean Train Score: {np.mean(n_b_train_score)}')
print(f'Mean Test Score: {np.mean(n_b_test_score)}')
print(f'Mean Diff score: {np.mean(diff)}')

: 

: 

In [None]:
# Distribution of (train accuracy - test accuracy) over the simulated splits.
ax = sns.histplot(data=diff, color='coral')
ax.set_title('Training and Test Score Difference')
plt.show();

: 

The abrupt cut in the histogram to negative values means that the model can have a high level of variability in its results. Although in some instances it performs better in the test set, the count tends to be low therefore the model can have problems adapting to new data.

### Gradient Boosting Machines

In [None]:
# Gradient boosting on a SMOTE-balanced training split: 75/25 split, 5-fold
# grid search, hold-out evaluation. The split, SMOTE and the classifier are
# all seeded so that the reported numbers can be reproduced.

# Splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=42)

# Balancing (training split only)
smote = SMOTE(random_state=42)
X_resampled, Y_resampled = smote.fit_resample(x_train, y_train)

# Hyperparameter Tuning for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages to be run
    'learning_rate': [0.01, 0.1, 0.2],  # Shrinks the contribution of each tree by learning_rate
    'max_depth': [3, 4, 5]  # Maximum depth of the individual regression estimators
}
# n_jobs=-1 evaluates the 27 candidates x 5 folds in parallel on all cores
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_resampled, Y_resampled)

# GridSearchCV refits the best estimator on the whole training data by default
# (refit=True), so a second fit is not needed.
gbm_best = grid_search.best_estimator_
predictions = gbm_best.predict(x_test)

# Performance evaluation
scores = gbm_best.score(x_test, y_test)
conf_mtrx = confusion_matrix(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
# Row 0 holds the actual-negative (class 0) samples: [TN, FP]
false_positive_rate = conf_mtrx[0][1] / (conf_mtrx[0][1] + conf_mtrx[0][0])

# Confusion Matrix Display
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mtrx)
print('Confusion Matrix')
disp.plot(cmap='Blues', include_values=True)
plt.show()

# ROC curve from the predicted probability of the positive class (1)
y_pred_proba = gbm_best.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba, pos_label=1)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (GBM) (area = {round(roc_auc, 3)})')
plt.fill_between(fpr, tpr, color='gray', alpha=0.3)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

print('---------------------------')
print('Performance Measures')
print('---------------------------')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy Score: {scores}')
print('-----------------------------')
print('Hyperparameter Tuning Results')
print('----------------------------')
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
print(f'False Positive rate: {false_positive_rate}')

: 

In [None]:
# Repeat the GBM split / balance / tune / score cycle to see how stable the
# train-test gap is. Every iteration runs a full grid search (27 candidates x
# 5 folds), so this cell is expensive; lower n_iterations for a quick check.
gbm_scores = []
gbm_train_scores = []
gbm_precisions = []
gbm_recalls = []
gbm_f1_scores = []
gbm_conf_matrices = []
diff_scores = []

n_iterations = 200

# The search space does not change between iterations
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

for i in range(n_iterations):
    # Splitting (seeded with the iteration index so a re-run reproduces the same splits)
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=i)

    # Balancing (training split only)
    smote = SMOTE(random_state=i)
    X_resampled, Y_resampled = smote.fit_resample(x_train, y_train)

    # Hyperparameter Tuning for Gradient Boosting; n_jobs=-1 runs the candidate
    # fits in parallel on all cores
    grid_search = GridSearchCV(
        estimator=GradientBoostingClassifier(random_state=i),
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_resampled, Y_resampled)

    # best_estimator_ is already refit on the whole training data (refit=True)
    gbm_best = grid_search.best_estimator_
    train_score = gbm_best.score(X_resampled, Y_resampled)
    predictions = gbm_best.predict(x_test)

    # Performance evaluation
    test_score = gbm_best.score(x_test, y_test)
    conf_matrix = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    # Store metrics
    gbm_scores.append(test_score)
    gbm_train_scores.append(train_score)
    gbm_precisions.append(precision)
    gbm_recalls.append(recall)
    gbm_f1_scores.append(f1)
    gbm_conf_matrices.append(conf_matrix)

    # Calculate and store differences
    diff_scores.append(train_score - test_score)

# Plotting the differences
sns.histplot(data=diff_scores, color='coral')
plt.title('Training and Test Score Difference')
plt.show()

: 