<a href="https://colab.research.google.com/github/panagiotismouts/machinelearning/blob/main/cost_sensitive_learning/Assignment_1_Advanced_Topics_in_Machine_Learning_Part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assignment 1 - Part 1



Andreas Kiziridis - Erasmus Student

Moutsiounas Panagiotis - 153


In this task, we will be using one technic of sampling, weighting and of expected cost minimization. First, we  will train the dataset in the three different algorithms, providing a cost matrix for each to calculate the metrics without applying any techniques. 

In [8]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.datasets import  fetch_openml
from sklearn.compose import make_column_transformer, make_column_selector

X, y = fetch_openml("credit-g", version=1, as_frame=True, parser='auto', return_X_y=True)

data_names = X.columns

one_hot_encoder = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
     make_column_selector(dtype_include='category')),
    remainder='passthrough')

X = one_hot_encoder.fit_transform(X)
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(Counter(y))

Counter({'good': 700, 'bad': 300})


In [28]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from collections import Counter
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# create a cost-matrix as in slides
cost_m = [[0, 1], 
          [5, 0]]

#training the data
names = ['random forest', 'linear SVM', 'gaussian naïve bayes']

classifiers = [RandomForestClassifier(n_estimators=150, random_state=42), 
               SVC(kernel='linear'), GaussianNB()]

for name, clf in zip(names, classifiers):
  print(" ")
  print(name)
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=['bad', 'good']))
  conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
  print(conf_m) 
  print("total cost: ", np.sum(conf_m * cost_m))




 
random forest
              precision    recall  f1-score   support

        good       0.69      0.37      0.49        91
         bad       0.77      0.93      0.84       209

    accuracy                           0.76       300
   macro avg       0.73      0.65      0.66       300
weighted avg       0.75      0.76      0.73       300

[[ 34  15]
 [ 57 194]]
total cost:  300
 
linear SVM
              precision    recall  f1-score   support

        good       0.66      0.48      0.56        91
         bad       0.80      0.89      0.84       209

    accuracy                           0.77       300
   macro avg       0.73      0.69      0.70       300
weighted avg       0.76      0.77      0.76       300

[[ 44  23]
 [ 47 186]]
total cost:  258
 
gaussian naïve bayes
              precision    recall  f1-score   support

        good       0.51      0.74      0.60        91
         bad       0.86      0.69      0.76       209

    accuracy                           0.70       

For the first technic, we will use a cost-based sampling technic. Because our dataset is small, we decided to use oversampling. If we would want to save computation time, perhaps we could use undersampling.



In [31]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter


print("Counter before oversampling: ",Counter(y_train))
#from the counter occurs:
#'good': 491, 'bad': 209
#so we decided oversample the data with "bad" on the target value, because the cost is higher (5x)
#and most classes have "good" as their target value.

#training the data
sampler = RandomOverSampler(sampling_strategy={'good':491 , 'bad': 491}, random_state=42) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print("Counter after oversampling:",Counter(y_rs))


for name, clf in zip(names, classifiers):
  print(" ")
  print(name)
  clf.fit(X_rs, y_rs)
  y_pred = clf.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=['bad', 'good']))
  conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
  print(conf_m) 
  print("total cost: ", np.sum(conf_m * cost_m))

#training the data
sampler = RandomUnderSampler(sampling_strategy={'good':209 , 'bad': 209}, random_state=42) 
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print("Counter after undersampling:",Counter(y_rs))


for name, clf in zip(names, classifiers):
  print(" ")
  print(name)
  clf.fit(X_rs, y_rs)
  y_pred = clf.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=['bad', 'good']))
  conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
  print(conf_m) 
  print("total cost: ", np.sum(conf_m * cost_m))


# By applying the sampling techniques we could minimize our costs significantly.
# We noticed that undersampling combined with Random Forest and Linear SVM produced the best results (cost of 180 and 195).
# For Naive Bayes the cost could be reduced to 180/181 by over and undersampling.

Counter before oversampling:  Counter({'good': 491, 'bad': 209})
Counter after oversampling: Counter({'good': 491, 'bad': 491})
 
random forest
              precision    recall  f1-score   support

        good       0.60      0.48      0.54        91
         bad       0.79      0.86      0.83       209

    accuracy                           0.75       300
   macro avg       0.70      0.67      0.68       300
weighted avg       0.74      0.75      0.74       300

[[ 44  29]
 [ 47 180]]
total cost:  264
 
linear SVM
              precision    recall  f1-score   support

        good       0.50      0.67      0.58        91
         bad       0.83      0.71      0.77       209

    accuracy                           0.70       300
   macro avg       0.67      0.69      0.67       300
weighted avg       0.73      0.70      0.71       300

[[ 61  60]
 [ 30 149]]
total cost:  210
 
gaussian naïve bayes
              precision    recall  f1-score   support

        good       0.48      0.

After using oversampling and undersampling and recording the differences in the cost, we will proceed with weighting.

In [35]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np


weights = np.zeros(y_train.shape[0])
weights[np.where(y_train == 'good')] = 1;
weights[np.where(y_train == 'bad')] = 5;

for name, clf in zip(names, classifiers):
  print(" ")
  print(name)
  clf.fit(X_train, y_train, weights)
  y_pred = clf.predict(X_test)
  print(classification_report(y_test, y_pred, target_names=['bad', 'good']))
  conf_m = confusion_matrix(y_test, y_pred).T # transpose to align with slides
  print(conf_m) 
  print("total cost: ", np.sum(conf_m * cost_m))

#after using weights accordingly with the cost matrix, we can conclude that:
#the random forest gets worse results. (300 - 337)
#the linear svm gets significantly better (258 - 135)
#the gaussian naïve bayes also gets better (185 - 164)

 
random forest
              precision    recall  f1-score   support

        good       0.68      0.29      0.40        91
         bad       0.75      0.94      0.84       209

    accuracy                           0.74       300
   macro avg       0.72      0.61      0.62       300
weighted avg       0.73      0.74      0.71       300

[[ 26  12]
 [ 65 197]]
total cost:  337
 
linear SVM
              precision    recall  f1-score   support

        good       0.46      0.92      0.61        91
         bad       0.94      0.52      0.67       209

    accuracy                           0.64       300
   macro avg       0.70      0.72      0.64       300
weighted avg       0.79      0.64      0.65       300

[[ 84 100]
 [  7 109]]
total cost:  135
 
gaussian naïve bayes
              precision    recall  f1-score   support

        good       0.47      0.82      0.60        91
         bad       0.89      0.60      0.71       209

    accuracy                           0.67       

Minimizing expected cost

In [44]:
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pandas as pd
import numpy as np


#first we will minimize it without probability calibration
#we need to have numerical values in our target column to perform the matrix multiplication
#and calculate the probabilities.

label_map = {"good": 0, "bad": 1}

y_train_num = [label_map[c] for c in y_train]
y_test_num = [label_map[c] for c in y_test]

classifiers = [RandomForestClassifier(n_estimators=150, random_state=42), 
               SVC(kernel='linear', probability=True), GaussianNB()]

for name, clf in zip(names, classifiers):
  print(" ")
  print(name)
  model = clf.fit(X_train, y_train_num)

  y_pred_prob = model.predict_proba(X_test)

  y_pred = np.argmin(np.matmul(y_pred_prob, np.array(cost_m).T), axis=1) 
  print(classification_report(y_test_num, y_pred, target_names=['bad', 'good']))
  conf_m = confusion_matrix(y_test_num, y_pred).T
  print(conf_m) 
  print("total cost: ", np.sum(conf_m * cost_m))







 
random forest
              precision    recall  f1-score   support

         bad       0.70      1.00      0.83       209
        good       1.00      0.03      0.06        91

    accuracy                           0.71       300
   macro avg       0.85      0.52      0.44       300
weighted avg       0.79      0.71      0.59       300

[[209  88]
 [  0   3]]
total cost:  88
 
linear SVM
              precision    recall  f1-score   support

         bad       0.70      1.00      0.82       209
        good       0.00      0.00      0.00        91

    accuracy                           0.70       300
   macro avg       0.35      0.50      0.41       300
weighted avg       0.49      0.70      0.57       300

[[209  91]
 [  0   0]]
total cost:  91
 
gaussian naïve bayes
              precision    recall  f1-score   support

         bad       0.84      0.79      0.82       209
        good       0.58      0.65      0.61        91

    accuracy                           0.75       30

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
