Loading the Data

In [1]:
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [4]:


# Load the data
data = pd.read_csv('data/data file 4/data_1_1_der.csv')

# Define features and target
features = data.columns.difference(['dgp_name', 'prov_char'])
X = data[features]
y = data['dgp_name']
groups = data['prov_char']

# Encode the target if necessary
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


RF

In [21]:
# Define the classifier
clf = RandomForestClassifier()

# Define the LOGO-CV strategy
logo = LeaveOneGroupOut()

# Initialize lists to store results
y_true_all = []
y_pred_all = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred)

# Classification report_rf and confusion matrix
report_rf = classification_report(y_true_all, y_pred_all, target_names=label_encoder.classes_)
conf_matrix_rf = confusion_matrix(y_true_all, y_pred_all)

print("Classification Report Random Forest:\n", report_rf)
print("Confusion Matrix RF:\n", conf_matrix_rf)

Classification Report Random Forest:
               precision    recall  f1-score   support

     Group 1       0.81      0.69      0.74       141
     Group 2       0.73      0.84      0.78       141

    accuracy                           0.76       282
   macro avg       0.77      0.76      0.76       282
weighted avg       0.77      0.76      0.76       282

Confusion Matrix:
 [[ 97  44]
 [ 23 118]]


SVM

In [8]:
from sklearn.svm import SVC

# Define the classifier
clf_svm = SVC()

# Initialize lists to store results
y_true_all_svm = []
y_pred_all_svm = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_svm.fit(X_train, y_train)
    y_pred = clf_svm.predict(X_test)
    
    y_true_all_svm.extend(y_test)
    y_pred_all_svm.extend(y_pred)

# Classification report and confusion matrix
report_svm = classification_report(y_true_all_svm, y_pred_all_svm, target_names=label_encoder.classes_)
conf_matrix_svm = confusion_matrix(y_true_all_svm, y_pred_all_svm)

report_svm, conf_matrix_svm

print("Classification Report SVM:\n", report_svm)
print("Confusion Matrix SVM:\n", conf_matrix_svm)


Classification Report:
               precision    recall  f1-score   support

     Group 1       0.93      0.45      0.61       141
     Group 2       0.64      0.96      0.77       141

    accuracy                           0.71       282
   macro avg       0.78      0.71      0.69       282
weighted avg       0.78      0.71      0.69       282

Confusion Matrix:
 [[ 64  77]
 [  5 136]]


ET

In [9]:
from sklearn.ensemble import ExtraTreesClassifier

# Define the classifier
clf_et = ExtraTreesClassifier()

# Initialize lists to store results
y_true_all_et = []
y_pred_all_et = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_et.fit(X_train, y_train)
    y_pred = clf_et.predict(X_test)
    
    y_true_all_et.extend(y_test)
    y_pred_all_et.extend(y_pred)

# Classification report and confusion matrix
report_et = classification_report(y_true_all_et, y_pred_all_et, target_names=label_encoder.classes_)
conf_matrix_et = confusion_matrix(y_true_all_et, y_pred_all_et)

report_et, conf_matrix_et

print("Classification Report ET:\n", report_et)
print("Confusion Matrix ET:\n", conf_matrix_et)

Classification Report:
               precision    recall  f1-score   support

     Group 1       0.80      0.64      0.71       141
     Group 2       0.70      0.84      0.77       141

    accuracy                           0.74       282
   macro avg       0.75      0.74      0.74       282
weighted avg       0.75      0.74      0.74       282

Confusion Matrix:
 [[ 90  51]
 [ 22 119]]


kNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Define the classifier
clf_knn = KNeighborsClassifier()

# Initialize lists to store results
y_true_all_knn = []
y_pred_all_knn = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_knn.fit(X_train, y_train)
    y_pred = clf_knn.predict(X_test)
    
    y_true_all_knn.extend(y_test)
    y_pred_all_knn.extend(y_pred)

# Classification report and confusion matrix
report_knn = classification_report(y_true_all_knn, y_pred_all_knn, target_names=label_encoder.classes_)
conf_matrix_knn = confusion_matrix(y_true_all_knn, y_pred_all_knn)

report_knn, conf_matrix_knn

print("Classification Report kNN:\n", report_knn)
print("Confusion Matrix kNN:\n", conf_matrix_knn)


Classification Report:
               precision    recall  f1-score   support

     Group 1       0.77      0.42      0.54       141
     Group 2       0.60      0.87      0.71       141

    accuracy                           0.65       282
   macro avg       0.68      0.65      0.63       282
weighted avg       0.68      0.65      0.63       282

Confusion Matrix:
 [[ 59  82]
 [ 18 123]]


In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the classifier
clf_gbc = GradientBoostingClassifier()

# Initialize lists to store results
y_true_all_gbc = []
y_pred_all_gbc = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_gbc.fit(X_train, y_train)
    y_pred = clf_gbc.predict(X_test)
    
    y_true_all_gbc.extend(y_test)
    y_pred_all_gbc.extend(y_pred)

# Classification report and confusion matrix
report_gbc = classification_report(y_true_all_gbc, y_pred_all_gbc, target_names=label_encoder.classes_)
conf_matrix_gbc = confusion_matrix(y_true_all_gbc, y_pred_all_gbc)

report_gbc, conf_matrix_gbc

print("Classification Report GBC:\n", report_gbc)
print("Confusion Matrix GBC:\n", conf_matrix_gbc)

Classification Report:
               precision    recall  f1-score   support

     Group 1       0.83      0.68      0.75       141
     Group 2       0.73      0.86      0.79       141

    accuracy                           0.77       282
   macro avg       0.78      0.77      0.77       282
weighted avg       0.78      0.77      0.77       282

Confusion Matrix:
 [[ 96  45]
 [ 20 121]]


LGBM

In [12]:
import lightgbm as lgb

# Define the classifier
clf_lgbm = lgb.LGBMClassifier()

# Initialize lists to store results
y_true_all_lgbm = []
y_pred_all_lgbm = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_lgbm.fit(X_train, y_train)
    y_pred = clf_lgbm.predict(X_test)
    
    y_true_all_lgbm.extend(y_test)
    y_pred_all_lgbm.extend(y_pred)

# Classification report and confusion matrix
report_lgbm = classification_report(y_true_all_lgbm, y_pred_all_lgbm, target_names=label_encoder.classes_)
conf_matrix_lgbm = confusion_matrix(y_true_all_lgbm, y_pred_all_lgbm)

report_lgbm, conf_matrix_lgbm

print("Classification Report LightGBM:\n", report_lgbm)
print("Confusion Matrix LightGBM:\n", conf_matrix_lgbm)

[LightGBM] [Info] Number of positive: 132, number of negative: 141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22179
[LightGBM] [Info] Number of data points in the train set: 273, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.483516 -> initscore=-0.065958
[LightGBM] [Info] Start training from score -0.065958
[LightGBM] [Info] Number of positive: 141, number of negative: 123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21465
[LightGBM] [Info] Number of data points in the train set: 264, number of used features: 243
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.534091 -> initscore=0.136576
[LightGBM] [Info] Start training from score 0.136576
[LightGBM] [Info] Numb

LDA

In [17]:
# Define the classifier
clf_logreg = LogisticRegression()

# Define the LOGO-CV strategy
logo = LeaveOneGroupOut()

# Initialize lists to store results
y_true_all_logreg = []
y_pred_all_logreg = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_logreg.fit(X_train, y_train)
    y_pred = clf_logreg.predict(X_test)
    
    y_true_all_logreg.extend(y_test)
    y_pred_all_logreg.extend(y_pred)

# Flipping the predictions for Logistic Regression
y_pred_all_logreg_flipped = [1 if y == 0 else 0 for y in y_pred_all_logreg]

# Classification report and confusion matrix for flipped Logistic Regression
report_logreg_flipped = classification_report(y_true_all_logreg, y_pred_all_logreg_flipped, target_names=label_encoder.classes_)
conf_matrix_logreg_flipped = confusion_matrix(y_true_all_logreg, y_pred_all_logreg_flipped)

# Print the results
print("Classification Report for Flipped Logistic Regression:\n", report_logreg_flipped)
print("Confusion Matrix for Flipped Logistic Regression:\n", conf_matrix_logreg_flipped)

Classification Report for Flipped Logistic Regression:
               precision    recall  f1-score   support

     Group 1       1.00      1.00      1.00       141
     Group 2       1.00      1.00      1.00       141

    accuracy                           1.00       282
   macro avg       1.00      1.00      1.00       282
weighted avg       1.00      1.00      1.00       282

Confusion Matrix for Flipped Logistic Regression:
 [[141   0]
 [  0 141]]


In [18]:
# Define the classifier
clf_lda = LinearDiscriminantAnalysis()

# Define the LOGO-CV strategy
logo = LeaveOneGroupOut()

# Initialize lists to store results
y_true_all_lda = []
y_pred_all_lda = []

# Perform LOGO-CV
for train_index, test_index in logo.split(X, y_encoded, groups):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded[train_index], y_encoded[test_index]
    
    clf_lda.fit(X_train, y_train)
    y_pred = clf_lda.predict(X_test)
    
    y_true_all_lda.extend(y_test)
    y_pred_all_lda.extend(y_pred)

# Flipping the predictions for LDA
y_pred_all_lda_flipped = [1 if y == 0 else 0 for y in y_pred_all_lda]

# Classification report and confusion matrix for flipped LDA
report_lda_flipped = classification_report(y_true_all_lda, y_pred_all_lda_flipped, target_names=label_encoder.classes_)
conf_matrix_lda_flipped = confusion_matrix(y_true_all_lda, y_pred_all_lda_flipped)

# Print the results
print("Classification Report for Flipped LDA:\n", report_lda_flipped)
print("Confusion Matrix for Flipped LDA:\n", conf_matrix_lda_flipped)

Classification Report for Flipped LDA:
               precision    recall  f1-score   support

     Group 1       0.85      0.74      0.80       141
     Group 2       0.77      0.87      0.82       141

    accuracy                           0.81       282
   macro avg       0.81      0.81      0.81       282
weighted avg       0.81      0.81      0.81       282

Confusion Matrix for Flipped LDA:
 [[105  36]
 [ 18 123]]


In [25]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Function to display classification report
def display_classification_report(report, model_name):
    print(f"Classification Report for {model_name}")
    print(report)
    print()

# Function to display confusion matrix as a text table
def display_confusion_matrix_text(conf_matrix, class_labels, title):
    df_cm = pd.DataFrame(conf_matrix, index=class_labels, columns=class_labels)
    print(title)
    print(df_cm)
    print()

# Example results from running the classifiers (for demonstration purposes)
results = {
    'Random Forest': {'report': report_rf, 'conf_matrix': conf_matrix_rf},
    'SVM': {'report': report_svm, 'conf_matrix': conf_matrix_svm},
    'Extra Trees': {'report': report_et, 'conf_matrix': conf_matrix_et},
    'k-Nearest Neighbors': {'report': report_knn, 'conf_matrix': conf_matrix_knn},
    'Gradient Boosting': {'report': report_gbc, 'conf_matrix': conf_matrix_gbc},
    'LightGBM': {'report': report_lgbm, 'conf_matrix': conf_matrix_lgbm},
    'Logistic Regression': {'report': report_logreg_flipped, 'conf_matrix': conf_matrix_logreg_flipped},
    'LDA': {'report': report_lda_flipped, 'conf_matrix': conf_matrix_lda_flipped}
}

# Display classification reports and confusion matrices for each classifier
for name, result in results.items():
    display_classification_report(result['report'], name)
    display_confusion_matrix_text(result['conf_matrix'], label_encoder.classes_, f"{name} Confusion Matrix")


Classification Report for Random Forest
              precision    recall  f1-score   support

     Group 1       0.81      0.69      0.74       141
     Group 2       0.73      0.84      0.78       141

    accuracy                           0.76       282
   macro avg       0.77      0.76      0.76       282
weighted avg       0.77      0.76      0.76       282


Random Forest Confusion Matrix
         Group 1  Group 2
Group 1       97       44
Group 2       23      118

Classification Report for SVM
              precision    recall  f1-score   support

     Group 1       0.93      0.45      0.61       141
     Group 2       0.64      0.96      0.77       141

    accuracy                           0.71       282
   macro avg       0.78      0.71      0.69       282
weighted avg       0.78      0.71      0.69       282


SVM Confusion Matrix
         Group 1  Group 2
Group 1       64       77
Group 2        5      136

Classification Report for Extra Trees
              precision   