In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_from_disk
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, plot_roc_curve
from sklearn.tree import DecisionTreeClassifier
from argparse import Namespace

In [2]:
# Run settings: number of rows sampled from each split, plus the seed that is
# reused for shuffling and for every estimator's random_state.
config = {"train_subset": 15000, "valid_subset": 4000, "test_subset": 2000, "seed": 42}

# Attribute-style access to the same settings (args.seed, args.train_subset, ...).
args = Namespace(**config)

### Read the dataset

In [3]:
# Load the saved dataset from a path relative to the current working directory.
# Later cells index the result by 'train', 'validation' and 'test'.
violence_hidden = load_from_disk("../../Violence_data/geo_corpus.0.0.1_datasets_hidden_small_labse")

In [None]:
# Display the loaded dataset object as the cell output.
violence_hidden

In [4]:
# Remove unnecessary columns: only 'hidden_state' and the six
# pre7geo*/post7geo* columns are kept.
keep_cols = [
    'hidden_state',
    'pre7geo10', 'pre7geo30', 'pre7geo50',
    'post7geo10', 'post7geo30', 'post7geo50',
]
# Every other column of the train split is scheduled for removal.
remove_columns = list(filter(lambda name: name not in keep_cols,
                             violence_hidden['train'].column_names))

In [5]:
# Drop the columns collected in `remove_columns`; the result replaces `violence_hidden`.
violence_hidden = violence_hidden.remove_columns(remove_columns)

In [None]:
# Display the dataset again to confirm which columns remain.
violence_hidden

In [None]:
# Extract a reproducible random subset of each split.
# The requested size is clamped to the split length, so a split with fewer rows
# than the configured subset yields the whole shuffled split instead of making
# `select` fail on out-of-range indices.
train_clf_ds = violence_hidden["train"].shuffle(seed=args.seed).select(
    range(min(args.train_subset, len(violence_hidden["train"]))))
validation_clf_ds = violence_hidden["validation"].shuffle(seed=args.seed).select(
    range(min(args.valid_subset, len(violence_hidden["validation"]))))
test_clf_ds = violence_hidden["test"].shuffle(seed=args.seed).select(
    range(min(args.test_subset, len(violence_hidden["test"]))))

In [None]:
# Inspect the first example of the sampled training subset.
train_clf_ds[0]

### Preprocess the labels

In [None]:
# The integer label columns are cast to float32 so that the Binary Cross
# Entropy loss can be computed on them during training.
from datasets import Value

new_features = train_clf_ds.features.copy()
for label_col in ('post7geo10', 'post7geo30', 'post7geo50',
                  'pre7geo10', 'pre7geo30', 'pre7geo50'):
    new_features[label_col] = Value(dtype='float32')

# Apply the same feature schema to all three splits.
train_clf_ds = train_clf_ds.cast(new_features)
validation_clf_ds = validation_clf_ds.cast(new_features)
test_clf_ds = test_clf_ds.cast(new_features)

In [None]:
# Create a 'labels' list column for each split from every column except
# 'hidden_state'. The order of the values follows train_clf_ds.column_names.
# NOTE(review): the plotting and report cells hard-code a label-name order
# (post7geo* first) - confirm it matches this column order.
cols = train_clf_ds.column_names


def _collect_labels(example):
    """Return {'labels': [...]} holding all non-'hidden_state' values of one example."""
    return {"labels": [example[name] for name in cols if name != "hidden_state"]}


train_clf_ds = train_clf_ds.map(_collect_labels)
validation_clf_ds = validation_clf_ds.map(_collect_labels)
test_clf_ds = test_clf_ds.map(_collect_labels)

In [None]:
# Sanity check: the first validation example should now also carry a 'labels' list.
validation_clf_ds[0]

In [None]:
# Collect the names of every column other than the two that are kept
# ('labels' and 'hidden_state'); the remaining names are the columns to drop.
col_names = train_clf_ds.column_names
for kept in ("labels", 'hidden_state'):
    col_names.remove(kept)

In [None]:
# Drop the individual label columns from all three splits, leaving only
# 'hidden_state' and 'labels', then display the training split.
train_clf_ds, validation_clf_ds, test_clf_ds = [
    split.remove_columns(col_names)
    for split in (train_clf_ds, validation_clf_ds, test_clf_ds)
]
train_clf_ds

### Create a feature matrix

In [None]:
# Materialise each split as NumPy arrays: X_* from 'hidden_state', y_* from 'labels'.
# Only the first conversion is wrapped in %time.
%time X_train = np.array(train_clf_ds["hidden_state"])
y_train = np.array(train_clf_ds["labels"])
X_validation = np.array(validation_clf_ds["hidden_state"])
y_validation = np.array(validation_clf_ds["labels"])
X_test = np.array(test_clf_ds["hidden_state"])
y_test = np.array(test_clf_ds["labels"])
# Show the shape of the training feature matrix as the cell output.
X_train.shape

In [None]:
# Spot-check the value range of one training row (index 10): min, then max.
row_10 = X_train[10]
for stat in (np.min, np.max):
    print(stat(row_10))

## Helper Functions

In [None]:
# Create a function to report the various metrics for each classifier
def metricsReport(modelName, test_labels, predictions):
    """Print evaluation metrics for one model and record them in ``ModelsPerformance``.

    Parameters
    ----------
    modelName : str
        Label used in the printed header and as the key in ``ModelsPerformance``.
    test_labels : array-like
        Ground-truth labels, passed as the first argument to each sklearn metric.
    predictions : array-like
        Model output, passed as the second argument to each sklearn metric.
        In this notebook the callers pass the result of ``.predict``, so the
        ROC AUC is computed from hard predictions rather than scores.

    Returns
    -------
    None
        Results are printed and stored (as strings formatted to 4 decimals)
        under ``ModelsPerformance[modelName]``.

    Notes
    -----
    ``ModelsPerformance`` is a module-level dict that must exist before this
    function is called; the evaluation cell creates it.
    """
    # Insertion order matters: it is the key order of the stored record.
    scores = {
        "accuracy": accuracy_score(test_labels, predictions),
        "roc_auc": roc_auc_score(test_labels, predictions, average = "weighted"),
        "weighted_precision": precision_score(test_labels, predictions, average='weighted'),
        "weighted_recall": recall_score(test_labels, predictions, average='weighted'),
        "weighted_f1": f1_score(test_labels, predictions, average='weighted'),
        "micro_precision": precision_score(test_labels, predictions, average='micro'),
        "micro_recall": recall_score(test_labels, predictions, average='micro'),
        "micro_f1": f1_score(test_labels, predictions, average='micro'),
    }

    print("------" + modelName + " Model Metrics-----")
    # The report groups weighted and micro variants per metric, so the
    # argument order differs from the dict order above.
    report = "Accuracy: {:.4f}\nroc_auc_score: {:.4f}\nPrecision:\n  - Weighted: {:.4f}\n  - Micro: {:.4f}\nRecall:\n  - Weighted: {:.4f}\n  - Micro: {:.4f}\nF1-measure:\n  - Weighted: {:.4f}\n  - Micro: {:.4f}"
    print(report.format(
        scores["accuracy"], scores["roc_auc"],
        scores["weighted_precision"], scores["micro_precision"],
        scores["weighted_recall"], scores["micro_recall"],
        scores["weighted_f1"], scores["micro_f1"],
    ))

    ModelsPerformance[modelName] = {
        metric: format(value, '.4f') for metric, value in scores.items()
    }

In [None]:
def plot_confusion_matrix(y_test, y_pred, clf:str):
    """Plot one binary confusion matrix per label on a 2x3 grid with a shared colorbar.

    Parameters
    ----------
    y_test : ndarray
        Ground-truth label matrix; column ``i`` is compared against column ``i``
        of ``y_pred`` for the six labels.
    y_pred : ndarray
        Predicted label matrix with the same layout as ``y_test``.
    clf : str
        Figure title (typically the classifier name).

    Notes
    -----
    Assumes every label column holds binary 0/1 values (they are cast to int
    so that both classes can be requested explicitly).
    NOTE(review): the panel titles assume the label columns are ordered
    post7geo10, post7geo30, post7geo50, pre7geo10, pre7geo30, pre7geo50 -
    confirm against the column order used to build ``labels``.
    """
    labels = ['post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10', 'pre7geo30', 'pre7geo50']
    n_rows, n_cols = 2, 3

    f, axes = plt.subplots(n_rows, n_cols, figsize=(25, 15))
    f.suptitle(clf, fontsize=36)
    axes = axes.ravel()
    for i, label in enumerate(labels):
        # labels=[0, 1] keeps the matrix 2x2 even when a class is absent
        # from both the truth and the predictions for this label.
        cm = confusion_matrix(y_test[:, i].astype(int),
                              y_pred[:, i].astype(int),
                              labels=[0, 1])
        # Tick labels are the two class values, not the label index.
        disp = ConfusionMatrixDisplay(cm, display_labels=[0, 1])
        disp.plot(ax=axes[i], values_format='.4g')
        disp.ax_.set_title(label)
        # Keep the x-axis label only on the bottom row ...
        if i < (n_rows - 1) * n_cols:
            disp.ax_.set_xlabel('')
        # ... and the y-axis label only on the first column.
        if i % n_cols != 0:
            disp.ax_.set_ylabel('')
        # Per-panel colorbars are replaced by one shared colorbar below.
        disp.im_.colorbar.remove()

    plt.subplots_adjust(wspace=0.10, hspace=0.1)
    f.colorbar(disp.im_, ax=axes)
    plt.show()

# Classifiers

## Problem transformation

### Bagging

In [None]:
# Bagging wrapped in one-vs-rest; parallelism (n_jobs=-1) is requested on the
# inner BaggingClassifier. The fit on the training split is timed.
bagClassifier = OneVsRestClassifier(BaggingClassifier(n_jobs=-1, 
                                                      random_state=args.seed))
%time bagClassifier.fit(X_train, y_train)


In [None]:
# Predict the test-split labels with the bagging model (timed).
%time bagPreds = bagClassifier.predict(X_test)

### Boosting

In [None]:
# Gradient boosting wrapped in one-vs-rest; here n_jobs=-1 is set on the
# OneVsRestClassifier wrapper. The fit on the training split is timed.
boostClassifier = OneVsRestClassifier(GradientBoostingClassifier(
    random_state=args.seed), n_jobs=-1)
%time boostClassifier.fit(X_train, y_train)

In [None]:
# Predict the test-split labels with the boosting model (timed).
%time boostPreds = boostClassifier.predict(X_test)

### OneVsRest - Support Vector Machine

In [None]:
# Standardize the data for SVM.
# The scaler is fitted on the training split only, and the same mean/variance
# is then applied to validation and test. Fitting a separate scaler per split
# would put each split in a different feature space from the one the SVM is
# trained on, and would use held-out statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Linear SVM wrapped in one-vs-rest (n_jobs=-1 on the wrapper), trained on the
# standardized training features. The fit is timed.
svmClassifier = OneVsRestClassifier(LinearSVC(
    random_state=args.seed, max_iter=7000, 
    dual=False, tol=1e-5), n_jobs=-1)
%time svmClassifier.fit(X_train_scaled, y_train)

In [None]:
%time svmPreds = svmClassifier.predict(X_test)

### Binary Relevance - Support Vector Machine

In [None]:
# Binary Relevance (skmultilearn) around the same LinearSVC configuration,
# trained on the standardized training features. The fit is timed.
# require_dense=[True, True]: presumably requests dense X and dense y for the
# base estimator - confirm against the skmultilearn documentation.
BinaryRelSVC = BinaryRelevance(LinearSVC(random_state=args.seed,
                                         max_iter=7000, dual=False, 
                                         tol=1e-5),
                                         require_dense = [True, True])
%time BinaryRelSVC.fit(X_train_scaled, y_train)

In [None]:
%time BinRelSVCPreds = BinaryRelSVC.predict(X_test).toarray()

## Problem Adaptation

### Decision Trees

In [None]:
# Decision tree fitted directly on the full label matrix (no one-vs-rest
# wrapper), using the unscaled features. The fit is timed.
dtClassifier = DecisionTreeClassifier(random_state=args.seed)
%time dtClassifier.fit(X_train, y_train)

In [None]:
# Predict the test-split labels with the decision tree (timed).
%time dtPreds = dtClassifier.predict(X_test)

### Random Forest

In [None]:
# Random forest fitted directly on the full label matrix (no one-vs-rest
# wrapper), using the unscaled features. The fit is timed.
rfClassifier = RandomForestClassifier(n_jobs=-1, 
                                      random_state=args.seed)
%time rfClassifier.fit(X_train, y_train)

In [None]:
# Predict the test-split labels with the random forest (timed).
%time rfPreds = rfClassifier.predict(X_test)

# Model Evaluation

### Evaluate classifiers

In [None]:
# Reset the shared results table, then score each model's test predictions
# in a fixed order (this is also the row order of the summary table).
ModelsPerformance = {}
for model_name, model_preds in [
    ("Bagging", bagPreds),
    ("Boosting", boostPreds),
    ("SVM-OVR", svmPreds),
    ("SVM-BR", BinRelSVCPreds),
    ("Decision Tree", dtPreds),
    ("Random Forest", rfPreds),
]:
    metricsReport(model_name, y_test, model_preds)


In [None]:
# Print a hand-aligned text table of the stored metrics: a header row, a rule,
# then one row plus a rule per model in ModelsPerformance insertion order.
# The padding widths are tuned to the 4-decimal strings stored by metricsReport.
print(" Model Name " + " "*3 + "| Acc.   " + " "*1 + "| ROC_AUC " + " "*1 + "| weighted_prec" + " "*1 + "| weighted_rec" + " "*1 + "| weighted_f1" + " "*1 + "| micro_prec" + " "*1 + "| micro_rec" + " "*1 + "| micro_f1")
print("--------------------------------------------------------------------------------------------------------------------")
for key, value in ModelsPerformance.items():
    # The first column is padded to a fixed width based on the model name length.
    print(" " + key, " "*(13-len(key)) + "|", value["accuracy"], " "*(1) + "|", value["roc_auc"], " "*(2) + "|", value["weighted_precision"], " "*(7) + "|", value["weighted_recall"], " "*(6) + "|", value["weighted_f1"], " "*(5) + "|", value["micro_precision"], " "*(4) + "|", value["micro_recall"], " "*(3) + "|", value["micro_f1"])
    print("--------------------------------------------------------------------------------------------------------------------")

In [None]:
# Optional: per-label precision / recall / F1 for the Random Forest.
# classification_report takes (y_true, y_pred) in that order; passing the
# predictions first would swap precision with recall and report support
# counts of the predictions instead of the ground truth.
print(classification_report(y_test, rfPreds, target_names=['post7geo10', 'post7geo30', 'post7geo50',
                                                     'pre7geo10','pre7geo30', 'pre7geo50']))

### Plot Confusion Matrix

In [None]:
# Per-label confusion matrices for the Bagging classifier.
plot_confusion_matrix(y_test=y_test, y_pred=bagPreds, clf="Bagging Classifier")

In [None]:
# Per-label confusion matrices for the Boosting classifier.
plot_confusion_matrix(y_test=y_test, y_pred=boostPreds, clf="Boosting Classifier")

In [None]:
# Per-label confusion matrices for the one-vs-rest SVM.
plot_confusion_matrix(y_test=y_test, y_pred=svmPreds, clf="SVM - One vs Rest")

In [None]:
# Per-label confusion matrices for the Binary Relevance SVM.
plot_confusion_matrix(y_test=y_test, y_pred=BinRelSVCPreds, clf="SVM - Binary Relevance")

In [None]:
# Per-label confusion matrices for the Decision Tree.
plot_confusion_matrix(y_test=y_test, y_pred=dtPreds, clf="Decision Tree")

In [None]:
# Per-label confusion matrices for the Random Forest.
plot_confusion_matrix(y_test=y_test, y_pred=rfPreds, clf="Random Forest Classifier")

### Plot ROC Curve for the best classifier

In [None]:
# Plot ROC curves for the best classifier (RF), one curve per label on shared axes.
# A fresh single-label forest (same settings as `rfClassifier`) is trained for
# each label. Refitting `rfClassifier` itself here would overwrite the
# multi-label model, so any cell that uses it afterwards (or on re-run) would
# silently get a model for the last label only.
# NOTE(review): `plot_roc_curve` was removed in scikit-learn 1.2; on newer
# versions this call needs `RocCurveDisplay.from_estimator` instead.
label_names = ['post7geo10', 'post7geo30', 'post7geo50', 'pre7geo10','pre7geo30', 'pre7geo50']
labelPlots = {}
fig, ax = plt.subplots()
for i, label_name in enumerate(label_names):
    label_rf = RandomForestClassifier(n_jobs=-1, random_state=args.seed)
    label_rf.fit(X_train, y_train[:, i])
    labelPlots[i] = plot_roc_curve(label_rf, X_test, y_test[:, i],
                                   name=label_name, ax=ax, alpha=0.8)

ax.set_title("ROC comparison per class label - RF classifier")
plt.show()