In [1]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datasets import load_from_disk
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, plot_roc_curve
from sklearn.tree import DecisionTreeClassifier
from argparse import Namespace

In [22]:
# Experiment settings. The subset sizes below are the full-run values;
# lower them for a quick smoke test.
config = dict(
    train_subset=1_500_000,
    valid_subset=400_000,
    test_subset=200_000,
    seed=42,    # reused for dataset shuffling and every estimator's random_state
    n_jobs=-1,  # forwarded as n_jobs to the scikit-learn estimators
)

# Attribute-style access (args.seed, args.n_jobs, ...) for the cells below.
args = Namespace(**config)

### Read the dataset

In [3]:
# Load the dataset previously saved to local disk with the `datasets` library.
# NOTE(review): absolute, machine-specific path — consider moving it into `config`
# so the notebook can run on another host.
violence_hidden = load_from_disk("/data4/mmendieta/data/geo_corpus.0.0.1_datasets_hidden_e5_all_labels")

In [None]:
violence_hidden

In [4]:
# Keep only the model inputs and targets; every other column of the train
# split is collected here so it can be dropped.
keep_cols = ['hidden_state', 'labels']
remove_columns = list(
    filter(lambda name: name not in keep_cols, violence_hidden['train'].column_names)
)

In [5]:
# Drop every column listed in `remove_columns`, leaving only `keep_cols`.
violence_hidden = violence_hidden.remove_columns(remove_columns)

In [None]:
violence_hidden

In [23]:
# Draw a reproducible sample of each split: shuffle with the shared seed,
# then keep the first N rows (N comes from `args`).
train_clf_ds = (
    violence_hidden["train"]
    .shuffle(seed=args.seed)
    .select(range(args.train_subset))
)
validation_clf_ds = (
    violence_hidden["validation"]
    .shuffle(seed=args.seed)
    .select(range(args.valid_subset))
)
test_clf_ds = (
    violence_hidden["test"]
    .shuffle(seed=args.seed)
    .select(range(args.test_subset))
)

In [None]:
train_clf_ds[0]

### Create a feature matrix

In [24]:
%time X_train = np.array(train_clf_ds["hidden_state"])
y_train = np.array(train_clf_ds["labels"])
X_validation = np.array(validation_clf_ds["hidden_state"])
y_validation = np.array(validation_clf_ds["labels"])
X_test = np.array(test_clf_ds["hidden_state"])
y_test = np.array(test_clf_ds["labels"])
X_train.shape

CPU times: user 22.2 s, sys: 24.6 s, total: 46.8 s
Wall time: 25.4 s


(1500000, 1024)

In [25]:
# Sanity check: smallest and largest feature value of one training row (row 10).
print(X_train[10].min(), X_train[10].max(), sep="\n")

-2.9547567
3.7514224


## Helper Functions

In [26]:
def _positive_class_scores(prediction_probabilities):
    """Coerce classifier scores to a (n_samples, n_labels) array.

    Accepted layouts:
      * list/tuple with one (n_samples, 2) array per label — the positive-class
        column (index 1) of each array is kept;
      * 3-D array shaped (n_labels, n_samples, 2) — same selection, then transposed;
      * 2-D array, taken to be (n_samples, n_labels) already and copied.

    Returns None when the layout is none of the above.
    """
    if isinstance(prediction_probabilities, (list, tuple)):
        return np.column_stack([np.asarray(p)[:, 1] for p in prediction_probabilities])
    ndim = getattr(prediction_probabilities, "ndim", None)
    if ndim == 3:
        return prediction_probabilities[:, :, 1].T
    if ndim == 2:
        return prediction_probabilities.copy()
    return None


def metricsReport(modelName, test_labels, predictions, prediction_probabilities=None):
    """
    Print a multilabel classification report and record it in ``ModelsPerformance``.

    Accuracy plus weighted/micro precision, recall and F1 are always computed.
    ROC AUC (micro and weighted) is computed only when scores are supplied in a
    supported layout; otherwise it is reported as NaN and the rest of the report
    is still printed and stored.

    Args:
        modelName (str): Name used in the printed header and as the
            ``ModelsPerformance`` key.
        test_labels (array-like): True labels.
        predictions (array-like): Binary predictions (0 or 1).
        prediction_probabilities (list | array-like, optional): Probabilities or
            decision scores, in any layout accepted by ``_positive_class_scores``.

    Returns:
        None. The formatted metrics are written to the module-level
        ``ModelsPerformance`` dict, which must exist before this is called.
    """
    accuracy = accuracy_score(test_labels, predictions)

    roc_auc_micro = np.nan
    roc_auc_weighted = np.nan

    if prediction_probabilities is None:
        print("Warning: prediction_probabilities not provided. ROC AUC scores will be NaN.")
    else:
        try:
            scores = _positive_class_scores(prediction_probabilities)
            if scores is None:
                # Unknown layout: skip ROC AUC only, keep the rest of the report.
                shape = getattr(prediction_probabilities, "shape", None)
                print(f"Warning: Unexpected prediction probability shape: {shape}")
            else:
                micro = roc_auc_score(test_labels, scores, average="micro")
                weighted = roc_auc_score(test_labels, scores, average="weighted")
                # Assign only after both succeed so a partial failure reports NaN for both.
                roc_auc_micro, roc_auc_weighted = micro, weighted
        except (ValueError, IndexError) as e:
            # IndexError: a per-label probability array without a positive-class column.
            print(f"Warning: Could not calculate ROC AUC. Error: {e}")

    weighted_precision = precision_score(test_labels, predictions, average='weighted', zero_division=0)
    weighted_recall = recall_score(test_labels, predictions, average='weighted', zero_division=0)
    weighted_f1 = f1_score(test_labels, predictions, average='weighted', zero_division=0)

    micro_precision = precision_score(test_labels, predictions, average='micro', zero_division=0)
    micro_recall = recall_score(test_labels, predictions, average='micro', zero_division=0)
    micro_f1 = f1_score(test_labels, predictions, average='micro', zero_division=0)

    print(f"------ {modelName} Model Metrics -----")
    print(f"Accuracy: {accuracy:.4f}\n"
          f"roc_auc_score_micro: {roc_auc_micro:.4f}\n"
          f"roc_auc_score_weighted: {roc_auc_weighted:.4f}\n"
          f"Precision:\n"
          f"  - Weighted: {weighted_precision:.4f}\n"
          f"  - Micro: {micro_precision:.4f}\n"
          f"Recall:\n"
          f"  - Weighted: {weighted_recall:.4f}\n"
          f"  - Micro: {micro_recall:.4f}\n"
          f"F1-measure:\n"
          f"  - Weighted: {weighted_f1:.4f}\n"
          f"  - Micro: {micro_f1:.4f}")

    # Values are stored pre-formatted as strings for the summary table cell.
    ModelsPerformance[modelName] = {
        "accuracy": f"{accuracy:.4f}",
        "roc_auc_micro": f"{roc_auc_micro:.4f}",
        "roc_auc_weighted": f"{roc_auc_weighted:.4f}",
        "weighted_precision": f"{weighted_precision:.4f}",
        "weighted_recall": f"{weighted_recall:.4f}",
        "weighted_f1": f"{weighted_f1:.4f}",
        "micro_precision": f"{micro_precision:.4f}",
        "micro_recall": f"{micro_recall:.4f}",
        "micro_f1": f"{micro_f1:.4f}"
    }

In [10]:
def plot_confusion_matrix(y_test, y_pred, clf:str, labels=None, num_cols=8):
    """
    Draw one binary confusion matrix per label in a grid of subplots.

    Args:
        y_test: True indicator matrix, indexed as ``y_test[:, i]`` per label.
        y_pred: Predicted indicator matrix, indexed the same way.
        clf (str): Figure title (typically the classifier name).
        labels (list[str], optional): One subplot title per label column.
            Defaults to the notebook's 40 label names.
        num_cols (int): Number of subplot columns; rows are derived from the
            number of labels (40 labels -> 5 x 8 grid).

    Raises:
        ValueError: If the number of names in ``labels`` differs from the
            number of columns in ``y_test``.
    """
    if labels is None:
        labels = ['post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70',
                  'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70',
                  'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70',
                  'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70',
                  'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70',
                  'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70',
                  'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70',
                  'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70']

    num_labels = len(labels)
    if y_test.shape[1] != num_labels:
        raise ValueError(
            f"Got {num_labels} label names for {y_test.shape[1]} label columns; "
            "pass a matching `labels` list."
        )
    num_rows = -(-num_labels // num_cols)  # ceiling division

    # squeeze=False keeps `axes` 2-D even for a single row or column.
    f, axes = plt.subplots(num_rows, num_cols,
                           figsize=(num_cols * 4, num_rows * 4), squeeze=False)
    f.suptitle(clf, fontsize=36)
    axes = axes.ravel()  # flatten for simple per-label indexing

    first_bottom_row_index = (num_rows - 1) * num_cols
    for i, label in enumerate(labels):
        # labels=[0, 1] forces a 2x2 matrix even when a column holds a single
        # class, so it always matches display_labels below.
        cm = confusion_matrix(y_test[:, i], y_pred[:, i], labels=[0, 1])
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
        disp.plot(ax=axes[i], values_format='.4g')
        disp.ax_.set_title(label, fontsize=10)

        # Axis captions only on the outer edge of the grid to reduce clutter.
        if i < first_bottom_row_index:
            disp.ax_.set_xlabel('')
        else:
            disp.ax_.set_xlabel('Predicted Label', fontsize=8)

        if i % num_cols != 0:
            disp.ax_.set_ylabel('')
        else:
            disp.ax_.set_ylabel('True Label', fontsize=8)

        disp.im_.colorbar.remove()  # per-subplot colorbars would crowd the grid

    # Hide any unused cells of a partially filled last row.
    for unused_ax in axes[num_labels:]:
        unused_ax.set_visible(False)

    plt.subplots_adjust(wspace=0.10, hspace=0.1)
    plt.show()

# Classifiers

## Problem transformation

### Bagging

In [None]:
bagClassifier = OneVsRestClassifier(BaggingClassifier(n_jobs=args.n_jobs, 
                                                      random_state=args.seed))
%time bagClassifier.fit(X_train, y_train)


In [None]:
# Test-set predictions of the bagging model; reused by its confusion-matrix cell.
%time bagPreds = bagClassifier.predict(X_test)

### Boosting

In [None]:
boostClassifier = OneVsRestClassifier(GradientBoostingClassifier(
    random_state=args.seed), n_jobs=-1)
%time boostClassifier.fit(X_train, y_train)

In [None]:
# Test-set predictions of the boosting model; reused by its confusion-matrix cell.
%time boostPreds = boostClassifier.predict(X_test)

### OneVsRest - Support Vector Machine

In [None]:
# Standardize the data for SVM.
# The scaler is fitted on the training split only and its mean/std are reused
# for validation and test. Fitting a separate scaler per split would put each
# split in a different feature space than the one the SVM is trained in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_validation_scaled = scaler.transform(X_validation)
X_test_scaled = scaler.transform(X_test)

In [None]:
svmClassifier = OneVsRestClassifier(LinearSVC(
    random_state=args.seed, max_iter=7000, 
    dual=False, tol=1e-5), n_jobs=args.n_jobs)
%time svmClassifier.fit(X_train_scaled, y_train)

In [None]:
%time svmPreds = svmClassifier.predict(X_test)

### Binary Relevance - Support Vector Machine

In [None]:
BinaryRelSVC = BinaryRelevance(LinearSVC(random_state=args.seed,
                                         max_iter=7000, dual=False, 
                                         tol=1e-5),
                                         require_dense = [True, True])
%time BinaryRelSVC.fit(X_train_scaled, y_train)

In [None]:
%time BinRelSVCPreds = BinaryRelSVC.predict(X_test).toarray()

## Problem Adaptation

### Decision Trees

In [27]:
dtClassifier = MultiOutputClassifier(
    DecisionTreeClassifier(random_state=args.seed),
    n_jobs=args.n_jobs
)
%time dtClassifier.fit(X_train, y_train)

CPU times: user 8.56 s, sys: 3.72 s, total: 12.3 s
Wall time: 6h 44min 34s


In [None]:
# Optional: the "Evaluate classifiers" cell below recomputes dtPreds, so this
# standalone prediction can be skipped to save time.
%time dtPreds = dtClassifier.predict(X_test)

### Random Forest

In [28]:
rfClassifier = MultiOutputClassifier(RandomForestClassifier(n_jobs=1, 
                                      random_state=args.seed),
                                     n_jobs=args.n_jobs
                                    )
%time rfClassifier.fit(X_train, y_train)

CPU times: user 1min 16s, sys: 2min 27s, total: 3min 43s
Wall time: 10h 16min 37s


In [None]:
# Optional: the "Evaluate classifiers" cell below recomputes rfPreds, so this
# standalone prediction can be skipped to save time.
%time rfPreds = rfClassifier.predict(X_test)

# Model Evaluation

### Evaluate classifiers

In [29]:
# Fresh results registry; metricsReport() adds one entry per evaluated model.
ModelsPerformance = {}

# The evaluations below are disabled; uncomment one once its classifier has
# been fitted in this session.
#
# --- Bagging Classifier ---
# bagPreds = bagClassifier.predict(X_test)
# bagProbas = bagClassifier.predict_proba(X_test)
# metricsReport("Bagging", y_test, bagPreds, bagProbas)
#
# --- Boosting Classifier ---
# boostPreds = boostClassifier.predict(X_test)
# boostProbas = boostClassifier.predict_proba(X_test)
# metricsReport("Boosting", y_test, boostPreds, boostProbas)
#
# --- SVM-OVR Classifier ---
# Trained on standardized features, so it is scored on X_test_scaled.
# LinearSVC doesn't have predict_proba, so decision_function scores are used.
# svmPreds = svmClassifier.predict(X_test_scaled)
# svmScores = svmClassifier.decision_function(X_test_scaled)
# metricsReport("SVM-OVR", y_test, svmPreds, svmScores)
#
# --- SVM-BR Classifier ---
# NOTE(review): confirm that BinaryRelevance exposes decision_function and
# whether predict() output needs .toarray() before scoring.
# BinRelSVCPreds = BinaryRelSVC.predict(X_test_scaled)
# BinRelSVCScores = BinaryRelSVC.decision_function(X_test_scaled)
# metricsReport("SVM-BR", y_test, BinRelSVCPreds, BinRelSVCScores)

# --- Decision Tree Classifier ---
# Hard predictions feed the threshold metrics; probabilities feed ROC AUC.
dtPreds, dtProbas = dtClassifier.predict(X_test), dtClassifier.predict_proba(X_test)
metricsReport("Decision Tree", y_test, dtPreds, dtProbas)

# --- Random Forest Classifier ---
rfPreds, rfProbas = rfClassifier.predict(X_test), rfClassifier.predict_proba(X_test)
metricsReport("Random Forest", y_test, rfPreds, rfProbas)

------ Decision Tree Model Metrics -----
Accuracy: 0.0193
roc_auc_score_micro: 0.5732
roc_auc_score_weighted: 0.5242
Precision:
  - Weighted: 0.3928
  - Micro: 0.3954
Recall:
  - Weighted: 0.3837
  - Micro: 0.3837
F1-measure:
  - Weighted: 0.3881
  - Micro: 0.3895
------ Random Forest Model Metrics -----
Accuracy: 0.0207
roc_auc_score_micro: 0.7039
roc_auc_score_weighted: 0.5766
Precision:
  - Weighted: 0.5578
  - Micro: 0.6087
Recall:
  - Weighted: 0.2715
  - Micro: 0.2715
F1-measure:
  - Weighted: 0.2716
  - Micro: 0.3755


In [30]:
# Render the collected metrics as a fixed-width text table.
# Column widths are derived from the actual header and cell text, so header,
# rows and separator always line up regardless of model-name length.
table_columns = [
    ("Model Name", None),
    ("Acc.", "accuracy"),
    ("ROC_AUC_Micro", "roc_auc_micro"),
    ("ROC_AUC_Weighted", "roc_auc_weighted"),
    ("weighted_prec", "weighted_precision"),
    ("weighted_rec", "weighted_recall"),
    ("weighted_f1", "weighted_f1"),
    ("micro_prec", "micro_precision"),
    ("micro_rec", "micro_recall"),
    ("micro_f1", "micro_f1"),
]

# First cell is the model name; the rest are the pre-formatted metric strings.
table_rows = [
    [model_name] + [str(metrics[key]) for _, key in table_columns[1:]]
    for model_name, metrics in ModelsPerformance.items()
]

col_widths = [
    max([len(header)] + [len(row[idx]) for row in table_rows])
    for idx, (header, _) in enumerate(table_columns)
]


def format_table_row(cells):
    """Left-align each cell to its column width and join the cells with ' | '."""
    return " " + " | ".join(cell.ljust(width) for cell, width in zip(cells, col_widths))


header_line = format_table_row([header for header, _ in table_columns])
separator = "-" * len(header_line)

print(header_line)
print(separator)
for row in table_rows:
    print(format_table_row(row))
    print(separator)

 Model Name    | Acc.     | ROC_AUC_Micro  | ROC_AUC_Weighted | weighted_prec | weighted_rec | weighted_f1 | micro_prec | micro_rec | micro_f1
------------------------------------------------------------------------------------------------------------------------------------
 Decision Tree | 0.0193  | 0.5732   | 0.5242     | 0.3928        | 0.3837       | 0.3881      | 0.3954     | 0.3837    | 0.3895
------------------------------------------------------------------------------------------------------------------------------------
 Random Forest | 0.0207  | 0.7039   | 0.5766     | 0.5578        | 0.2715       | 0.2716      | 0.6087     | 0.2715    | 0.3755
------------------------------------------------------------------------------------------------------------------------------------


In [None]:
# Optional: per-label precision / recall / F1 for the Random Forest predictions.
# classification_report expects (y_true, y_pred) in that order, and needs one
# target name per label column (40 here).
report_label_names = ['post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70',
                      'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70',
                      'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70',
                      'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70',
                      'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70',
                      'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70',
                      'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70',
                      'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70']
# zero_division=0 matches metricsReport and silences undefined-metric warnings
# for labels that are never predicted.
print(classification_report(y_test, rfPreds,
                            target_names=report_label_names,
                            zero_division=0))

### Plot Confusion Matrix

In [None]:
# Bagging Classifier
plot_confusion_matrix(y_test, bagPreds, "Bagging Classifier")

In [None]:
# Boosting Classifier
plot_confusion_matrix(y_test, boostPreds, "Boosting Classifier")

In [None]:
# SVM-OVR Classifier
plot_confusion_matrix(y_test, svmPreds, "SVM - One vs Rest")

In [None]:
# SVM - Binary Relevance Classifier
plot_confusion_matrix(y_test, BinRelSVCPreds, "SVM - Binary Relevance")

In [None]:
# Decision Trees Classifier
plot_confusion_matrix(y_test, dtPreds, "Decision Tree")

In [None]:
# Random Forest Classifier
plot_confusion_matrix(y_test, rfPreds, "Random Forest Classifier")

### Plot ROC Curve for the best classifier

In [None]:
# Plot ROC Curve for the best classifier (RF)
label_names = ['post1geo10', 'post1geo20', 'post1geo30', 'post1geo50', 'post1geo70',
              'post2geo10', 'post2geo20', 'post2geo30', 'post2geo50', 'post2geo70',
              'post3geo10', 'post3geo20', 'post3geo30', 'post3geo50', 'post3geo70',
              'post7geo10', 'post7geo20', 'post7geo30', 'post7geo50', 'post7geo70',
              'pre1geo10', 'pre1geo20', 'pre1geo30', 'pre1geo50', 'pre1geo70',
              'pre2geo10', 'pre2geo20', 'pre2geo30', 'pre2geo50', 'pre2geo70',
              'pre3geo10', 'pre3geo20', 'pre3geo30', 'pre3geo50', 'pre3geo70',
              'pre7geo10', 'pre7geo20', 'pre7geo30', 'pre7geo50', 'pre7geo70']

# rfClassifier is a MultiOutputClassifier: after fitting it holds one fitted
# RandomForestClassifier per label in `estimators_`. Those are plotted directly
# instead of refitting, which would fail on a 1-D target, repeat the hours-long
# training once per label, and overwrite the model evaluated above.
fig, ax = plt.subplots(figsize=(12, 10))
labelPlots = {}
for i, label_name in enumerate(label_names):
    labelPlots[i] = plot_roc_curve(rfClassifier.estimators_[i], X_test, y_test[:, i],
                                   name=label_name, ax=ax, alpha=0.8)

ax.set_title("ROC comparison per class label - RF classifier")
# 40 curves share one axes: keep the legend compact so it does not hide them.
ax.legend(loc="lower right", fontsize="x-small", ncol=2)
plt.show()