Importing libraries

In [None]:
# Importing libraries
import pandas as pd
import sklearn
import numpy  as np
import string
import re

#NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' and 'punkt' resources when this cell runs.
nltk.download('stopwords')
nltk.download('punkt')
import spacy
from spacy.symbols import nsubj, VERB
# Load the spaCy pipeline named "nl_core_news_sm"; spacy.load does not download
# it, so the package has to be installed in the environment beforehand.
nlp = spacy.load("nl_core_news_sm")
#from spacy.lang.nl.stop_words import STOP_WORDS

# model implementation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import tensorflow as tf
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# NOTE(review): `nlp`, `lemmatizer`, `word_tokenize` and `stopwords` are not
# referenced in the cells visible below - confirm whether they are still needed.
lemmatizer = nltk.stem.WordNetLemmatizer()

Loading datasets

In [None]:
# Labelled corpus that the training sample is drawn from
df = pd.read_csv("classification_cleaned_data.csv")

# Manually labelled evaluation sets, one per target label.
# NOTE(review): the file names say "training", yet these frames are only used
# as test sets in the model cells - confirm these are the intended files.
df_test_q, df_test_c, df_test_d = map(
    pd.read_csv,
    [
        "question_training_dataset.csv",
        "concern_training_dataset.csv",
        "doubt_training_dataset.csv",
    ],
)

In [None]:
# Two-stage split of the full corpus: 60% train / 20% validation / 20% test.
# NOTE: train_data and val_data are reassigned further down, in the cell that
# splits `train_sample`.

# Stage 1: hold out 40% of the rows
train_data, remaining_data = train_test_split(
    df,
    test_size=0.4,
    random_state=42,
)

# Stage 2: cut the held-out 40% in half -> validation and test
val_data, test_data = train_test_split(
    remaining_data,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Cast the text column of every split to str (this converts, it does not
# merely check), so each value handed to later steps is a Python string.
for split_frame in (train_data, test_data, val_data):
    split_frame['clean_text'] = split_frame['clean_text'].astype(str)

Taking samples for model training due to computational constraints

In [None]:
# Number of positive rows drawn per label; kept small because of the
# computational constraints mentioned above.
SAMPLES_PER_LABEL = 50


def _sample_positives(frame, label_column, n=SAMPLES_PER_LABEL, seed=42):
    """Return `n` randomly drawn rows of `frame` whose `label_column` is True.

    Raises:
        ValueError: Propagated from ``DataFrame.sample`` when fewer than `n`
            positive rows are available.
    """
    positives = frame[frame[label_column] == True]  # noqa: E712 - element-wise comparison
    return positives.sample(n=n, random_state=seed)


# One independent draw of positives per label
sample_question = _sample_positives(df, 'is_question')
sample_concern = _sample_positives(df, 'is_concern')
sample_doubt = _sample_positives(df, 'is_doubt')

# Combine the samples into one dataframe
combined_sample = pd.concat([sample_question, sample_concern, sample_doubt])

# A row that is positive for more than one label can be drawn by several of the
# samples above. Keep only its first occurrence (rows are identified by their
# original index in `df`), otherwise copies of the same text can end up on both
# sides of the later train/validation split.
combined_sample = combined_sample[~combined_sample.index.duplicated(keep='first')]

# Shuffle the combined dataframe
train_sample = combined_sample.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the size after de-duplication and preview a few rows instead of
# dumping the whole frame
print(f"{len(train_sample)} unique rows sampled")
train_sample.head()

Splitting the sample into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows for validation; the remaining 80% are used
# for training. This reassigns train_data / val_data.
train_data, val_data = train_test_split(
    train_sample,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Cast the text column of every frame used by the models to str
for text_frame in (df_test_q, df_test_c, df_test_d, train_data, val_data):
    text_frame['clean_text'] = text_frame['clean_text'].astype(str)

In [None]:
# Turn the dataframes into plain arrays for the model code below.

# Training split: one shared text array and one label vector per classifier
X_train = np.stack(train_data['clean_text'].values)
y_train_q, y_train_c, y_train_d = (
    train_data[label].values for label in ('is_question', 'is_concern', 'is_doubt')
)

# Test sets: each label has its own dataframe
X_test_q, y_test_q = np.stack(df_test_q['clean_text'].values), df_test_q['is_question'].values
X_test_c, y_test_c = np.stack(df_test_c['clean_text'].values), df_test_c['is_concern'].values
X_test_d, y_test_d = np.stack(df_test_d['clean_text'].values), df_test_d['is_doubt'].values

# Validation split: same layout as the training split
X_val = np.stack(val_data['clean_text'].values)
y_val_q, y_val_c, y_val_d = (
    val_data[label].values for label in ('is_question', 'is_concern', 'is_doubt')
)

Question model

In [None]:
# Tokenizer for the same checkpoint name that the models below are loaded from
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')


def create_dataset(encodings, labels, batch_size=16, shuffle=False):
    """Wrap tokenizer output and labels in a batched ``tf.data.Dataset``.

    Args:
        encodings: Tokenizer output; converted to a plain dict with ``dict()``.
        labels: Labels aligned element-wise with the encodings.
        batch_size: Number of examples per batch.
        shuffle: When truthy, shuffle with a buffer of 1000 elements before
            batching.

    Returns:
        A dataset of ``(features, labels)`` batches.
    """
    features = dict(encodings)
    examples = tf.data.Dataset.from_tensor_slices((features, labels))
    ordered = examples.shuffle(1000) if shuffle else examples
    return ordered.batch(batch_size)

# 3-fold cross-validation of the question classifier on the training split
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# One dict of per-epoch curves is appended per fold
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")

    # Texts and question labels belonging to this fold
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train_q[train_idx], y_train_q[val_idx]

    # Tokenize the two parts separately
    train_encodings_fold = tokenizer(
        list(X_train_fold), truncation=True, padding=True, max_length=128
    )
    val_encodings_fold = tokenizer(
        list(X_val_fold), truncation=True, padding=True, max_length=128
    )

    # Only the training part is shuffled
    train_dataset_fold = create_dataset(train_encodings_fold, y_train_fold, shuffle=True)
    val_dataset_fold = create_dataset(val_encodings_fold, y_val_fold, batch_size=64)

    # Reload the pretrained checkpoint so no fold starts from another fold's weights
    model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-multilingual-cased'
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Stop once val_loss has not improved by at least 0.001 for 2 epochs
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=2,
            min_delta=0.001,
            restore_best_weights=True,
        )
    ]

    history = model.fit(
        train_dataset_fold,
        validation_data=val_dataset_fold,
        epochs=5,
        callbacks=callbacks,
    )

    # Keep the per-epoch curves under descriptive keys
    fold_metrics.append({
        stored_name: history.history[keras_name]
        for stored_name, keras_name in (
            ('train_loss', 'loss'),
            ('val_loss', 'val_loss'),
            ('train_accuracy', 'accuracy'),
            ('val_accuracy', 'val_accuracy'),
        )
    })

def plot_metrics(metrics, metric_name):
    """Draw one line per fold for a single metric and show the figure.

    Args:
        metrics: Iterable with one sequence of per-epoch values per fold.
        metric_name: Text used for the y-axis label and in the title.
    """
    for fold, curve in enumerate(metrics):
        plt.plot(curve, label=f'Fold {fold + 1}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} per Fold')
    plt.legend()
    plt.show()


# Regroup the stored history: one list of per-fold curves per metric
train_loss_per_fold, val_loss_per_fold, train_accuracy_per_fold, val_accuracy_per_fold = (
    [m[key] for m in fold_metrics]
    for key in ('train_loss', 'val_loss', 'train_accuracy', 'val_accuracy')
)

# One figure per metric
for per_fold_curves, title in (
    (train_loss_per_fold, 'Training Loss'),
    (val_loss_per_fold, 'Validation Loss'),
    (train_accuracy_per_fold, 'Training Accuracy'),
    (val_accuracy_per_fold, 'Validation Accuracy'),
):
    plot_metrics(per_fold_curves, title)

# Final question model: train on the whole training split, early-stop on the
# validation split, then evaluate on the manually labelled question test set.

# Test data
X_test_q = np.stack(df_test_q['clean_text'].values)
y_test_q = df_test_q['is_question'].values

# Tokenize the test data
test_texts = list(X_test_q)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
test_dataset_q = create_dataset(test_encodings, y_test_q, batch_size=64)

# Create and compile the final model from the pretrained checkpoint
final_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')
final_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])

# Entire training split
final_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
final_train_dataset = create_dataset(final_train_encodings, y_train_q, shuffle=True)

# Validation split (X_val / y_val_q). Without validation data Keras never
# reports `val_loss`, so an EarlyStopping callback monitoring it cannot act.
final_val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128)
final_val_dataset = create_dataset(final_val_encodings, y_val_q, batch_size=64)

# Fresh callback instance with the same settings as in cross-validation,
# instead of reusing the object left over from the last fold.
final_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, min_delta=0.001, restore_best_weights=True)
]

final_model.fit(final_train_dataset,
                validation_data=final_val_dataset,
                epochs=5,
                callbacks=final_callbacks)

# Predict on the test set; the prediction output exposes the raw scores as
# `.logits`, and the arg-max over axis 1 is the predicted class.
logits = final_model.predict(test_dataset_q)
y_preds_q = np.argmax(logits.logits, axis=1)

# Print classification report
print("DistilBERT Classification Report:")
print(classification_report(y_test_q, y_preds_q))

# Plot confusion matrix
cm_q = confusion_matrix(y_test_q, y_preds_q)
sns.heatmap(cm_q, annot=True, fmt='d', cmap='Blues')
plt.title('DistilBERT Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Identify false positives and false negatives (indices into test_texts)
false_positives = np.where((y_preds_q == 1) & (y_test_q == 0))[0]
false_negatives = np.where((y_preds_q == 0) & (y_test_q == 1))[0]

print("\nFalse Positives:")
for idx in false_positives:
    print(test_texts[idx])

print("\nFalse Negatives:")
for idx in false_negatives:
    print(test_texts[idx])

Concern model

In [None]:
# Same tokenizer and helper as in the "Question model" cell, defined again here
# so this cell can be run on its own.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')


def create_dataset(encodings, labels, batch_size=16, shuffle=False):
    """Wrap tokenizer output and labels in a batched ``tf.data.Dataset``.

    Args:
        encodings: Tokenizer output; converted to a plain dict with ``dict()``.
        labels: Labels aligned element-wise with the encodings.
        batch_size: Number of examples per batch.
        shuffle: When truthy, shuffle with a buffer of 1000 elements before
            batching.

    Returns:
        A dataset of ``(features, labels)`` batches.
    """
    features = dict(encodings)
    examples = tf.data.Dataset.from_tensor_slices((features, labels))
    ordered = examples.shuffle(1000) if shuffle else examples
    return ordered.batch(batch_size)

# 3-fold cross-validation of the concern classifier on the training split
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# One dict of per-epoch curves is appended per fold
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")

    # Split data. The labels must be the *concern* labels (y_train_c); slicing
    # y_train_q here would cross-validate the question task a second time.
    X_train_fold = X_train[train_idx]
    X_val_fold = X_train[val_idx]
    y_train_fold = y_train_c[train_idx]
    y_val_fold = y_train_c[val_idx]

    # Tokenize data
    train_encodings_fold = tokenizer(list(X_train_fold), truncation=True, padding=True, max_length=128)
    val_encodings_fold = tokenizer(list(X_val_fold), truncation=True, padding=True, max_length=128)

    # Create TensorFlow datasets (only the training part is shuffled)
    train_dataset_fold = create_dataset(train_encodings_fold, y_train_fold, shuffle=True)
    val_dataset_fold = create_dataset(val_encodings_fold, y_val_fold, batch_size=64)

    # Reload the pretrained checkpoint so no fold starts from another fold's weights
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Stop once val_loss has not improved by at least 0.001 for 2 epochs
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, min_delta=0.001, restore_best_weights=True)
    ]

    # Train the model
    history = model.fit(train_dataset_fold,
                        validation_data=val_dataset_fold,
                        epochs=5,
                        callbacks=callbacks)

    # Store the per-epoch curves of this fold
    fold_metrics.append({
        'train_loss': history.history['loss'],
        'val_loss': history.history['val_loss'],
        'train_accuracy': history.history['accuracy'],
        'val_accuracy': history.history['val_accuracy']
    })

def plot_metrics(metrics, metric_name):
    """Draw one line per fold for a single metric and show the figure.

    Args:
        metrics: Iterable with one sequence of per-epoch values per fold.
        metric_name: Text used for the y-axis label and in the title.
    """
    for fold, curve in enumerate(metrics):
        plt.plot(curve, label=f'Fold {fold + 1}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} per Fold')
    plt.legend()
    plt.show()


# Regroup the stored history: one list of per-fold curves per metric
train_loss_per_fold, val_loss_per_fold, train_accuracy_per_fold, val_accuracy_per_fold = (
    [m[key] for m in fold_metrics]
    for key in ('train_loss', 'val_loss', 'train_accuracy', 'val_accuracy')
)

# One figure per metric
for per_fold_curves, title in (
    (train_loss_per_fold, 'Training Loss'),
    (val_loss_per_fold, 'Validation Loss'),
    (train_accuracy_per_fold, 'Training Accuracy'),
    (val_accuracy_per_fold, 'Validation Accuracy'),
):
    plot_metrics(per_fold_curves, title)

# Final concern model: train on the whole training split, early-stop on the
# validation split, then evaluate on the manually labelled concern test set.

# Test data
X_test_c = np.stack(df_test_c['clean_text'].values)
y_test_c = df_test_c['is_concern'].values

# Tokenize the test data
test_texts = list(X_test_c)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
test_dataset_c = create_dataset(test_encodings, y_test_c, batch_size=64)

# Create and compile the final model from the pretrained checkpoint
final_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')
final_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])

# Entire training split
final_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
final_train_dataset = create_dataset(final_train_encodings, y_train_c, shuffle=True)

# Validation split (X_val / y_val_c). Without validation data Keras never
# reports `val_loss`, so an EarlyStopping callback monitoring it cannot act.
final_val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128)
final_val_dataset = create_dataset(final_val_encodings, y_val_c, batch_size=64)

# Fresh callback instance with the same settings as in cross-validation,
# instead of reusing the object left over from the last fold.
final_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, min_delta=0.001, restore_best_weights=True)
]

final_model.fit(final_train_dataset,
                validation_data=final_val_dataset,
                epochs=5,
                callbacks=final_callbacks)

# Predict on the test set; the prediction output exposes the raw scores as
# `.logits`, and the arg-max over axis 1 is the predicted class.
logits = final_model.predict(test_dataset_c)
y_preds_c = np.argmax(logits.logits, axis=1)

# Print classification report
print("DistilBERT Classification Report:")
print(classification_report(y_test_c, y_preds_c))

# Plot confusion matrix
cm_c = confusion_matrix(y_test_c, y_preds_c)
sns.heatmap(cm_c, annot=True, fmt='d', cmap='Blues')
plt.title('DistilBERT Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Identify false positives and false negatives (indices into test_texts)
false_positives = np.where((y_preds_c == 1) & (y_test_c == 0))[0]
false_negatives = np.where((y_preds_c == 0) & (y_test_c == 1))[0]

print("\nFalse Positives:")
for idx in false_positives:
    print(test_texts[idx])

print("\nFalse Negatives:")
for idx in false_negatives:
    print(test_texts[idx])

Doubt model

In [None]:
# Same tokenizer and helper as in the "Question model" cell, defined again here
# so this cell can be run on its own.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-multilingual-cased')


def create_dataset(encodings, labels, batch_size=16, shuffle=False):
    """Wrap tokenizer output and labels in a batched ``tf.data.Dataset``.

    Args:
        encodings: Tokenizer output; converted to a plain dict with ``dict()``.
        labels: Labels aligned element-wise with the encodings.
        batch_size: Number of examples per batch.
        shuffle: When truthy, shuffle with a buffer of 1000 elements before
            batching.

    Returns:
        A dataset of ``(features, labels)`` batches.
    """
    features = dict(encodings)
    examples = tf.data.Dataset.from_tensor_slices((features, labels))
    ordered = examples.shuffle(1000) if shuffle else examples
    return ordered.batch(batch_size)

# 3-fold cross-validation of the doubt classifier on the training split
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# One dict of per-epoch curves is appended per fold
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"Fold {fold + 1}")

    # Split data. The labels must be the *doubt* labels (y_train_d); slicing
    # y_train_q here would cross-validate the question task instead.
    X_train_fold = X_train[train_idx]
    X_val_fold = X_train[val_idx]
    y_train_fold = y_train_d[train_idx]
    y_val_fold = y_train_d[val_idx]

    # Tokenize data
    train_encodings_fold = tokenizer(list(X_train_fold), truncation=True, padding=True, max_length=128)
    val_encodings_fold = tokenizer(list(X_val_fold), truncation=True, padding=True, max_length=128)

    # Create TensorFlow datasets (only the training part is shuffled)
    train_dataset_fold = create_dataset(train_encodings_fold, y_train_fold, shuffle=True)
    val_dataset_fold = create_dataset(val_encodings_fold, y_val_fold, batch_size=64)

    # Reload the pretrained checkpoint so no fold starts from another fold's weights
    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Stop once val_loss has not improved by at least 0.001 for 3 epochs
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001, restore_best_weights=True)
    ]

    # Train the model
    history = model.fit(train_dataset_fold,
                        validation_data=val_dataset_fold,
                        epochs=5,
                        callbacks=callbacks)

    # Store the per-epoch curves of this fold
    fold_metrics.append({
        'train_loss': history.history['loss'],
        'val_loss': history.history['val_loss'],
        'train_accuracy': history.history['accuracy'],
        'val_accuracy': history.history['val_accuracy']
    })

def plot_metrics(metrics, metric_name):
    """Draw one line per fold for a single metric and show the figure.

    Args:
        metrics: Iterable with one sequence of per-epoch values per fold.
        metric_name: Text used for the y-axis label and in the title.
    """
    for fold, curve in enumerate(metrics):
        plt.plot(curve, label=f'Fold {fold + 1}')
    plt.xlabel('Epochs')
    plt.ylabel(metric_name)
    plt.title(f'{metric_name} per Fold')
    plt.legend()
    plt.show()


# Regroup the stored history: one list of per-fold curves per metric
train_loss_per_fold, val_loss_per_fold, train_accuracy_per_fold, val_accuracy_per_fold = (
    [m[key] for m in fold_metrics]
    for key in ('train_loss', 'val_loss', 'train_accuracy', 'val_accuracy')
)

# One figure per metric
for per_fold_curves, title in (
    (train_loss_per_fold, 'Training Loss'),
    (val_loss_per_fold, 'Validation Loss'),
    (train_accuracy_per_fold, 'Training Accuracy'),
    (val_accuracy_per_fold, 'Validation Accuracy'),
):
    plot_metrics(per_fold_curves, title)

# Final doubt model: train on the whole training split, early-stop on the
# validation split, then evaluate on the manually labelled doubt test set.

# Test data
X_test_d = np.stack(df_test_d['clean_text'].values)
y_test_d = df_test_d['is_doubt'].values

# Tokenize the test data
test_texts = list(X_test_d)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)
test_dataset_d = create_dataset(test_encodings, y_test_d, batch_size=64)

# Create and compile the final model from the pretrained checkpoint
final_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased')
final_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    metrics=['accuracy'])

# Entire training split
final_train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
final_train_dataset = create_dataset(final_train_encodings, y_train_d, shuffle=True)

# Validation split (X_val / y_val_d). Without validation data Keras never
# reports `val_loss`, so an EarlyStopping callback monitoring it cannot act.
final_val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128)
final_val_dataset = create_dataset(final_val_encodings, y_val_d, batch_size=64)

# Fresh callback instance with the same settings as in this cell's
# cross-validation (patience=3), instead of reusing the object left over from
# the last fold.
final_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001, restore_best_weights=True)
]

final_model.fit(final_train_dataset,
                validation_data=final_val_dataset,
                epochs=5,
                callbacks=final_callbacks)

# Predict on the test set; the prediction output exposes the raw scores as
# `.logits`, and the arg-max over axis 1 is the predicted class.
logits = final_model.predict(test_dataset_d)
y_preds_d = np.argmax(logits.logits, axis=1)

# Print classification report
print("DistilBERT Classification Report:")
print(classification_report(y_test_d, y_preds_d))

# Plot confusion matrix
cm_d = confusion_matrix(y_test_d, y_preds_d)
sns.heatmap(cm_d, annot=True, fmt='d', cmap='Blues')
plt.title('DistilBERT Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Identify false positives and false negatives (indices into test_texts)
false_positives = np.where((y_preds_d == 1) & (y_test_d == 0))[0]
false_negatives = np.where((y_preds_d == 0) & (y_test_d == 1))[0]

print("\nFalse Positives:")
for idx in false_positives:
    print(test_texts[idx])

print("\nFalse Negatives:")
for idx in false_negatives:
    print(test_texts[idx])