In [8]:
import os
import random
import pandas as pd
import re
import time
import nltk
import torch
import numpy as np
import spacy
from scipy.sparse import csr_matrix
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torch.utils.data import Dataset
from transformers.trainer_utils import EvalPrediction

nltk.download('punkt')

try:
    spacy_model = spacy.load("de_core_news_sm")
except OSError:
    print("Downloading 'de_core_news_sm' model...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "de_core_news_sm"], check=True)
    spacy_model = spacy.load("de_core_news_sm")
semantic_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/mabdelaal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:

# Define paths
ORIGINAL_FOLDER = "data/augmented_essays_data"
AUGMENTED_FOLDER = "data/Essays_dataset"
USE_AUGMENTED = False

In [13]:
def load_data_from_folders(original_folder, augmented_folder=None, use_augmented=True):
    file_data = []
    folders_to_read = [original_folder]
    if use_augmented and augmented_folder:
        folders_to_read.append(augmented_folder)

    for folder in folders_to_read:
        for filename in os.listdir(folder):
            if filename.endswith(".txt"):
                with open(os.path.join(folder, filename), "r", encoding="utf-8") as file:
                    lines = file.readlines()
                    for line in lines:
                        match = re.match(r'\d+\s+\[(.*?)\](\w+)', line)
                        if match:
                            text, label = match.groups()
                            file_data.append((text, label, filename))

    return pd.DataFrame(file_data, columns=['Text', 'Label', 'Filename'])

df = load_data_from_folders(ORIGINAL_FOLDER, AUGMENTED_FOLDER, USE_AUGMENTED)

In [14]:
label_counts = df['Label'].value_counts().sort_index()
print(label_counts)

Label
AHG    120
ATH     16
CON    696
FAZ    244
GLD    104
PRO    936
SON      8
TH1     32
TH2     24
WHG    276
ZTH     88
Name: count, dtype: int64


In [66]:
# ArgumentDataset class for XLM-Roberta
def label_to_id(labels):
    label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    return [label_map[label] for label in labels]

class ArgumentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = label_to_id(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text, 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

In [67]:

class TextFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.Series):
            X = pd.Series(X)
        X = X.reset_index(drop=True)
        X_clean = X.fillna("").apply(str).str.strip()

        features = pd.DataFrame(index=X.index)
        features['word_count'] = X_clean.apply(lambda x: len(word_tokenize(x)) if x else 0)
        features['punctuation_count'] = X_clean.apply(lambda x: len(re.findall(r'[.,!?]', x)) if x else 0)
        features['sentiment'] = X_clean.apply(lambda x: TextBlob(x).sentiment.polarity if x else 0)

        pos_counts = []
        dep_counts = []

        for doc in spacy_model.pipe(X_clean.tolist(), disable=["ner"]):
            pos = doc.count_by(spacy.attrs.POS)
            dep = doc.count_by(spacy.attrs.DEP)
            pos_counts.append([
                pos.get(spacy.symbols.NOUN, 0),
                pos.get(spacy.symbols.VERB, 0),
                pos.get(spacy.symbols.ADJ, 0)
            ])
            dep_counts.append([
                dep.get(spacy.symbols.nsubj, 0),
                dep.get(spacy.symbols.dobj, 0)
            ])

        pos_df = pd.DataFrame(pos_counts, columns=['noun_count', 'verb_count', 'adj_count'], index=X.index)
        dep_df = pd.DataFrame(dep_counts, columns=['subj_count', 'obj_count'], index=X.index)

        features = pd.concat([features, pos_df, dep_df], axis=1)
        features = features.fillna(0)

        # Semantic embeddings
        embeddings = semantic_model.encode(X_clean.tolist(), show_progress_bar=False)
        return np.hstack((features.values, embeddings))




        

In [68]:
def evaluate_model(y_true, y_pred, label_names):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=label_names, zero_division=0))
    print(f"\nOverall Metrics:\nAccuracy: {acc:.4f}\nPrecision: {prec:.4f}\nRecall: {rec:.4f}\nF1 Score: {f1:.4f}")

def train_xlm_roberta():
    labels = df['Label'].astype('category')
    label_names = labels.cat.categories.tolist()
    y = labels.cat.codes

    tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    model = XLMRobertaForSequenceClassification.from_pretrained(
        "xlm-roberta-base",
        num_labels=len(label_names)
    )

    X_train, X_test, y_train, y_test = train_test_split(
        df['Text'].tolist(), y.tolist(), test_size=0.2, random_state=42
    )

    train_dataset = ArgumentDataset(X_train, y_train, tokenizer)
    test_dataset = ArgumentDataset(X_test, y_test, tokenizer)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    trainer.train()
    predictions = trainer.predict(test_dataset)
    y_pred = np.argmax(predictions.predictions, axis=1)
    evaluate_model(y_test, y_pred, label_names)

In [69]:
def train_classical_models():
    

    labels = df['Label'].astype('category')
    label_names = labels.cat.categories.tolist()
    y = labels.cat.codes
    X = df['Text']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    extractor = TextFeatures()
    X_train_feat = extractor.fit_transform(X_train)
    X_test_feat = extractor.transform(X_test)

    models = {
        'SVM': SVC(kernel='linear', class_weight='balanced'),
        'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    }

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_feat, y_train)
        y_pred = model.predict(X_test_feat)
        print(f"{name} Performance:")
        evaluate_model(y_test, y_pred, label_names)

In [70]:
train_classical_models()
train_xlm_roberta()



Training SVM...
SVM Performance:

Classification Report:
              precision    recall  f1-score   support

         AHG       0.81      0.93      0.86        27
         ATH       0.43      1.00      0.60         3
         CON       0.67      0.67      0.67       165
         FAZ       0.74      0.72      0.73        69
         GLD       0.96      0.82      0.88        28
         PRO       0.74      0.72      0.73       216
         SON       1.00      1.00      1.00         3
         TH1       0.75      0.82      0.78        11
         TH2       0.86      0.86      0.86         7
         WHG       0.88      0.90      0.89        83
         ZTH       0.91      0.88      0.89        24

    accuracy                           0.76       636
   macro avg       0.79      0.85      0.81       636
weighted avg       0.76      0.76      0.76       636


Overall Metrics:
Accuracy: 0.7579
Precision: 0.7606
Recall: 0.7579
F1 Score: 0.7582

Training RandomForest...
RandomForest Perfo

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6692,1.821548
2,1.7901,1.815794
3,1.6611,1.810768



Classification Report:
              precision    recall  f1-score   support

         AHG       0.00      0.00      0.00        27
         ATH       0.00      0.00      0.00         3
         CON       0.00      0.00      0.00       165
         FAZ       0.00      0.00      0.00        69
         GLD       0.00      0.00      0.00        28
         PRO       0.34      1.00      0.51       216
         SON       0.00      0.00      0.00         3
         TH1       0.00      0.00      0.00        11
         TH2       0.00      0.00      0.00         7
         WHG       0.00      0.00      0.00        83
         ZTH       0.00      0.00      0.00        24

    accuracy                           0.34       636
   macro avg       0.03      0.09      0.05       636
weighted avg       0.12      0.34      0.17       636


Overall Metrics:
Accuracy: 0.3396
Precision: 0.1153
Recall: 0.3396
F1 Score: 0.1722


In [44]:
df['Label'].value_counts()

Label
PRO    936
CON    696
WHG    276
FAZ    244
AHG    120
GLD    104
ZTH     88
TH1     32
TH2     24
ATH     16
SON      8
Name: count, dtype: int64