In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np

def parse_bnc_folder(folder_path, file_limit=2):
    """Parse tagged XML transcripts into sentences with per-token binary labels.

    Every ``.xml`` file directly inside ``folder_path`` is parsed (in sorted
    filename order) until ``file_limit`` files have been read. For each ``<u>``
    element that is a direct child of the document root, the text of its direct
    ``<w>`` children is joined with single spaces to form a sentence, and each
    token gets label 1 when its ``pos`` attribute is ``'RR'`` or ``'UH'``
    (presumably the tagset's adverb / interjection codes -- confirm against the
    corpus documentation) and 0 otherwise.

    Args:
        folder_path: Directory containing the XML files.
        file_limit: Maximum number of XML files to parse. ``None`` parses
            every XML file in the folder.

    Returns:
        A ``pandas.DataFrame`` with columns ``'sentence'`` (str) and
        ``'labels'`` (list of 0/1 ints, one per token kept in the sentence).

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    target_tags = {'RR', 'UH'}
    data = []
    file_count = 0
    print(f"Starting to parse XML files in '{folder_path}'")
    # os.listdir() returns entries in arbitrary order; sorting makes the subset
    # selected by file_limit reproducible across runs and platforms.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".xml"):
            continue
        print(f"Parsing file: {filename}...")
        root = ET.parse(os.path.join(folder_path, filename)).getroot()
        for u in root.findall('u'):
            sentence = []
            labels = []
            for w in u.findall('w'):
                # Empty <w/> elements have text None, which would break the
                # join below; drop the token and its label together so the
                # two lists stay aligned.
                if w.text is None:
                    continue
                sentence.append(w.text)
                # .get() tolerates tokens without a 'pos' attribute (label 0).
                labels.append(1 if w.get('pos') in target_tags else 0)
            data.append((' '.join(sentence), labels))
        file_count += 1
        if file_limit is not None and file_count >= file_limit:
            print(f"Parsed {file_count} files. Stopping as per the file limit.")
            break
    print("Finished parsing files.")
    return pd.DataFrame(data, columns=['sentence', 'labels'])

def preprocess_and_fine_tune_bert(folder_path, model_save_path, file_limit=2):
    """Fine-tune ``bert-base-uncased`` as a binary sentence classifier and report results.

    Pipeline:
      1. Parse the XML files in ``folder_path`` with ``parse_bnc_folder``.
      2. Collapse each sentence's token labels to one label: 1 if any token
         is labelled 1, else 0.
      3. Split 80/20 into train/test (``random_state=42``).
      4. Tokenize (padded/truncated to 128 tokens) and train for one epoch.
      5. Save the fine-tuned model and tokenizer to ``model_save_path``.
      6. Write ``classification_report.txt`` and ``errors.txt`` (misclassified
         test sentences) into ``model_save_path``.

    Args:
        folder_path: Directory containing the tagged XML files.
        model_save_path: Output directory for checkpoints, the final model,
            and the report files. Created if it does not exist.
        file_limit: Maximum number of XML files to parse; forwarded to
            ``parse_bnc_folder``.

    Raises:
        ValueError: If no sentences were parsed from ``folder_path``.
    """
    print("Starting preprocessing and fine-tuning of BERT...")

    df = parse_bnc_folder(folder_path, file_limit=file_limit)
    if df.empty:
        raise ValueError(f"No sentences were parsed from '{folder_path}'; nothing to train on.")

    # Sentence-level target: positive when at least one token is labelled 1.
    df['labels'] = df['labels'].apply(lambda x: 1 if 1 in x else 0)

    print("Completed parsing and preprocessing. Splitting data into train and test sets...")

    X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['labels'], test_size=0.2, random_state=42)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def tokenize_function(examples):
        return tokenizer(examples['sentence'], padding='max_length', truncation=True, max_length=128)

    train_data = Dataset.from_pandas(pd.DataFrame({'sentence': X_train, 'labels': y_train}))
    test_data = Dataset.from_pandas(pd.DataFrame({'sentence': X_test, 'labels': y_test}))

    train_data = train_data.map(tokenize_function, batched=True)
    test_data = test_data.map(tokenize_function, batched=True)

    print("Data tokenization complete. Loading BERT model...")

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    training_args = TrainingArguments(
        output_dir=model_save_path,
        # NOTE(review): newer transformers releases rename this argument to
        # `eval_strategy`; confirm against the installed version.
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,  # Reduced for testing
        save_steps=10_000,
        save_total_limit=2,
        logging_dir='./logs',
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # makes TrainingArguments fail on CPU-only machines.
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,  # effective batch size of 16 per device
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=test_data
    )

    print("Training BERT model...")
    trainer.train()

    # save_steps is far larger than the number of steps in a short run, so no
    # checkpoint may ever be written; persist the final weights explicitly.
    # The directory is also needed for the report files written below.
    os.makedirs(model_save_path, exist_ok=True)
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model and tokenizer saved to {model_save_path}")

    print("Training complete. Evaluating model on test set...")

    predictions = trainer.predict(test_data)
    preds = np.argmax(predictions.predictions, axis=1)

    report = classification_report(y_test, preds)

    with open(f'{model_save_path}/classification_report.txt', 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"Classification report saved to {model_save_path}/classification_report.txt")

    # y_test / X_test keep their shuffled original index, so pair predictions
    # with them positionally (zip / iloc) rather than by index label.
    errors = [
        (X_test.iloc[idx], true, pred)
        for idx, (pred, true) in enumerate(zip(preds, y_test))
        if pred != true
    ]

    with open(f'{model_save_path}/errors.txt', 'w', encoding='utf-8') as f:
        f.write("Sentence, True Label, Predicted Label\n")
        for sentence, true_label, pred_label in errors:
            f.write(f"'{sentence}', {true_label}, {pred_label}\n")

    print(f"Errors saved to {model_save_path}/errors.txt")

# Script entry point: fine-tune on the tagged spoken transcripts and write the
# outputs into the 'fine_tuned_bert_model' directory.
if __name__ == "__main__":
    preprocess_and_fine_tune_bert(
        folder_path='spoken/tagged',
        model_save_path='fine_tuned_bert_model',
    )


Starting preprocessing and fine-tuning of BERT...
Starting to parse XML files in 'spoken/tagged'
Parsing file: S23A-tgd.xml...
Parsing file: S24A-tgd.xml...
Parsed 2 files. Stopping as per the file limit.
Finished parsing files.
Completed parsing and preprocessing. Splitting data into train and test sets...


Map:   0%|          | 0/3334 [00:00<?, ? examples/s]

Map:   0%|          | 0/834 [00:00<?, ? examples/s]

Data tokenization complete. Loading BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training BERT model...


Epoch,Training Loss,Validation Loss
0,No log,0.185048


Training complete. Evaluating model on test set...


Classification report saved to fine_tuned_bert_model/classification_report.txt
Errors saved to fine_tuned_bert_model/errors.txt
