In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.utils import class_weight
import numpy as np
import matplotlib.pyplot as plt

def parse_bnc_file_word_level(folder_path, file_limit=None):
    """Parse tagged XML files in a folder into a word-level DataFrame.

    Every ``.xml`` file directly inside ``folder_path`` is parsed. For each
    ``<u>`` element that is a direct child of the document root, each direct
    ``<w>`` child becomes one row.

    Args:
        folder_path: Directory containing the ``.xml`` files.
        file_limit: Maximum number of XML files to parse. ``None`` (the
            default) parses every file. The limit is checked after a file
            has been parsed, so at least one file is always read when any
            XML file is present.

    Returns:
        A ``pandas.DataFrame`` with columns ``word`` (token text), ``label``
        (1 when the token's ``pos`` attribute is ``'RR'`` or ``'UH'``, else
        0), ``pos`` (the raw ``pos`` attribute, ``None`` when absent) and
        ``sentence`` (the utterance's tokens joined by single spaces).
    """
    positive_tags = {'RR', 'UH'}
    data = []
    file_count = 0
    print(f"Parsing XML files in '{folder_path}'...")
    # Sorted so that a file_limit selects the same files on every run;
    # os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".xml"):
            continue
        print(f"Parsing {filename}...")
        root = ET.parse(os.path.join(folder_path, filename)).getroot()
        for u in root.findall('u'):
            # Tokens without text cannot be joined into the sentence and
            # carry nothing to classify, so they are skipped.
            words = [w for w in u.findall('w') if w.text is not None]
            sentence = ' '.join(w.text for w in words)
            for w in words:
                # .get() tolerates a token that has no 'pos' attribute.
                pos = w.get('pos')
                label = 1 if pos in positive_tags else 0
                data.append((w.text, label, pos, sentence))
        file_count += 1
        if file_limit is not None and file_count >= file_limit:
            break
    return pd.DataFrame(data, columns=['word', 'label', 'pos', 'sentence'])

def train_logistic_regression_word_level(X_train, X_test, y_train, y_test, df):
    """Train and evaluate a class-weighted logistic regression on word text.

    Fits a unigram+bigram ``CountVectorizer`` on ``X_train``, trains a
    ``LogisticRegression`` with balanced class weights, evaluates it on the
    test split, saves the ROC curve to 'logistic_regression_roc_curve.png'
    in the current working directory, and collects the misclassified items.

    Args:
        X_train: Training texts (iterable of strings).
        X_test: Test texts as a pandas Series; its index labels are used to
            look up the matching rows of ``df``.
        y_train: Training labels.
        y_test: Test labels, aligned with ``X_test``.
        df: DataFrame with a ``sentence`` column, indexed by the same labels
            as ``X_test``.

    Returns:
        A tuple ``(report, auc_score, errors)`` where ``report`` is the text
        from ``classification_report``, ``auc_score`` is the ROC AUC computed
        from the probability of the second class, and ``errors`` is a list of
        ``(word, true_label, predicted_label, sentence)`` tuples for every
        misclassified test item.
    """
    vectorizer = CountVectorizer(ngram_range=(1, 2))  # Using bigrams
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Key the weights by the actual class labels rather than by position, so
    # the mapping stays correct when the labels are not exactly 0..k-1.
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights_dict = dict(zip(classes.tolist(), weights.tolist()))

    model = LogisticRegression(max_iter=1000, class_weight=class_weights_dict)
    model.fit(X_train_vec, y_train)

    predictions = model.predict(X_test_vec)
    # Computed once and reused for both the AUC and the ROC curve.
    positive_scores = model.predict_proba(X_test_vec)[:, 1]

    report = classification_report(y_test, predictions)
    auc_score = roc_auc_score(y_test, positive_scores)
    fpr, tpr, _ = roc_curve(y_test, positive_scores)

    print("Model trained. Generating results...")

    fig = plt.figure()
    try:
        plt.plot(fpr, tpr, label=f'ROC curve (area = {auc_score:.2f})')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC)')
        plt.legend(loc="lower right")
        plt.savefig('logistic_regression_roc_curve.png')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

    # X_test.index holds index *labels*, so look the sentence up with .loc;
    # positional .iloc is only equivalent for a default RangeIndex.
    errors = [
        (word, true_label, predicted, df.loc[idx, 'sentence'])
        for idx, word, true_label, predicted in zip(X_test.index, X_test, y_test, predictions)
        if predicted != true_label
    ]

    return report, auc_score, errors

def preprocess_and_train_word_level(folder_path, output_file):
    """Run the word-level pipeline end to end and write a text report.

    Parses the XML files under ``folder_path``, splits the words and labels
    80/20 with a fixed random seed, trains and evaluates the logistic
    regression model, then writes the classification report, the ROC AUC
    score and every misclassified item to ``output_file`` (UTF-8).

    Args:
        folder_path: Directory handed to ``parse_bnc_file_word_level``.
        output_file: Path of the report file to create or overwrite.
    """
    print("Preprocessing data for word-level classification...")
    corpus = parse_bnc_file_word_level(folder_path)

    # train_test_split returns (X_train, X_test, y_train, y_test) in order.
    splits = train_test_split(
        corpus['word'],
        corpus['label'],
        test_size=0.2,
        random_state=42,
    )
    report, auc_score, errors = train_logistic_regression_word_level(*splits, corpus)

    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(
            f"Classification Report:\n{report}\n"
            f"ROC AUC Score: {auc_score:.4f}\n"
            "ROC curve saved as 'logistic_regression_roc_curve.png'\n"
            "\nIncorrectly Classified Discourse Markers:\n"
        )
        # One line per misclassified item.
        out.writelines(
            f"Word: {word}, True Label: {true_label}, Predicted Label: {pred_label}, Sentence: '{sentence}'\n"
            for word, true_label, pred_label, sentence in errors
        )

    print(f"Results saved to {output_file}")

# Script entry point: run the full pipeline on the 'spoken/tagged' folder and
# write the report to 'logreg_word_level_results.txt'.
if __name__ == "__main__":
    preprocess_and_train_word_level(
        folder_path='spoken/tagged',
        output_file='logreg_word_level_results.txt',
    )


Map:   0%|          | 0/998488 [00:00<?, ? examples/s]

Map:   0%|          | 0/249622 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
