# In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
import numpy as np

def parse_single_bnc_file(folder_path):
    """Parse one ``.xml`` file from *folder_path* into a DataFrame of utterances.

    Only a single file is read: the alphabetically first entry whose name
    ends in ``.xml``.  Every ``<u>`` element that is a direct child of the
    document root becomes one row, built from the ``<w>`` elements that are
    direct children of that ``<u>``.

    Args:
        folder_path: Directory to search for ``.xml`` files.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``sentence`` (the token
        texts joined by single spaces) and ``labels`` (a list with one 0/1
        entry per token, 1 when the token's ``pos`` attribute is ``RR`` or
        ``UH``).  The frame is empty when the folder holds no ``.xml`` file
        or the file contains no matching elements.

    Raises:
        OSError: If *folder_path* cannot be listed or the file cannot be read.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    target_tags = ('RR', 'UH')
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # of "the single file" reproducible across machines and runs.
    xml_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".xml")
    )

    # Deliberately limited to the first file only.
    for filename in xml_names[:1]:
        root = ET.parse(os.path.join(folder_path, filename)).getroot()
        # NOTE(review): findall('u') only matches direct children of the
        # root; utterances nested deeper in the tree are not visited.
        # Confirm this matches the layout of the tagged corpus files.
        for u in root.findall('u'):
            sentence = []
            labels = []
            for w in u.findall('w'):
                # An empty element has text None, which would make the join
                # below raise; use '' so tokens and labels stay aligned.
                sentence.append(w.text if w.text is not None else '')
                # A token without a 'pos' attribute counts as not tagged.
                labels.append(1 if w.attrib.get('pos') in target_tags else 0)
            data.append((' '.join(sentence), labels))

    return pd.DataFrame(data, columns=['sentence', 'labels'])

def train_logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a bag-of-words logistic regression and evaluate it on the test split.

    The texts are turned into token-count vectors with ``CountVectorizer``
    (vocabulary learned from *X_train* only), a ``LogisticRegression`` model
    is fitted on the training vectors, and predictions are made for *X_test*.

    Args:
        X_train: Iterable of training sentences (strings).
        X_test: Iterable of test sentences (strings).
        y_train: Labels aligned with *X_train*.
        y_test: Labels aligned with *X_test*.

    Returns:
        A 3-tuple ``(report, important_features, errors)``:

        * ``report`` - the text produced by ``classification_report``.
        * ``important_features`` - ``(feature_name, coefficient)`` pairs
          sorted by descending absolute coefficient.
        * ``errors`` - ``(sentence, actual, predicted)`` for every test item
          whose prediction differs from its label, in test order.
    """
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_vec, y_train)

    predictions = model.predict(X_test_vec)
    report = classification_report(y_test, predictions)

    # Only the first row of the coefficient matrix is ranked.
    # NOTE(review): that covers the whole model for a two-class problem;
    # with more classes it would describe the first class only.
    important_features = sorted(
        zip(vectorizer.get_feature_names_out(), model.coef_[0]),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )

    # Walk the three sequences in lockstep rather than by position, so the
    # result does not depend on the inputs' index and plain lists work too.
    errors = [
        (sentence, actual, predicted)
        for sentence, actual, predicted in zip(X_test, y_test, predictions)
        if actual != predicted
    ]

    return report, important_features, errors

def preprocess_and_train(folder_path, output_file):
    """Run the parse / split / train pipeline and write the results to a file.

    The corpus is loaded with ``parse_single_bnc_file``, each sentence's
    per-token labels are collapsed to a single 0/1 flag (1 when any token is
    flagged), and the data is split 80/20 with a fixed random seed before
    being passed to ``train_logistic_regression``.

    Args:
        folder_path: Directory handed to ``parse_single_bnc_file``.
        output_file: Path of the UTF-8 text file to (over)write with the
            classification report, the ten highest-ranked features and the
            misclassified sentences.
    """
    corpus = parse_single_bnc_file(folder_path)

    # Sentence-level target: does the token label list contain a 1?
    corpus['labels'] = corpus['labels'].apply(lambda tags: int(1 in tags))

    # train_test_split yields X_train, X_test, y_train, y_test, which is the
    # positional order train_logistic_regression expects.
    split = train_test_split(
        corpus['sentence'], corpus['labels'], test_size=0.2, random_state=42
    )
    report, important_features, errors = train_logistic_regression(*split)

    with open(output_file, 'w', encoding='utf-8') as f:
        sections = [
            f"Classification Report:\n{report}\n",
            f"Top 10 Important Features (Semantics):\n{important_features[:10]}\n",
            "Error Analysis (Mislabeled Sentences):\n",
        ]
        sections.extend(
            f"Sentence: {sentence}\nActual: {actual}, Predicted: {predicted}\n"
            for sentence, actual, predicted in errors
        )
        f.writelines(sections)

if __name__ == "__main__":
    # Script entry point: read the tagged spoken corpus folder and write the
    # evaluation results next to the script.
    preprocess_and_train(
        folder_path='spoken/tagged',
        output_file='logreg2_results.txt',
    )
