<a href="https://colab.research.google.com/github/mahb97/nlp-tagging-validation-joyce/blob/main/04_nlp_validation_joyce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Tagging Validation for Joyce's Dubliners

This notebook validates modern NLP POS tagging tools against expert CLAWS7 annotations for simile sentences from James Joyce's *Dubliners*.

## Research Objectives
- Compare the tagging accuracy of spaCy, NLTK, Flair, Stanza, and TextBlob against the expert CLAWS7 annotations
- Identify systematic tagging errors in literary text processing
- Analyze Joyce-specific linguistic challenges for computational tools

In [None]:
# Install and setup NLP libraries
print("Installing NLP libraries...")
!pip install -q spacy nltk flair stanza textblob scikit-learn plotly seaborn wordcloud

print("Downloading spaCy models...")
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

print("Downloading NLTK data...")
import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)

print("Setup complete!")

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import spacy
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from flair.data import Sentence
from flair.models import SequenceTagger
from textblob import TextBlob
import stanza

# Analysis libraries
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import defaultdict, Counter

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("All libraries imported successfully!")

In [None]:
# Upload CSV file
from google.colab import files
print("Please upload your 'All Similes - Dubliners cont.csv' file:")
uploaded = files.upload()

# `uploaded` is used as a mapping keyed by file name; if nothing was selected
# there is no key to read, so stop with a clear message instead of an IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and select the CSV file.")

# Load the data (only the first uploaded file is used)
csv_filename = next(iter(uploaded))
df = pd.read_csv(csv_filename)

# Every later cell relies on these two columns, so fail early and name what is missing.
missing_columns = [col for col in ('Sentences', 'CLAWS') if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"'{csv_filename}' is missing required column(s): {missing_columns}. "
        f"Found columns: {list(df.columns)}"
    )

print(f"Loaded {len(df)} rows")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
display(df[['Sentences', 'CLAWS']].head())

In [None]:
# Process CLAWS data
def parse_claws_tags(claws_string):
    """Split a CLAWS7-annotated string into parallel token and tag lists.

    The input is whitespace-separated items of the form ``word_TAG``. Each
    item is cut at its *last* underscore, so a word that itself contains
    underscores stays intact. An item without any underscore is kept as a
    token and given the placeholder tag ``'UNK'``.

    Args:
        claws_string: The annotated string, or a missing value.

    Returns:
        A ``(tokens, tags)`` pair of equal-length lists. Both lists are empty
        when the input is missing (per ``pd.isna``) or blank.
    """
    if pd.isna(claws_string):
        return [], []

    annotated = claws_string.strip()
    if not annotated:
        return [], []

    pairs = []
    for chunk in annotated.split():
        # rpartition yields an empty separator when the chunk has no underscore.
        word, separator, tag = chunk.rpartition('_')
        pairs.append((word, tag) if separator else (chunk, 'UNK'))

    tokens = [word for word, _ in pairs]
    tags = [tag for _, tag in pairs]
    return tokens, tags

def validate_token_alignment(sentence, tokens, min_overlap=0.7):
    """Check whether the parsed tokens plausibly belong to the given sentence.

    Compares the set of purely alphabetic tokens (lower-cased) against the
    lower-cased vocabulary of the sentence and requires a minimum share of
    those tokens to occur in the sentence.

    Args:
        sentence: The original sentence text.
        tokens: Token strings parsed from the annotation of that sentence.
        min_overlap: Minimum fraction (0-1) of distinct alphabetic tokens that
            must be found in the sentence. Defaults to 0.7.

    Returns:
        True when the overlap fraction is at least ``min_overlap``; False when
        it is lower or when there are no alphabetic tokens to compare.
    """
    lowered = sentence.lower()

    # A plain whitespace split leaves punctuation glued to words ("dog,",
    # "fast."), while the annotation lists punctuation as separate tokens, so
    # such words would never match. Also collect the words obtained by
    # breaking the sentence on every non-letter; keeping the whitespace-split
    # words as well means anything that matched before still matches.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in lowered)
    sentence_words = set(lowered.split()) | set(letters_only.split())

    # Only alphabetic tokens take part in the comparison (punctuation, numbers
    # and clitics such as "n't" are ignored).
    token_words = {t.lower() for t in tokens if t.isalpha()}

    if not token_words:
        return False

    overlap = len(sentence_words & token_words) / len(token_words)
    return overlap >= min_overlap

# Build the list of validated sentences together with their CLAWS7 tags
processed_data = []
# Rows lacking either the sentence or its annotation are discarded up front.
clean_df = df[['Sentences', 'CLAWS']].dropna()

print("Processing CLAWS7 tagged sentences...")

for sentence, claws_string in zip(clean_df['Sentences'], clean_df['CLAWS']):
    tokens, tags = parse_claws_tags(claws_string)

    # Nothing could be parsed from the annotation: skip the row.
    if not (tokens and tags):
        continue
    # The parsed tokens do not match the sentence text well enough: skip the row.
    if not validate_token_alignment(sentence, tokens):
        continue

    # Ids are 1-based and count only the sentences that were kept.
    processed_data.append(dict(
        id=len(processed_data) + 1,
        sentence=sentence,
        tokens=tokens,
        ground_truth_tags=tags,
        token_count=len(tokens),
    ))

print(f"Processed {len(processed_data)} valid sentences")

# Show the first kept sentence and the mean sentence length as a quick check
if processed_data:
    sample = processed_data[0]
    mean_token_count = np.mean([entry['token_count'] for entry in processed_data])
    print(f"\nSample sentence: {sample['sentence'][:80]}...")
    print(f"Sample tags: {sample['ground_truth_tags'][:8]}...")
    print(f"Average sentence length: {mean_token_count:.1f} tokens")

In [None]:

# TODO: placeholder cell; the analysis code has not been added to the notebook yet.
print(
    "Analysis will be implemented in this cell",
    "Copy the remaining code from the Python artifact here",
    sep="\n",
)