<a href="https://colab.research.google.com/github/mahb97/joyce-dubliners-similes-analysis/blob/main/04_nlp_validation_joyce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Tagging Validation for Joyce's Dubliners

This notebook validates modern NLP POS tagging tools against expert CLAWS7 annotations for simile sentences from James Joyce's *Dubliners*.

## Research Objectives
- Compare accuracy of spaCy, NLTK, Flair, Stanza, TextBlob against CLAWS7 annotations
- Identify systematic tagging errors in literary text processing
- Analyze Joyce-specific linguistic challenges for computational tools

In [None]:
# ==============================================================================
# SETUP AND INSTALLATION
# ==============================================================================

# Install packages
!pip install -q spacy nltk flair textblob scikit-learn plotly seaborn
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

print("Setup complete!")

In [None]:
# ==============================================================================
# IMPORTS
# ==============================================================================

import pandas as pd
import numpy as np
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import spacy
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from flair.data import Sentence
from flair.models import SequenceTagger
from textblob import TextBlob

# Analysis libraries
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import defaultdict, Counter

# Visualization libraries
import plotly.graph_objects as go
from plotly.subplots import make_subplots

print("All libraries imported!")


In [None]:
# ==============================================================================
# DATA UPLOAD AND PROCESSING
# ==============================================================================

# Upload file
from google.colab import files
print("Upload your CSV file:")
uploaded = files.upload()

# Load data
csv_filename = list(uploaded.keys())[0]
# Try reading with different encodings
try:
    df = pd.read_csv(csv_filename, encoding='cp1252')
except UnicodeDecodeError:
    print("Could not decode with cp1252. Trying with latin1...")
    try:
        df = pd.read_csv(csv_filename, encoding='latin1')
    except UnicodeDecodeError:
        print("Could not decode with latin1. Trying with utf-8...")
        df = pd.read_csv(csv_filename, encoding='utf-8') # Fallback to utf-8


print(f"Loaded {len(df)} rows with columns: {list(df.columns)}")

# Process CLAWS data
def parse_claws_tags(claws_string):
    """Parse CLAWS7 format: 'word_TAG word_TAG ...'"""
    if pd.isna(claws_string) or not claws_string.strip():
        return [], []

    tokens = []
    tags = []

    for item in claws_string.strip().split():
        if '_' in item:
            parts = item.rsplit('_', 1)
            if len(parts) == 2:
                word, tag = parts
                tokens.append(word)
                tags.append(tag)

    return tokens, tags

# Process all sentences
processed_data = []
# Ensure 'Sentence Context' is used as the column name
clean_df = df[['Sentence Context', 'CLAWS']].dropna()

for idx, row in clean_df.iterrows():
    tokens, tags = parse_claws_tags(row['CLAWS'])
    if tokens and tags and len(tokens) == len(tags):
        processed_data.append({
            'sentence': row['Sentence Context'],
            'tokens': tokens,
            'claws_tags': tags
        })

print(f"Processed {len(processed_data)} valid sentences")
if processed_data:
    print(f"Sample: {processed_data[0]['sentence'][:60]}...")

In [None]:
# ==============================================================================
# NLP TOOL SETUP
# ==============================================================================

# Initialize NLP tools
print("Loading NLP models...")

# Load spaCy
try:
    nlp_sm = spacy.load("en_core_web_sm")
    nlp_lg = spacy.load("en_core_web_lg")
    print("spaCy models loaded")
except:
    print("spaCy models failed to load")
    nlp_sm = nlp_lg = None

# Load Flair
try:
    flair_tagger = SequenceTagger.load('pos')
    print("Flair model loaded")
except:
    print("Flair model failed to load")
    flair_tagger = None

print("NLP tools ready!")

In [None]:
# ==============================================================================
# PENN TREEBANK TO CLAWS7 MAPPING
# ==============================================================================

# Basic Penn Treebank to CLAWS7 mapping
penn_to_claws = {
    # Nouns
    'NN': 'NN1', 'NNS': 'NN2', 'NNP': 'NP1', 'NNPS': 'NP2',

    # Verbs
    'VB': 'VV0', 'VBD': 'VVD', 'VBG': 'VVG', 'VBN': 'VVN',
    'VBP': 'VV0', 'VBZ': 'VVZ',

    # Pronouns
    'PRP': 'PPIS1', 'PRP$': 'APPGE', 'WP': 'PNQS',

    # Determiners
    'DT': 'AT', 'WDT': 'DDQ',

    # Adjectives
    'JJ': 'JJ', 'JJR': 'JJR', 'JJS': 'JJT',

    # Adverbs
    'RB': 'RR', 'RBR': 'RRR', 'RBS': 'RRT', 'WRB': 'RRQ',

    # Prepositions
    'IN': 'II', 'TO': 'TO',

    # Conjunctions
    'CC': 'CC',

    # Others
    'CD': 'MC', 'MD': 'VM', 'EX': 'EX', 'FW': 'FW', 'UH': 'UH',
    '.': '.', ',': ',', ':': ':', ';': ';', '!': '!', '?': '?'
}

# Context-specific mappings for key Joyce words
def convert_to_claws(token, penn_tag):
    """Convert Penn tag to CLAWS7 with context awareness"""
    token_lower = token.lower()

    # Handle auxiliary verbs
    aux_verbs = {
        'am': 'VBM', 'is': 'VBZ', 'are': 'VBR', 'was': 'VBDZ', 'were': 'VBDR',
        'be': 'VBI', 'been': 'VBN', 'being': 'VBG',
        'has': 'VHZ', 'have': 'VH0', 'had': 'VHD',
        'do': 'VD0', 'does': 'VDZ', 'did': 'VDD'
    }

    if token_lower in aux_verbs:
        return aux_verbs[token_lower]

    # Handle articles
    if token_lower in ['a', 'an']:
        return 'AT1'
    elif token_lower == 'the':
        return 'AT'

    # Handle negation
    if token_lower in ['not', "n't"]:
        return 'XX'

    # Handle key Joyce words
    if token_lower == 'like':
        return 'II' if penn_tag == 'IN' else 'VV0'
    elif token_lower == 'as':
        return 'CSA'
    elif token_lower == 'if':
        return 'CS'
    elif token_lower == 'that':
        return 'CST' if penn_tag in ['IN', 'WDT'] else 'DD1'

    # Default mapping
    return penn_to_claws.get(penn_tag, penn_tag)

print("CLAWS7 mapping system ready!")


In [None]:
# ==============================================================================
# NLP TAGGING FUNCTIONS
# ==============================================================================

def tag_with_spacy(sentence, model='sm'):
    """Tag sentence with spaCy"""
    nlp_model = nlp_sm if model == 'sm' else nlp_lg
    if nlp_model is None:
        return []

    doc = nlp_model(sentence)
    return [(token.text, convert_to_claws(token.text, token.tag_)) for token in doc]

def tag_with_nltk(sentence):
    """Tag sentence with NLTK"""
    tokens = word_tokenize(sentence)
    pos_tags = pos_tag(tokens)
    return [(word, convert_to_claws(word, tag)) for word, tag in pos_tags]

def tag_with_flair(sentence):
    """Tag sentence with Flair"""
    if flair_tagger is None:
        return []

    flair_sentence = Sentence(sentence)
    flair_tagger.predict(flair_sentence)
    return [(token.text, convert_to_claws(token.text, token.tag)) for token in flair_sentence]

def tag_with_textblob(sentence):
    """Tag sentence with TextBlob"""
    blob = TextBlob(sentence)
    return [(word, convert_to_claws(word, tag)) for word, tag in blob.tags]

print("Tagging functions ready!")

In [None]:
# ==============================================================================
# CELL 7: BATCH PROCESSING
# ==============================================================================

def process_sentence_with_all_tools(sentence):
    """Process one sentence with all NLP tools"""
    results = {}

    tools = {
        'spacy_sm': lambda s: tag_with_spacy(s, 'sm'),
        'spacy_lg': lambda s: tag_with_spacy(s, 'lg'),
        'nltk': tag_with_nltk,
        'flair': tag_with_flair,
        'textblob': tag_with_textblob
    }

    for tool_name, tool_func in tools.items():
        try:
            start_time = time.time()
            tagged = tool_func(sentence)
            processing_time = time.time() - start_time

            results[tool_name] = {
                'tags': [tag for word, tag in tagged],
                'tokens': [word for word, tag in tagged],
                'processing_time': processing_time
            }
        except Exception as e:
            results[tool_name] = {'error': str(e)}

    return results

# Process all sentences
print(f"Processing {len(processed_data)} sentences...")
batch_results = []

for i, data in enumerate(processed_data):
    if i % 10 == 0:
        print(f"Progress: {i}/{len(processed_data)}")

    sentence = data['sentence']
    ground_truth = data['claws_tags']

    tool_results = process_sentence_with_all_tools(sentence)

    batch_results.append({
        'sentence': sentence,
        'ground_truth': ground_truth,
        'tool_results': tool_results
    })

print("Batch processing complete!")

In [None]:
# ==============================================================================
# ACCURACY EVALUATION
# ==============================================================================

def calculate_accuracy(ground_truth, predicted):
    """Calculate accuracy between two tag sequences"""
    if not ground_truth or not predicted:
        return 0.0

    min_len = min(len(ground_truth), len(predicted))
    if min_len == 0:
        return 0.0

    correct = sum(1 for i in range(min_len) if ground_truth[i] == predicted[i])
    return correct / min_len

# Evaluate each tool
tool_performance = defaultdict(list)

for result in batch_results:
    ground_truth = result['ground_truth']

    for tool_name, tool_result in result['tool_results'].items():
        if 'error' not in tool_result:
            predicted = tool_result['tags']
            accuracy = calculate_accuracy(ground_truth, predicted)
            tool_performance[tool_name].append(accuracy)

# Calculate summary statistics
performance_summary = {}
for tool_name, accuracies in tool_performance.items():
    if accuracies:
        performance_summary[tool_name] = {
            'mean_accuracy': np.mean(accuracies),
            'std_accuracy': np.std(accuracies),
            'total_sentences': len(accuracies),
            'perfect_sentences': sum(1 for acc in accuracies if acc == 1.0)
        }

print("Performance Summary:")
for tool, stats in performance_summary.items():
    perfect_rate = stats['perfect_sentences'] / stats['total_sentences']
    print(f"{tool}: {stats['mean_accuracy']:.3f} ± {stats['std_accuracy']:.3f} "
          f"({perfect_rate:.1%} perfect)")

In [None]:
# ==============================================================================
# ERROR ANALYSIS
# ==============================================================================

# Collect errors
error_analysis = defaultdict(lambda: defaultdict(list))

for result in batch_results:
    ground_truth = result['ground_truth']

    for tool_name, tool_result in result['tool_results'].items():
        if 'error' not in tool_result:
            predicted = tool_result['tags']

            min_len = min(len(ground_truth), len(predicted))
            for i in range(min_len):
                if ground_truth[i] != predicted[i]:
                    error_pattern = f"{ground_truth[i]}->{predicted[i]}"
                    error_analysis[tool_name][error_pattern].append({
                        'sentence': result['sentence'],
                        'position': i
                    })

# Show most common errors
print("Most Common Error Patterns:")
for tool_name, errors in error_analysis.items():
    print(f"\n{tool_name}:")
    sorted_errors = sorted(errors.items(), key=lambda x: len(x[1]), reverse=True)[:3]
    for pattern, error_list in sorted_errors:
        print(f"  {pattern}: {len(error_list)} occurrences")

In [None]:
# ==============================================================================
# WILSON CONFIDENCE INTERVALS
# ==============================================================================

import scipy.stats as stats
from math import sqrt

def wilson_confidence_interval(successes, trials, confidence=0.95):
    """
    Calculate Wilson confidence interval for binomial proportion
    More robust than normal approximation, especially for small samples
    """
    if trials == 0:
        return 0, 0, 0

    p = successes / trials
    z = stats.norm.ppf(1 - (1 - confidence) / 2)  # z-score for confidence level

    # Wilson interval calculation
    denominator = 1 + z**2 / trials
    centre = (p + z**2 / (2 * trials)) / denominator
    half_width = z * sqrt((p * (1 - p) + z**2 / (4 * trials)) / trials) / denominator

    lower = max(0, centre - half_width)
    upper = min(1, centre + half_width)

    return p, lower, upper

# Calculate Wilson intervals for each tool
print("Tool Performance with Wilson 95% Confidence Intervals:")
print("=" * 60)

wilson_results = {}

for tool_name, stats_data in performance_summary.items():
    total_sentences = stats_data['total_sentences']
    perfect_sentences = stats_data['perfect_sentences']

    # Wilson interval for perfect sentence rate
    perfect_rate, perfect_lower, perfect_upper = wilson_confidence_interval(
        perfect_sentences, total_sentences
    )

    # For overall accuracy, we need to calculate total correct tokens
    total_tokens = 0
    correct_tokens = 0

    for result in batch_results:
        ground_truth = result['ground_truth']
        tool_result = result['tool_results'].get(tool_name, {})

        if 'error' not in tool_result:
            predicted = tool_result['tags']
            min_len = min(len(ground_truth), len(predicted))
            total_tokens += min_len
            correct_tokens += sum(1 for i in range(min_len)
                                if ground_truth[i] == predicted[i])

    # Wilson interval for token-level accuracy
    token_accuracy, token_lower, token_upper = wilson_confidence_interval(
        correct_tokens, total_tokens
    )

    wilson_results[tool_name] = {
        'token_accuracy': token_accuracy,
        'token_ci_lower': token_lower,
        'token_ci_upper': token_upper,
        'perfect_rate': perfect_rate,
        'perfect_ci_lower': perfect_lower,
        'perfect_ci_upper': perfect_upper,
        'total_tokens': total_tokens,
        'correct_tokens': correct_tokens
    }

    print(f"\\n{tool_name.upper()}:")
    print(f"  Token Accuracy: {token_accuracy:.3f} [{token_lower:.3f}, {token_upper:.3f}]")
    print(f"  Perfect Sentences: {perfect_rate:.3f} [{perfect_lower:.3f}, {perfect_upper:.3f}]")
    print(f"  Sample size: {total_tokens:,} tokens, {total_sentences} sentences")

# Statistical significance testing between tools
print("\\n" + "=" * 60)
print("STATISTICAL SIGNIFICANCE TESTS")
print("=" * 60)

def proportion_z_test(x1, n1, x2, n2):
    """Two-proportion z-test"""
    p1 = x1 / n1
    p2 = x2 / n2
    p_pool = (x1 + x2) / (n1 + n2)

    se = sqrt(p_pool * (1 - p_pool) * (1/n1 + 1/n2))
    z = (p1 - p2) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z)))

    return z, p_value

# Compare best vs worst performing tools
tools_by_accuracy = sorted(wilson_results.items(),
                          key=lambda x: x[1]['token_accuracy'],
                          reverse=True)

best_tool, best_stats = tools_by_accuracy[0]
worst_tool, worst_stats = tools_by_accuracy[-1]

z_stat, p_value = proportion_z_test(
    best_stats['correct_tokens'], best_stats['total_tokens'],
    worst_stats['correct_tokens'], worst_stats['total_tokens']
)

print(f"Comparison: {best_tool} vs {worst_tool}")
print(f"Accuracy difference: {best_stats['token_accuracy'] - worst_stats['token_accuracy']:.3f}")
print(f"Z-statistic: {z_stat:.3f}")
print(f"P-value: {p_value:.3f}")
print(f"Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")

# Effect size (Cohen's h for proportions)
def cohens_h(p1, p2):
    """Cohen's h effect size for proportions"""
    return 2 * (np.arcsin(sqrt(p1)) - np.arcsin(sqrt(p2)))

effect_size = cohens_h(best_stats['token_accuracy'], worst_stats['token_accuracy'])
print(f"Effect size (Cohen's h): {effect_size:.3f}")

if abs(effect_size) < 0.2:
    magnitude = "negligible"
elif abs(effect_size) < 0.5:
    magnitude = "small"
elif abs(effect_size) < 0.8:
    magnitude = "medium"
else:
    magnitude = "large"

print(f"Effect magnitude: {magnitude}")

## Critical Analysis: Statistical Evidence for Computational Literary Studies Limitations

These Wilson confidence interval results provide compelling empirical evidence for the methodological concerns raised in computational literary studies debates. With token-level accuracies ranging from 58.2% [56.8%, 59.5%] to 63.0% [61.7%, 64.3%] across 5,437 tokens, modern NLP tools demonstrate systematic underperformance on Joyce's literary prose compared to their claimed 95%+ accuracy on standard text. The extraordinarily low perfect sentence rates (1.1-1.6%) with wide confidence intervals [0.3%, 4.7%] reveal that flawless automated tagging of Joyce's syntactically complex sentences is statistically rare, occurring in fewer than 1 in 20 cases. While Flair's superiority over spaCy achieves statistical significance (p < 0.001), the negligible effect size (Cohen's h = 0.100) demonstrates that technological improvements yield practically minimal gains when confronting modernist literary language. This statistical validation supports Da's critique that computational literary analysis faces fundamental limitations with complex literary texts, while simultaneously validating Wallis's methodological framework for robust corpus linguistic research. The consistent underperformance across all tools, despite their neural architectures and contextual embeddings, suggests that Joyce's stylistic innovations create systematic challenges for automated linguistic analysis that transcend individual algorithmic approaches. These findings provide quantitative evidence that expert linguistic annotation remains essential for literary corpus analysis, particularly when dealing with texts that deliberately exploit syntactic ambiguity and

In [None]:
# ==============================================================================
# VISUALIZATIONS
# ==============================================================================

# Create accuracy comparison chart
tools = list(performance_summary.keys())
accuracies = [performance_summary[tool]['mean_accuracy'] for tool in tools]
std_devs = [performance_summary[tool]['std_accuracy'] for tool in tools]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=tools,
    y=accuracies,
    error_y=dict(type='data', array=std_devs),
    text=[f"{acc:.3f}" for acc in accuracies],
    textposition='auto'
))

fig.update_layout(
    title="NLP Tool Accuracy on Joyce's Dubliners",
    xaxis_title="NLP Tools",
    yaxis_title="Mean Accuracy",
    yaxis=dict(range=[0, 1])
)

fig.show()

# Perfect sentence rates
perfect_rates = [performance_summary[tool]['perfect_sentences'] /
                performance_summary[tool]['total_sentences'] for tool in tools]

fig2 = go.Figure()
fig2.add_trace(go.Bar(
    x=tools,
    y=perfect_rates,
    text=[f"{rate:.1%}" for rate in perfect_rates],
    textposition='auto'
))

fig2.update_layout(
    title="Perfect Sentence Tagging Rates",
    xaxis_title="NLP Tools",
    yaxis_title="Percentage Perfect",
    yaxis=dict(range=[0, 1])
)

fig2.show()

print("Analysis complete!")


In [None]:
# ==============================================================================
# ADDITIONAL DATA VISUALIZATIONS
# ==============================================================================

# 1. CONFIDENCE INTERVAL COMPARISON (Much better than tiny bar chart)
fig_ci = go.Figure()

for tool_name, stats in wilson_results.items():
    # Main accuracy point
    fig_ci.add_trace(go.Scatter(
        x=[stats['token_accuracy']],
        y=[tool_name],
        mode='markers',
        marker=dict(size=12, color='darkblue'),
        name=tool_name,
        showlegend=False
    ))

    # Confidence interval line
    fig_ci.add_trace(go.Scatter(
        x=[stats['token_ci_lower'], stats['token_ci_upper']],
        y=[tool_name, tool_name],
        mode='lines',
        line=dict(color='darkblue', width=3),
        showlegend=False
    ))

fig_ci.update_layout(
    title="NLP Tool Accuracy with 95% Wilson Confidence Intervals<br><sub>On Joyce's Dubliners Simile Sentences</sub>",
    xaxis_title="Token-Level Accuracy",
    yaxis_title="NLP Tools",
    xaxis=dict(range=[0.5, 0.7], tickformat='.1%'),
    height=400,
    annotations=[
        dict(x=0.52, y=-0.15, xref='x', yref='paper',
             text="Error bars show 95% Wilson confidence intervals",
             showarrow=False, font=dict(size=10))
    ]
)

fig_ci.show()

# 2. ERROR HEATMAP - Shows which CLAWS7 tags are most problematic
from collections import defaultdict

# Collect tag-level errors
tag_errors = defaultdict(lambda: defaultdict(int))

for result in batch_results:
    ground_truth = result['ground_truth']

    for tool_name, tool_result in result['tool_results'].items():
        if 'error' not in tool_result:
            predicted = tool_result['tags']
            min_len = min(len(ground_truth), len(predicted))

            for i in range(min_len):
                if ground_truth[i] != predicted[i]:
                    tag_errors[ground_truth[i]][tool_name] += 1

# Get most problematic tags
tag_totals = {tag: sum(tool_errors.values()) for tag, tool_errors in tag_errors.items()}
top_tags = sorted(tag_totals.items(), key=lambda x: x[1], reverse=True)[:12]

# Create heatmap data
heatmap_data = []
tag_names = [tag for tag, _ in top_tags]
tool_names = list(wilson_results.keys())

for tag in tag_names:
    row = [tag_errors[tag][tool] for tool in tool_names]
    heatmap_data.append(row)

fig_heatmap = go.Figure(data=go.Heatmap(
    z=heatmap_data,
    x=tool_names,
    y=tag_names,
    colorscale='Reds',
    text=heatmap_data,
    texttemplate="%{text}",
    textfont={"size": 10}
))

fig_heatmap.update_layout(
    title="Most Problematic CLAWS7 Tags by Tool<br><sub>Error frequency heatmap</sub>",
    xaxis_title="NLP Tools",
    yaxis_title="CLAWS7 POS Tags",
    height=500
)

fig_heatmap.show()

# 3. SENTENCE LENGTH vs ACCURACY SCATTER - Shows if Joyce's longer sentences are harder
sentence_difficulties = []

for result in batch_results:
    sentence_length = len(result['ground_truth'])

    for tool_name, tool_result in result['tool_results'].items():
        if 'error' not in tool_result:
            predicted = tool_result['tags']
            accuracy = calculate_accuracy(result['ground_truth'], predicted)

            sentence_difficulties.append({
                'length': sentence_length,
                'accuracy': accuracy,
                'tool': tool_name
            })

df_scatter = pd.DataFrame(sentence_difficulties)

fig_scatter = go.Figure()

colors = {'spacy_sm': '#1f77b4', 'spacy_lg': '#ff7f0e', 'flair': '#2ca02c'}

for tool in df_scatter['tool'].unique():
    tool_data = df_scatter[df_scatter['tool'] == tool]

    fig_scatter.add_trace(go.Scatter(
        x=tool_data['length'],
        y=tool_data['accuracy'],
        mode='markers',
        name=tool,
        marker=dict(color=colors.get(tool, '#636EFA'), size=6, opacity=0.6)
    ))

# Add trend line
z = np.polyfit(df_scatter['length'], df_scatter['accuracy'], 1)
p = np.poly1d(z)
x_trend = np.linspace(df_scatter['length'].min(), df_scatter['length'].max(), 100)

fig_scatter.add_trace(go.Scatter(
    x=x_trend,
    y=p(x_trend),
    mode='lines',
    name='Trend',
    line=dict(color='red', dash='dash')
))

fig_scatter.update_layout(
    title="Sentence Length vs Tagging Accuracy<br><sub>Are Joyce's longer sentences harder to tag?</sub>",
    xaxis_title="Sentence Length (tokens)",
    yaxis_title="Accuracy",
    height=500
)

fig_scatter.show()

# 4. DISTRIBUTION OF ACCURACIES - Shows the spread better than means
fig_dist = go.Figure()

for tool_name, accuracies in tool_performance.items():
    fig_dist.add_trace(go.Box(
        y=accuracies,
        name=tool_name,
        boxpoints='outliers'
    ))

fig_dist.update_layout(
    title="Distribution of Sentence-Level Accuracies<br><sub>Box plots showing quartiles and outliers</sub>",
    xaxis_title="NLP Tools",
    yaxis_title="Sentence Accuracy",
    yaxis=dict(range=[0, 1], tickformat='.0%'),
    height=500
)

fig_dist.show()

print("Enhanced visualizations complete!")

In [None]:
# ==============================================================================
# SAVE VISUALIZATIONS AS HTML FILES
# ==============================================================================

import os
from datetime import datetime

# Create results directory
results_dir = "nlp_validation_results"
os.makedirs(results_dir, exist_ok=True)

print(f"Saving visualizations to {results_dir}/ directory...")

# 1. Save Confidence Interval Plot
fig_ci.write_html(f"{results_dir}/confidence_intervals.html",
                  config={'displayModeBar': True, 'displaylogo': False})

# 2. Save Error Heatmap
fig_heatmap.write_html(f"{results_dir}/error_heatmap.html",
                       config={'displayModeBar': True, 'displaylogo': False})

# 3. Save Scatter Plot
fig_scatter.write_html(f"{results_dir}/sentence_length_analysis.html",
                       config={'displayModeBar': True, 'displaylogo': False})

# 4. Save Box Plot Distribution
fig_dist.write_html(f"{results_dir}/accuracy_distributions.html",
                    config={'displayModeBar': True, 'displaylogo': False})

# 5. Save Original Accuracy Comparison (from earlier)
fig.write_html(f"{results_dir}/accuracy_comparison.html",
               config={'displayModeBar': True, 'displaylogo': False})

# 6. Create a comprehensive dashboard HTML file
dashboard_html = f"""
<!DOCTYPE html>
<html>
<head>
    <title>NLP Validation Results: Joyce's Dubliners</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }}
        .header {{ background-color: #2c3e50; color: white; padding: 20px; margin-bottom: 20px; }}
        .summary {{ background-color: white; padding: 20px; margin-bottom: 20px; border-radius: 5px; }}
        .chart-container {{ background-color: white; margin-bottom: 20px; padding: 15px; border-radius: 5px; }}
        .chart-link {{ display: inline-block; background-color: #3498db; color: white; padding: 10px 20px;
                       text-decoration: none; border-radius: 5px; margin: 5px; }}
        .chart-link:hover {{ background-color: #2980b9; }}
        .stats-table {{ width: 100%; border-collapse: collapse; }}
        .stats-table th, .stats-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        .stats-table th {{ background-color: #f2f2f2; }}
    </style>
</head>
<body>
    <div class="header">
        <h1>NLP Tagging Validation for Joyce's Dubliners</h1>
        <p>Statistical Analysis of Modern NLP Tools vs Expert CLAWS7 Annotations</p>
        <p>Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>

    <div class="summary">
        <h2>Executive Summary</h2>
        <p><strong>Research Question:</strong> How accurately do modern NLP tools perform on James Joyce's syntactically complex literary prose?</p>
        <p><strong>Methodology:</strong> Comparison of spaCy, Flair, and TextBlob against expert CLAWS7 annotations on {len(batch_results)} simile sentences from Dubliners.</p>

        <h3>Key Findings:</h3>
        <ul>
            <li><strong>Low Overall Accuracy:</strong> Best tool (Flair) achieved only 63.0% token-level accuracy [61.7%, 64.3%]</li>
            <li><strong>Rare Perfect Sentences:</strong> Only 1.1-1.6% of sentences tagged perfectly</li>
            <li><strong>Systematic Underperformance:</strong> All tools significantly below claimed 95%+ accuracy on standard text</li>
            <li><strong>Statistical Significance:</strong> Tool differences are significant but practically negligible (Cohen's h = 0.100)</li>
        </ul>
    </div>

    <div class="summary">
        <h2>Statistical Results Summary</h2>
        <table class="stats-table">
            <tr><th>Tool</th><th>Token Accuracy</th><th>95% CI Lower</th><th>95% CI Upper</th><th>Perfect Sentences</th></tr>
"""

for tool_name, stats in wilson_results.items():
    dashboard_html += f"""
            <tr>
                <td><strong>{tool_name.upper()}</strong></td>
                <td>{stats['token_accuracy']:.3f}</td>
                <td>{stats['token_ci_lower']:.3f}</td>
                <td>{stats['token_ci_upper']:.3f}</td>
                <td>{stats['perfect_rate']:.1%}</td>
            </tr>
"""

dashboard_html += f"""
        </table>
        <p><strong>Sample Size:</strong> {wilson_results[list(wilson_results.keys())[0]]['total_tokens']:,} tokens across {len(batch_results)} sentences</p>
    </div>

    <div class="chart-container">
        <h2>Interactive Visualizations</h2>
        <p>Click on any chart below to open the full interactive version:</p>

        <a href="confidence_intervals.html" class="chart-link"> Confidence Intervals</a>
        <a href="error_heatmap.html" class="chart-link"> Error Heatmap</a>
        <a href="sentence_length_analysis.html" class="chart-link"> Length vs Accuracy</a>
        <a href="accuracy_distributions.html" class="chart-link"> Accuracy Distributions</a>
        <a href="accuracy_comparison.html" class="chart-link"> Tool Comparison</a>
    </div>

    <div class="summary">
        <h2>Critical Analysis</h2>
        <p>These Wilson confidence interval results provide compelling empirical evidence for the methodological concerns raised in computational literary studies debates. With token-level accuracies ranging from 58.2% [56.8%, 59.5%] to 63.0% [61.7%, 64.3%] across 5,437 tokens, modern NLP tools demonstrate systematic underperformance on Joyce's literary prose compared to their claimed 95%+ accuracy on standard text. The extraordinarily low perfect sentence rates (1.1-1.6%) with wide confidence intervals [0.3%, 4.7%] reveal that flawless automated tagging of Joyce's syntactically complex sentences is statistically rare, occurring in fewer than 1 in 20 cases. While Flair's superiority over spaCy achieves statistical significance (p < 0.001), the negligible effect size (Cohen's h = 0.100) demonstrates that technological improvements yield practically minimal gains when confronting modernist literary language. This statistical validation supports Da's critique that computational literary analysis faces fundamental limitations with complex literary texts, while simultaneously validating Wallis's methodological framework for robust corpus linguistic research. The consistent underperformance across all tools—despite their neural architectures and contextual embeddings—suggests that Joyce's stylistic innovations create systematic challenges for automated linguistic analysis that transcend individual algorithmic approaches. These findings provide quantitative evidence that expert linguistic annotation remains essential for literary corpus analysis, particularly when dealing with texts that deliberately exploit syntactic ambiguity and narrative voice complexity as aesthetic strategies.</p>
    </div>

    <div class="summary">
        <h2>Most Common Error Patterns</h2>
        <ul>
            <li><strong>PPHS1→PPIS1:</strong> 152-153 occurrences (3rd person pronouns misclassified as 1st person)</li>
            <li><strong>IO→II:</strong> 105-119 occurrences ("of" preposition misclassified as general preposition)</li>
            <li><strong>VVI→VV0:</strong> 95-102 occurrences (infinitive verbs misclassified as base form)</li>
        </ul>
        <p>These systematic errors across all tools suggest fundamental challenges with Joyce's indirect free discourse and syntactic complexity.</p>
    </div>

    <div class="summary">
        <h2>Technical Details</h2>
        <p><strong>Confidence Intervals:</strong> Wilson score intervals used for robust estimation with small samples</p>
        <p><strong>Statistical Tests:</strong> Two-proportion z-tests for significance testing between tools</p>
        <p><strong>Effect Size:</strong> Cohen's h for meaningful difference assessment</p>
        <p><strong>Corpus:</strong> Expert CLAWS7 annotations from close reading analysis of Dubliners similes</p>
    </div>
</body>
</html>
"""

# Save dashboard
with open(f"{results_dir}/index.html", "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Create a ZIP file for easy download
import zipfile

zip_filename = f"nlp_validation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for root, dirs, files in os.walk(results_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, results_dir)
            zipf.write(file_path, arcname)

print(f" All visualizations saved!")
print(f" Files saved to: {results_dir}/")
print(f" Open index.html in your browser for the complete dashboard")
print(f" Download {zip_filename} for all files")

# Download the ZIP file in Colab
from google.colab import files
files.download(zip_filename)

print(f"\\n Browser Instructions:")
print(f"1. Download and extract {zip_filename}")
print(f"2. Open 'index.html' in any web browser")
print(f"3. Click on individual chart links for interactive visualizations")
print(f"4. All charts are fully interactive with zoom, pan, and hover features")# ==============================================================================
# SAVE VISUALIZATIONS AS HTML FILES
# ==============================================================================

import os
from datetime import datetime

# Create results directory
results_dir = "nlp_validation_results"
os.makedirs(results_dir, exist_ok=True)

print(f"Saving visualizations to {results_dir}/ directory...")

# 1. Save Confidence Interval Plot
fig_ci.write_html(f"{results_dir}/confidence_intervals.html",
                  config={'displayModeBar': True, 'displaylogo': False})

# 2. Save Error Heatmap
fig_heatmap.write_html(f"{results_dir}/error_heatmap.html",
                       config={'displayModeBar': True, 'displaylogo': False})

# 3. Save Scatter Plot
fig_scatter.write_html(f"{results_dir}/sentence_length_analysis.html",
                       config={'displayModeBar': True, 'displaylogo': False})

# 4. Save Box Plot Distribution
fig_dist.write_html(f"{results_dir}/accuracy_distributions.html",
                    config={'displayModeBar': True, 'displaylogo': False})

# 5. Save Original Accuracy Comparison (from earlier)
fig.write_html(f"{results_dir}/accuracy_comparison.html",
               config={'displayModeBar': True, 'displaylogo': False})

# 6. Create a comprehensive dashboard HTML file
dashboard_html = f"""
<!DOCTYPE html>
<html>
<head>
    <title>NLP Validation Results: Joyce's Dubliners</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; background-color: #f5f5f5; }}
        .header {{ background-color: #2c3e50; color: white; padding: 20px; margin-bottom: 20px; }}
        .summary {{ background-color: white; padding: 20px; margin-bottom: 20px; border-radius: 5px; }}
        .chart-container {{ background-color: white; margin-bottom: 20px; padding: 15px; border-radius: 5px; }}
        .chart-link {{ display: inline-block; background-color: #3498db; color: white; padding: 10px 20px;
                       text-decoration: none; border-radius: 5px; margin: 5px; }}
        .chart-link:hover {{ background-color: #2980b9; }}
        .stats-table {{ width: 100%; border-collapse: collapse; }}
        .stats-table th, .stats-table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
        .stats-table th {{ background-color: #f2f2f2; }}
    </style>
</head>
<body>
    <div class="header">
        <h1>NLP Tagging Validation for Joyce's Dubliners</h1>
        <p>Statistical Analysis of Modern NLP Tools vs Expert CLAWS7 Annotations</p>
        <p>Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>

    <div class="summary">
        <h2>Executive Summary</h2>
        <p><strong>Research Question:</strong> How accurately do modern NLP tools perform on James Joyce's syntactically complex literary prose?</p>
        <p><strong>Methodology:</strong> Comparison of spaCy, Flair, and TextBlob against expert CLAWS7 annotations on {len(batch_results)} simile sentences from Dubliners.</p>

        <h3>Key Findings:</h3>
        <ul>
            <li><strong>Low Overall Accuracy:</strong> Best tool (Flair) achieved only 63.0% token-level accuracy [61.7%, 64.3%]</li>
            <li><strong>Rare Perfect Sentences:</strong> Only 1.1-1.6% of sentences tagged perfectly</li>
            <li><strong>Systematic Underperformance:</strong> All tools significantly below claimed 95%+ accuracy on standard text</li>
            <li><strong>Statistical Significance:</strong> Tool differences are significant but practically negligible (Cohen's h = 0.100)</li>
        </ul>
    </div>

    <div class="summary">
        <h2>Statistical Results Summary</h2>
        <table class="stats-table">
            <tr><th>Tool</th><th>Token Accuracy</th><th>95% CI Lower</th><th>95% CI Upper</th><th>Perfect Sentences</th></tr>
"""

for tool_name, stats in wilson_results.items():
    dashboard_html += f"""
            <tr>
                <td><strong>{tool_name.upper()}</strong></td>
                <td>{stats['token_accuracy']:.3f}</td>
                <td>{stats['token_ci_lower']:.3f}</td>
                <td>{stats['token_ci_upper']:.3f}</td>
                <td>{stats['perfect_rate']:.1%}</td>
            </tr>
"""

dashboard_html += f"""
        </table>
        <p><strong>Sample Size:</strong> {wilson_results[list(wilson_results.keys())[0]]['total_tokens']:,} tokens across {len(batch_results)} sentences</p>
    </div>

    <div class="chart-container">
        <h2>Interactive Visualizations</h2>
        <p>Click on any chart below to open the full interactive version:</p>

        <a href="confidence_intervals.html" class="chart-link"> Confidence Intervals</a>
        <a href="error_heatmap.html" class="chart-link"> Error Heatmap</a>
        <a href="sentence_length_analysis.html" class="chart-link"> Length vs Accuracy</a>
        <a href="accuracy_distributions.html" class="chart-link"> Accuracy Distributions</a>
        <a href="accuracy_comparison.html" class="chart-link"> Tool Comparison</a>
    </div>

    <div class="summary">
        <h2>Critical Analysis</h2>
        <p>These Wilson confidence interval results provide compelling empirical evidence for the methodological concerns raised in computational literary studies debates. With token-level accuracies ranging from 58.2% [56.8%, 59.5%] to 63.0% [61.7%, 64.3%] across 5,437 tokens, modern NLP tools demonstrate systematic underperformance on Joyce's literary prose compared to their claimed 95%+ accuracy on standard text. The extraordinarily low perfect sentence rates (1.1-1.6%) with wide confidence intervals [0.3%, 4.7%] reveal that flawless automated tagging of Joyce's syntactically complex sentences is statistically rare, occurring in fewer than 1 in 20 cases. While Flair's superiority over spaCy achieves statistical significance (p < 0.001), the negligible effect size (Cohen's h = 0.100) demonstrates that technological improvements yield practically minimal gains when confronting modernist literary language. This statistical validation supports Da's critique that computational literary analysis faces fundamental limitations with complex literary texts, while simultaneously validating Wallis's methodological framework for robust corpus linguistic research. The consistent underperformance across all tools—despite their neural architectures and contextual embeddings—suggests that Joyce's stylistic innovations create systematic challenges for automated linguistic analysis that transcend individual algorithmic approaches. These findings provide quantitative evidence that expert linguistic annotation remains essential for literary corpus analysis, particularly when dealing with texts that deliberately exploit syntactic ambiguity and narrative voice complexity as aesthetic strategies.</p>
    </div>

    <div class="summary">
        <h2>Most Common Error Patterns</h2>
        <ul>
            <li><strong>PPHS1→PPIS1:</strong> 152-153 occurrences (3rd person pronouns misclassified as 1st person)</li>
            <li><strong>IO→II:</strong> 105-119 occurrences ("of" preposition misclassified as general preposition)</li>
            <li><strong>VVI→VV0:</strong> 95-102 occurrences (infinitive verbs misclassified as base form)</li>
        </ul>
        <p>These systematic errors across all tools suggest fundamental challenges with Joyce's indirect free discourse and syntactic complexity.</p>
    </div>

    <div class="summary">
        <h2>Technical Details</h2>
        <p><strong>Confidence Intervals:</strong> Wilson score intervals used for robust estimation with small samples</p>
        <p><strong>Statistical Tests:</strong> Two-proportion z-tests for significance testing between tools</p>
        <p><strong>Effect Size:</strong> Cohen's h for meaningful difference assessment</p>
        <p><strong>Corpus:</strong> Expert CLAWS7 annotations from close reading analysis of Dubliners similes</p>
    </div>
</body>
</html>
"""

# Save dashboard
with open(f"{results_dir}/index.html", "w", encoding="utf-8") as f:
    f.write(dashboard_html)

# Create a ZIP file for easy download
import zipfile

zip_filename = f"nlp_validation_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for root, dirs, files in os.walk(results_dir):
        for file in files:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, results_dir)
            zipf.write(file_path, arcname)

print(f"All visualizations saved!")
print(f"Files saved to: {results_dir}/")
print(f"Open index.html in your browser for the complete dashboard")
print(f"Download {zip_filename} for all files")

# Download the ZIP file in Colab
from google.colab import files
files.download(zip_filename)

print(f"\\n Browser Instructions:")
print(f"1. Download and extract {zip_filename}")
print(f"2. Open 'index.html' in any web browser")
print(f"3. Click on individual chart links for interactive visualizations")
print(f"4. All charts are fully interactive with zoom, pan, and hover features")