# Interactive Visualization Framework for Joyce Simile Analysis
## Notebook 3: Advanced Data Visualization

This notebook creates comprehensive interactive visualizations for Joyce simile research, including:

### Visualization Components:
- **Interactive Network Graphs** (Category-Comparator Relationships)
- **Wilson Score Confidence Intervals** (Statistical validation)
- **Topic Modeling Visualizations** (Thematic analysis)
- **Bee Swarm Plots** (Token distribution patterns)
- **Multi-dimensional Heatmaps** (Linguistic feature analysis)
- **Joyce vs BNC Comparative Analysis** (Innovation vs standard usage)
- **Interactive Dashboard** (Complete presentation framework)

### Research Focus:
Demonstrates Joyce's stylistic innovations through computational analysis and statistical validation, comparing simile patterns in *Dubliners* against British National Corpus baseline.

### Dependencies:
Requires: `comprehensive_linguistic_analysis.csv` (output from Notebook 2)

### Outputs:
- Interactive HTML visualizations
- Complete dashboard for thesis presentation
- Publication-ready figures with statistical validation

In [7]:
# =============================================================================
# INTERACTIVE VISUALIZATION FRAMEWORK FOR JOYCE SIMILE ANALYSIS
# =============================================================================

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo
import networkx as nx
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

print("INTERACTIVE VISUALIZATION FRAMEWORK FOR JOYCE SIMILE ANALYSIS")
print("=" * 65)

class JoyceSimileVisualizer:
    """Advanced visualization framework for Joyce simile research."""

    def __init__(self, data_path="comprehensive_linguistic_analysis.csv"):
        self.data_path = data_path
        self.df = None
        self.figures = {}
        self.load_data()

INTERACTIVE VISUALIZATION FRAMEWORK FOR JOYCE SIMILE ANALYSIS


In [8]:
def load_data(self):
        """Load and prepare data for visualization."""
        print("\nLOADING COMPREHENSIVE ANALYSIS DATA")
        print("-" * 37)

        try:
            self.df = pd.read_csv(self.data_path)
            print(f"Data loaded successfully: {len(self.df)} instances")

            # Clean and prepare data
            self.df['Dataset'] = self.df['Original_Dataset'].fillna(self.df['Dataset_Source'])
            self.df = self.df.dropna(subset=['Category_Framework'])

            print(f"Data prepared for visualization: {len(self.df)} instances")

        except Exception as e:
            print(f"Error loading data: {e}")
            return None

In [32]:
# Load data for network graph
df = pd.read_csv("comprehensive_linguistic_analysis.csv")
df['Dataset'] = df['Original_Dataset'].fillna(df['Dataset_Source'])
df = df.dropna(subset=['Category_Framework'])

def create_network_graph(df):

    print("\nCREATING INTERACTIVE NETWORK GRAPH")
    print("-" * 36)

    # Create network graph
    G = nx.Graph()

    # Initialize edge_weights dictionary
    edge_weights = {}

    # Add nodes and calculate edge weights
    for _, row in df.iterrows():
        category = f"Cat_{row['Category_Framework']}"
        comparator = f"Comp_{row['Comparator_Type']}"
        dataset = f"Data_{row['Dataset']}"

        # Add nodes
        G.add_node(category, type='category', size=10)
        G.add_node(comparator, type='comparator', size=8)
        G.add_node(dataset, type='dataset', size=12)

        # Define edges for this row
        edges = [(category, comparator), (category, dataset), (comparator, dataset)]

        # Calculate edge weights
        for edge in edges:
            if edge in edge_weights:
                edge_weights[edge] += 1
            else:
                edge_weights[edge] = 1

    # Add edges to graph (without weighting for visualization in this version, but weights are calculated)
    for (node1, node2), weight in edge_weights.items():
        G.add_edge(node1, node2, weight=weight)


    # Calculate layout - Using spring_layout with adjusted k
    layout_k = 3.5 # Slightly increase optimal distance
    layout_iterations = 50
    layout_seed = 42

    print(f"Calculating layout with spring_layout (k={layout_k}, iterations={layout_iterations}, seed={layout_seed})")
    pos = nx.spring_layout(G, k=layout_k, iterations=layout_iterations, seed=layout_seed)


    # Prepare traces for Plotly
    edge_x = []
    edge_y = []

    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])


    # Create node traces by type
    # Updated colors as requested: bright green for comparator, light pink for dataset, bright orange for category
    colors = {'category': '#FF7F00', 'comparator': '#39FF14', 'dataset': '#FFB6C1'} # Changed dataset color to light pink


    fig = go.Figure()

    # Add edges (without weighting)
    fig.add_trace(go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=1, color='#888'), # Reverted to fixed width
        hoverinfo='none',
        mode='lines',
        showlegend=False
    ))

    # Add nodes by type
    for node_type in ['category', 'comparator', 'dataset']:
        node_x = []
        node_y = []
        node_text = []
        node_info = []

        for node in G.nodes():
            if G.nodes[node]['type'] == node_type:
                x, y = pos[node]
                node_x.append(x)
                node_y.append(y)

                # Clean node name for display
                display_name = node.replace('Cat_', '').replace('Comp_', '').replace('Data_', '')
                node_text.append(display_name)

                # Calculate node degree for info
                degree = G.degree(node)
                node_info.append(f"{display_name}<br>Connections: {degree}")

        fig.add_trace(go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            text=node_text,
            textposition="middle center",
            hovertext=node_info,
            hoverinfo='text',
            marker=dict(
                size=[G.degree(node) * 3 + 10 for node in G.nodes() if G.nodes[node]['type'] == node_type],
                color=colors[node_type],
                line=dict(width=2, color='white')
            ),
            name=node_type.title()
        ))

    fig.update_layout(
        title={
            'text': "Interactive Network Graph: Simile Categories, Comparators & Datasets",
            'x': 0.5,
            'font': {'size': 16}
        },
        showlegend=True,
        hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        plot_bgcolor='white',
        height=600
    )

    # Save and display
    filename = "joyce_simile_network_graph.html"
    fig.write_html(filename)
    fig.show()

    # Add the download instruction
    from google.colab import files
    files.download(filename)


    print("Interactive network graph created and saved")
    return fig

# Execute network graph creation
network_fig = create_network_graph(df)


CREATING INTERACTIVE NETWORK GRAPH
------------------------------------
Calculating layout with spring_layout (k=3.5, iterations=50, seed=42)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Interactive network graph created and saved


In [15]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.offline as pyo
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')
from google.colab import files # Import files module for download


def create_corrected_category_analysis(df):
    """Create corrected category-dataset analysis showing proper distributions."""
    print("\nCREATING CORRECTED CATEGORY-DATASET ANALYSIS")
    print("-" * 47)

    # Debug: Check the actual data distribution
    print("Dataset distribution:")
    print(df['Dataset'].value_counts())
    print("\nCategory distribution by dataset:")
    print(df.groupby(['Dataset', 'Category_Framework']).size().unstack(fill_value=0))

    # Create side-by-side comparison
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Category Distribution by Dataset (Stacked)',
            'Dataset Distribution by Category',
            'Proportional Analysis (Joyce vs BNC)',
            'Top Comparators by Dataset'
        ),
        specs=[[{"type": "bar"}, {"type": "bar"}],
               [{"type": "bar"}, {"type": "bar"}]]
    )

    # 1. Stacked bar chart by dataset
    category_dataset = df.groupby(['Dataset', 'Category_Framework']).size().reset_index(name='count')

    datasets = ['manual', 'computational', 'bnc']
    colors_cat = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FECA57']

    for i, category in enumerate(df['Category_Framework'].unique()):
        cat_data = category_dataset[category_dataset['Category_Framework'] == category]

        # Ensure all datasets are represented
        dataset_counts = []
        for dataset in datasets:
            count = cat_data[cat_data['Dataset'] == dataset]['count'].sum()
            dataset_counts.append(count)

        fig.add_trace(go.Bar(
            x=datasets,
            y=dataset_counts,
            name=category,
            marker_color=colors_cat[i % len(colors_cat)],
            hovertemplate=f"<b>{category}</b><br>" +
                         "Dataset: %{x}<br>" +
                         "Count: %{y}<br>" +
                         "<extra></extra>"
        ), row=1, col=1)

    # 2. Dataset distribution by category
    for i, dataset in enumerate(datasets):
        dataset_data = category_dataset[category_dataset['Dataset'] == dataset]

        fig.add_trace(go.Bar(
            x=dataset_data['Category_Framework'],
            y=dataset_data['count'],
            name=f"{dataset.title()}",
            marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1'][i],
            showlegend=False,
            hovertemplate=f"<b>{dataset.title()}</b><br>" +
                         "Category: %{x}<br>" +
                         "Count: %{y}<br>" +
                         "<extra></extra>"
        ), row=1, col=2)

    # 3. Proportional analysis (Joyce combined vs BNC)
    joyce_df = df[df['Dataset'].isin(['manual', 'computational'])]
    bnc_df = df[df['Dataset'] == 'bnc']

    joyce_props = joyce_df['Category_Framework'].value_counts(normalize=True)
    bnc_props = bnc_df['Category_Framework'].value_counts(normalize=True)

    all_categories = sorted(set(joyce_props.index) | set(bnc_props.index))

    joyce_values = [joyce_props.get(cat, 0) for cat in all_categories]
    bnc_values = [bnc_props.get(cat, 0) for cat in all_categories]

    fig.add_trace(go.Bar(
        x=all_categories,
        y=joyce_values,
        name="Joyce (Manual + Computational)",
        marker_color='#FF6B6B',
        showlegend=False,
        hovertemplate="<b>Joyce</b><br>" +
                     "Category: %{x}<br>" +
                     "Proportion: %{y:.3f}<br>" +
                     "<extra></extra>"
    ), row=2, col=1)

    fig.add_trace(go.Bar(
        x=all_categories,
        y=bnc_values,
        name="BNC Baseline",
        marker_color='#45B7D1',
        showlegend=False,
        hovertemplate="<b>BNC</b><br>" +
                     "Category: %{x}<br>" +
                     "Proportion: %{y:.3f}<br>" +
                     "<extra></extra>"
    ), row=2, col=1)

    # 4. Top comparators by dataset
    top_comparators = df['Comparator_Type'].value_counts().head(5).index

    for i, dataset in enumerate(datasets):
        dataset_comps = df[df['Dataset'] == dataset]['Comparator_Type'].value_counts()

        # Filter to top comparators only
        filtered_comps = dataset_comps[dataset_comps.index.isin(top_comparators)]

        fig.add_trace(go.Bar(
            x=filtered_comps.index,
            y=filtered_comps.values,
            name=f"{dataset.title()}",
            marker_color=['#FF6B6B', '#4ECDC4', '#45B7D1'][i],
            showlegend=False,
            hovertemplate=f"<b>{dataset.title()}</b><br>" +
                         "Comparator: %{x}<br>" +
                         "Count: %{y}<br>" +
                         "<extra></extra>"
        ), row=2, col=2)

    # Update layout
    fig.update_layout(
        title={
            'text': "Comprehensive Category-Dataset Analysis: Joyce vs BNC Patterns",
            'x': 0.5,
            'font': {'size': 16}
        },
        height=800,
        barmode='group'  # This ensures bars are grouped, not stacked
    )

    # Update axis labels
    fig.update_xaxes(title_text="Dataset", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)

    fig.update_xaxes(title_text="Category", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=2)

    fig.update_xaxes(title_text="Category", row=2, col=1)
    fig.update_yaxes(title_text="Proportion", row=2, col=1)

    fig.update_xaxes(title_text="Comparator", row=2, col=2)
    fig.update_yaxes(title_text="Count", row=2, col=2)

    filename = "joyce_corrected_analysis.html"
    fig.write_html(filename)
    fig.show()

    # Add the download instruction
    from google.colab import files
    files.download(filename)


    print("Interactive network graph created and saved")
    return fig

# Create corrected analysis
corrected_fig = create_corrected_category_analysis(df)


CREATING CORRECTED CATEGORY-DATASET ANALYSIS
-----------------------------------------------
Dataset distribution:
Dataset
computational    218
bnc              200
manual           183
Name: count, dtype: int64

Category distribution by dataset:
Category_Framework  Joycean_Framed  Joycean_Quasi  Joycean_Quasi_Fuzzy  \
Dataset                                                                  
bnc                              0              0                    0   
computational                    4             47                   14   
manual                          18             53                   13   

Category_Framework  Joycean_Silent  Standard  
Dataset                                       
bnc                              0       200  
computational                    3       150  
manual                           6        93  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Interactive network graph created and saved


In [None]:
def create_wilson_f1_visualization(simile_data):

    print("\nCREATING WILSON SCORE & F1 SCORE VISUALIZATION")
    print("-" * 48)

    # Calculate Wilson Score intervals
    confidence_level = 0.95
    z_score = stats.norm.ppf((1 + confidence_level) / 2)

    # Prepare data for Wilson intervals
    interval_data = []

    for dataset in simile_data['Dataset'].unique():
        dataset_df = simile_data[simile_data['Dataset'] == dataset]
        category_counts = dataset_df['Category_Framework'].value_counts()
        total = len(dataset_df)

        for category, count in category_counts.items():
            p = count / total
            n = total

            # Wilson Score Interval calculation
            denominator = 1 + z_score**2 / n
            center = (p + z_score**2 / (2*n)) / denominator
            width = z_score * np.sqrt(p*(1-p)/n + z_score**2/(4*n**2)) / denominator

            lower_bound = max(0, center - width)
            upper_bound = min(1, center + width)

            interval_data.append({
                'Dataset': dataset,
                'Category': category,
                'Proportion': p,
                'Lower_CI': lower_bound,
                'Upper_CI': upper_bound,
                'Count': count,
                'Total': total,
                'Interval_Width': upper_bound - lower_bound
            })

    intervals_df = pd.DataFrame(interval_data)

    # Calculate F1 scores
    manual_df = simile_data[simile_data['Dataset'] == 'manual']
    comp_df = simile_data[simile_data['Dataset'] == 'computational']

    f1_data = []
    if len(manual_df) > 0 and len(comp_df) > 0:
        manual_cats = manual_df['Category_Framework'].value_counts()
        comp_cats = comp_df['Category_Framework'].value_counts()

        all_categories = sorted(set(manual_cats.index) | set(comp_cats.index))

        for category in all_categories:
            manual_count = manual_cats.get(category, 0)
            comp_count = comp_cats.get(category, 0)

            if comp_count > 0:
                precision = min(manual_count / comp_count, 1.0)
            else:
                precision = 0.0

            if manual_count > 0:
                recall = min(comp_count / manual_count, 1.0)
            else:
                recall = 0.0

            if precision + recall > 0:
                f1 = 2 * (precision * recall) / (precision + recall)
            else:
                f1 = 0.0

            f1_data.append({
                'Category': category,
                'Precision': precision,
                'Recall': recall,
                'F1_Score': f1,
                'Manual_Count': manual_count,
                'Computational_Count': comp_count
            })

    f1_df = pd.DataFrame(f1_data)

    # Create figure
    fig = make_subplots(
        rows=3, cols=2,
        subplot_titles=(
            'Wilson Score Confidence Intervals by Dataset',
            'F1 Performance Metrics by Category',
            'Category Proportions with Error Bars',
            'Precision vs Recall Analysis',
            'Confidence Interval Widths',
            'Algorithm Performance Summary'
        ),
        specs=[[{"type": "scatter"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "scatter"}],
               [{"type": "bar"}, {"type": "table"}]]
    )

    colors = {'manual': '#FF6B6B', 'computational': '#4ECDC4', 'bnc': '#45B7D1'}

    # 1. Wilson Score Confidence Intervals
    for i, dataset in enumerate(intervals_df['Dataset'].unique()):
        dataset_data = intervals_df[intervals_df['Dataset'] == dataset]
        dataset_data = dataset_data.sort_values('Proportion', ascending=True)

        y_positions = np.arange(len(dataset_data)) + i * 0.3

        fig.add_trace(go.Scatter(
            x=dataset_data['Proportion'],
            y=y_positions,
            mode='markers',
            marker=dict(
                size=12,
                color=colors.get(dataset, '#666666'),
                symbol='circle'
            ),
            name=f"{dataset.title()}",
            hovertemplate="<b>%{text}</b><br>" +
                         f"Dataset: {dataset}<br>" +
                         "Proportion: %{x:.3f}<br>" +
                         "95% CI: [%{customdata[0]:.3f}, %{customdata[1]:.3f}]<br>" +
                         "Count: %{customdata[2]} / %{customdata[3]}<br>" +
                         "<extra></extra>",
            text=dataset_data['Category'],
            customdata=dataset_data[['Lower_CI', 'Upper_CI', 'Count', 'Total']].values,
            showlegend=(i == 0)
        ), row=1, col=1)

        # Add confidence interval lines
        for j, (_, row) in enumerate(dataset_data.iterrows()):
            fig.add_shape(
                type="line",
                x0=row['Lower_CI'], y0=y_positions[j],
                x1=row['Upper_CI'], y1=y_positions[j],
                line=dict(color=colors.get(dataset, '#666666'), width=3),
                row=1, col=1
            )

    # 2. F1 Performance Metrics
    if len(f1_df) > 0:
        # F1 Score bars
        fig.add_trace(go.Bar(
            x=f1_df['Category'],
            y=f1_df['F1_Score'],
            name='F1 Score',
            marker_color='#FF6B6B',
            text=f1_df['F1_Score'].round(3),
            textposition='auto',
            hovertemplate="<b>%{x}</b><br>" +
                         "F1 Score: %{y:.3f}<br>" +
                         "Manual Count: %{customdata[0]}<br>" +
                         "Computational Count: %{customdata[1]}<br>" +
                         "<extra></extra>",
            customdata=f1_df[['Manual_Count', 'Computational_Count']].values,
            showlegend=True
        ), row=1, col=2)

        # Precision bars
        fig.add_trace(go.Bar(
            x=f1_df['Category'],
            y=f1_df['Precision'],
            name='Precision',
            marker_color='#4ECDC4',
            opacity=0.7,
            showlegend=True,
            hovertemplate="<b>%{x}</b><br>" +
                         "Precision: %{y:.3f}<br>" +
                         "<extra></extra>"
        ), row=1, col=2)

        # Recall bars
        fig.add_trace(go.Bar(
            x=f1_df['Category'],
            y=f1_df['Recall'],
            name='Recall',
            marker_color='#45B7D1',
            opacity=0.7,
            showlegend=True,
            hovertemplate="<b>%{x}</b><br>" +
                         "Recall: %{y:.3f}<br>" +
                         "<extra></extra>"
        ), row=1, col=2)

    # 3. Category Proportions with Error Bars
    for dataset in intervals_df['Dataset'].unique():
        dataset_data = intervals_df[intervals_df['Dataset'] == dataset]

        error_y = dict(
            type='data',
            symmetric=False,
            array=dataset_data['Upper_CI'] - dataset_data['Proportion'],
            arrayminus=dataset_data['Proportion'] - dataset_data['Lower_CI'],
            color=colors.get(dataset, '#666666'),
            thickness=2,
            width=3
        )

        fig.add_trace(go.Scatter(
            x=dataset_data['Category'],
            y=dataset_data['Proportion'],
            mode='markers',
            marker=dict(
                size=10,
                color=colors.get(dataset, '#666666')
            ),
            error_y=error_y,
            name=f"{dataset.title()} ±95% CI",
            showlegend=False,
            hovertemplate="<b>%{x}</b><br>" +
                         f"Dataset: {dataset}<br>" +
                         "Proportion: %{y:.3f}<br>" +
                         "<extra></extra>"
        ), row=2, col=1)

    # 4. Precision vs Recall scatter - WITH SHORTENED COLORBAR
    if len(f1_df) > 0:
        fig.add_trace(go.Scatter(
            x=f1_df['Recall'],
            y=f1_df['Precision'],
            mode='markers+text',
            text=f1_df['Category'],
            textposition="top center",
            marker=dict(
                size=f1_df['F1_Score'] * 30 + 10,
                color=f1_df['F1_Score'],
                colorscale='RdYlBu_r',
                showscale=True,
                colorbar=dict(title="F1 Score", len=0.3)  # SHORTENED COLORBAR
            ),
            name='Categories',
            showlegend=False,
            hovertemplate="<b>%{text}</b><br>" +
                         "Precision: %{y:.3f}<br>" +
                         "Recall: %{x:.3f}<br>" +
                         "F1 Score: %{marker.color:.3f}<br>" +
                         "<extra></extra>"
        ), row=2, col=2)

        # Add diagonal reference line
        fig.add_trace(go.Scatter(
            x=[0, 1],
            y=[0, 1],
            mode='lines',
            line=dict(dash='dash', color='gray'),
            showlegend=False,
            hoverinfo='skip'
        ), row=2, col=2)

    # 5. Confidence Interval Widths
    interval_widths = intervals_df.groupby('Dataset')['Interval_Width'].mean().reset_index()

    fig.add_trace(go.Bar(
        x=interval_widths['Dataset'],
        y=interval_widths['Interval_Width'],
        marker_color=[colors.get(d, '#666666') for d in interval_widths['Dataset']],
        text=interval_widths['Interval_Width'].round(3),
        textposition='auto',
        showlegend=False,
        hovertemplate="<b>%{x}</b><br>" +
                     "Avg CI Width: %{y:.3f}<br>" +
                     "<extra></extra>"
    ), row=3, col=1)

    # 6. Performance Summary Table
    if len(f1_df) > 0:
        overall_f1 = f1_df['F1_Score'].mean()
        overall_precision = f1_df['Precision'].mean()
        overall_recall = f1_df['Recall'].mean()

        if overall_f1 > 0.8:
            performance = "Excellent"
        elif overall_f1 > 0.6:
            performance = "Good"
        elif overall_f1 > 0.4:
            performance = "Moderate"
        else:
            performance = "Poor"

        summary_data = [
            ['Metric', 'Value', 'Interpretation'],
            ['Overall F1 Score', f'{overall_f1:.3f}', performance],
            ['Average Precision', f'{overall_precision:.3f}', 'Algorithm Accuracy'],
            ['Average Recall', f'{overall_recall:.3f}', 'Coverage Completeness'],
            ['Categories Analyzed', str(len(f1_df)), 'Theoretical Framework'],
            ['Manual Instances', str(len(manual_df)), 'Ground Truth Size'],
            ['Computational Instances', str(len(comp_df)), 'Algorithm Output']
        ]

        fig.add_trace(go.Table(
            header=dict(
                values=summary_data[0],
                fill_color='lightgray',
                align='center',
                font=dict(size=12, color='black')
            ),
            cells=dict(
                values=list(zip(*summary_data[1:])),
                fill_color='white',
                align=['center', 'center', 'left'],
                font=dict(size=11)
            )
        ), row=3, col=2)

    # Update layout
    fig.update_layout(
        title={
            'text': "Statistical Validation: Wilson Score Intervals & F1 Performance Analysis",
            'x': 0.5,
            'font': {'size': 18}
        },
        height=1200,
        showlegend=True
    )

    # Update axis labels
    fig.update_xaxes(title_text="Proportion", row=1, col=1)
    fig.update_yaxes(title_text="Categories", row=1, col=1)
    fig.update_xaxes(title_text="Categories", row=1, col=2)
    fig.update_yaxes(title_text="Score", row=1, col=2)
    fig.update_xaxes(title_text="Categories", row=2, col=1)
    fig.update_yaxes(title_text="Proportion", row=2, col=1)
    fig.update_xaxes(title_text="Recall", row=2, col=2)
    fig.update_yaxes(title_text="Precision", row=2, col=2)
    fig.update_xaxes(title_text="Dataset", row=3, col=1)
    fig.update_yaxes(title_text="Avg CI Width", row=3, col=1)

    filename = "joyce_wilson_f1_analysis.html"
    fig.write_html(filename)
    fig.show()

    # Add the download instruction
    from google.colab import files
    files.download(filename)


    print("Wilson Score & F1 visualization created and downloaded successfully")
    return fig, intervals_df, f1_df

# Execute Wilson Score and F1 visualization
wilson_f1_fig, intervals_data, f1_results = create_wilson_f1_visualization(df)


CREATING WILSON SCORE & F1 SCORE VISUALIZATION
------------------------------------------------


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Wilson Score & F1 visualization created and downloaded successfully


In [None]:
def create_token_distribution_analysis(simile_data):
    """
    Analyze token distribution patterns across Joyce and BNC datasets.

    This visualization reveals structural differences in simile construction
    between Joyce's innovative patterns and standard English usage.
    """
    print("\nANALYZING TOKEN DISTRIBUTION PATTERNS")
    print("-" * 39)

    # Filter to valid token data and create working dataset
    valid_tokens = simile_data[
        (simile_data['Pre_Comparator_Tokens'].notna()) &
        (simile_data['Post_Comparator_Tokens'].notna()) &
        (simile_data['Total_Tokens'].notna())
    ].copy()

    print(f"Valid token data: {len(valid_tokens)} similes")

    # Calculate derived metrics for analysis
    valid_tokens['structure_type'] = valid_tokens['Pre_Post_Ratio'].apply(
        lambda x: 'Front-Heavy' if x > 1.2 else ('Back-Heavy' if x < 0.8 else 'Balanced')
    )

    # Create comprehensive token analysis figure
    token_fig = make_subplots(
        rows=2, cols=3,
        subplot_titles=(
            'Pre-Comparator Token Distribution',
            'Post-Comparator Token Distribution',
            'Pre/Post Ratio Patterns',
            'Total Length by Dataset',
            'Structural Types Distribution',
            'Length vs Complexity Analysis'
        ),
        specs=[[{"type": "box"}, {"type": "box"}, {"type": "violin"}],
               [{"type": "box"}, {"type": "bar"}, {"type": "scatter"}]]
    )

    dataset_colors = {'manual': '#E74C3C', 'computational': '#3498DB', 'bnc': '#2ECC71'}
    dataset_names = ['manual', 'computational', 'bnc']

    # 1. Pre-comparator token distribution with bee swarm effect
    for dataset_idx, dataset_name in enumerate(dataset_names):
        dataset_subset = valid_tokens[valid_tokens['Dataset'] == dataset_name]

        if len(dataset_subset) > 0:
            token_fig.add_trace(go.Box(
                y=dataset_subset['Pre_Comparator_Tokens'],
                name=f"{dataset_name.title()}",
                boxpoints='all',  # Shows all points like bee swarm
                pointpos=0,       # Centers points on box
                jitter=0.5,       # Adds horizontal spread
                marker=dict(
                    color=dataset_colors[dataset_name],
                    size=4,
                    opacity=0.6
                ),
                line=dict(color=dataset_colors[dataset_name]),
                showlegend=(dataset_idx == 0),
                hovertemplate=f"<b>{dataset_name.title()}</b><br>" +
                             "Pre-tokens: %{y}<br>" +
                             "<extra></extra>"
            ), row=1, col=1)

    # 2. Post-comparator token distribution
    for dataset_name in dataset_names:
        dataset_subset = valid_tokens[valid_tokens['Dataset'] == dataset_name]

        if len(dataset_subset) > 0:
            token_fig.add_trace(go.Box(
                y=dataset_subset['Post_Comparator_Tokens'],
                name=f"{dataset_name.title()}",
                boxpoints='all',
                pointpos=0,
                jitter=0.5,
                marker=dict(
                    color=dataset_colors[dataset_name],
                    size=4,
                    opacity=0.6
                ),
                line=dict(color=dataset_colors[dataset_name]),
                showlegend=False,
                hovertemplate=f"<b>{dataset_name.title()}</b><br>" +
                             "Post-tokens: %{y}<br>" +
                             "<extra></extra>"
            ), row=1, col=2)

    # 3. Pre/Post ratio patterns using violin plots for density
    for dataset_name in dataset_names:
        dataset_subset = valid_tokens[valid_tokens['Dataset'] == dataset_name]
        ratio_data = dataset_subset[dataset_subset['Pre_Post_Ratio'].notna()]

        if len(ratio_data) > 0:
            token_fig.add_trace(go.Violin(
                y=ratio_data['Pre_Post_Ratio'],
                name=f"{dataset_name.title()}",
                side='positive' if dataset_name != 'bnc' else 'negative',
                line_color=dataset_colors[dataset_name],
                fillcolor=dataset_colors[dataset_name],
                opacity=0.6,
                showlegend=False,
                hovertemplate=f"<b>{dataset_name.title()}</b><br>" +
                             "Ratio: %{y:.2f}<br>" +
                             "<extra></extra>"
            ), row=1, col=3)

    # 4. Total length distribution
    for dataset_name in dataset_names:
        dataset_subset = valid_tokens[valid_tokens['Dataset'] == dataset_name]

        if len(dataset_subset) > 0:
            token_fig.add_trace(go.Box(
                y=dataset_subset['Total_Tokens'],
                name=f"{dataset_name.title()}",
                boxpoints='outliers',  # Only show outliers for cleaner view
                marker=dict(color=dataset_colors[dataset_name]),
                line=dict(color=dataset_colors[dataset_name]),
                showlegend=False,
                hovertemplate=f"<b>{dataset_name.title()}</b><br>" +
                             "Total tokens: %{y}<br>" +
                             "<extra></extra>"
            ), row=2, col=1)

    # 5. Structural types distribution
    structure_counts = valid_tokens.groupby(['Dataset', 'structure_type']).size().reset_index(name='count')

    for structure_type in ['Front-Heavy', 'Balanced', 'Back-Heavy']:
        type_data = structure_counts[structure_counts['structure_type'] == structure_type]

        structure_colors = {'Front-Heavy': '#E67E22', 'Balanced': '#9B59B6', 'Back-Heavy': '#1ABC9C'}

        token_fig.add_trace(go.Bar(
            x=type_data['Dataset'],
            y=type_data['count'],
            name=structure_type,
            marker_color=structure_colors[structure_type],
            showlegend=(structure_type == 'Front-Heavy'),
            hovertemplate=f"<b>{structure_type}</b><br>" +
                         "Dataset: %{x}<br>" +
                         "Count: %{y}<br>" +
                         "<extra></extra>"
        ), row=2, col=2)

    # 6. Length vs complexity analysis (scatter plot)
    complexity_data = valid_tokens[valid_tokens['Syntactic_Complexity'].notna()]

    if len(complexity_data) > 0:
        for dataset_name in dataset_names:
            dataset_complexity = complexity_data[complexity_data['Dataset'] == dataset_name]

            if len(dataset_complexity) > 0:
                token_fig.add_trace(go.Scatter(
                    x=dataset_complexity['Total_Tokens'],
                    y=dataset_complexity['Syntactic_Complexity'],
                    mode='markers',
                    name=f"{dataset_name.title()}",
                    marker=dict(
                        color=dataset_colors[dataset_name],
                        size=6,
                        opacity=0.7
                    ),
                    showlegend=False,
                    hovertemplate=f"<b>{dataset_name.title()}</b><br>" +
                                 "Length: %{x} tokens<br>" +
                                 "Complexity: %{y}<br>" +
                                 "<extra></extra>"
                ), row=2, col=3)

    # Update layout with proper spacing to prevent overlap
    token_fig.update_layout(
        title={
            'text': "Structural Analysis: Token Distribution Patterns in Joyce vs BNC Similes",
            'x': 0.5,
            'font': {'size': 16}
        },
        height=900,  # Increased height to prevent overlap
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # Update axis labels for clarity
    token_fig.update_yaxes(title_text="Token Count", row=1, col=1)
    token_fig.update_yaxes(title_text="Token Count", row=1, col=2)
    token_fig.update_yaxes(title_text="Pre/Post Ratio", row=1, col=3)
    token_fig.update_yaxes(title_text="Total Tokens", row=2, col=1)
    token_fig.update_yaxes(title_text="Count", row=2, col=2)
    token_fig.update_xaxes(title_text="Dataset", row=2, col=2)
    token_fig.update_xaxes(title_text="Total Tokens", row=2, col=3)
    token_fig.update_yaxes(title_text="Syntactic Complexity", row=2, col=3)

    # Calculate and display summary statistics
    print("\nStructural Pattern Analysis:")

    for dataset_name in dataset_names:
        dataset_stats = valid_tokens[valid_tokens['Dataset'] == dataset_name]

        if len(dataset_stats) > 0:
            avg_pre = dataset_stats['Pre_Comparator_Tokens'].mean()
            avg_post = dataset_stats['Post_Comparator_Tokens'].mean()
            avg_ratio = dataset_stats['Pre_Post_Ratio'].mean()

            # Determine structural tendency
            if avg_ratio > 1.2:
                tendency = "front-heavy (longer setup)"
            elif avg_ratio < 0.8:
                tendency = "back-heavy (longer elaboration)"
            else:
                tendency = "balanced structure"

            print(f"\n{dataset_name.upper()} Dataset Analysis:")
            print(f"  Average pre-comparator: {avg_pre:.2f} tokens")
            print(f"  Average post-comparator: {avg_post:.2f} tokens")
            print(f"  Structure tendency: {tendency} (ratio: {avg_ratio:.2f})")

            # Count structural types
            structure_dist = dataset_stats['structure_type'].value_counts()
            print(f"  Distribution: {dict(structure_dist)}")

    token_fig.write_html("joyce_token_distribution_analysis.html")
    token_fig.show()

    print("\nToken distribution analysis completed successfully")
    return token_fig, valid_tokens

# Execute token distribution analysis
distribution_figure, token_analysis_data = create_token_distribution_analysis(df)


ANALYZING TOKEN DISTRIBUTION PATTERNS
---------------------------------------
Valid token data: 601 similes

Structural Pattern Analysis:

MANUAL Dataset Analysis:
  Average pre-comparator: 13.10 tokens
  Average post-comparator: 13.03 tokens
  Structure tendency: front-heavy (longer setup) (ratio: 2.46)
  Distribution: {'Front-Heavy': np.int64(75), 'Back-Heavy': np.int64(68), 'Balanced': np.int64(40)}

COMPUTATIONAL Dataset Analysis:
  Average pre-comparator: 12.12 tokens
  Average post-comparator: 11.22 tokens
  Structure tendency: front-heavy (longer setup) (ratio: 2.15)
  Distribution: {'Back-Heavy': np.int64(96), 'Front-Heavy': np.int64(74), 'Balanced': np.int64(48)}

BNC Dataset Analysis:
  Average pre-comparator: 12.03 tokens
  Average post-comparator: 9.85 tokens
  Structure tendency: front-heavy (longer setup) (ratio: 1.24)
  Distribution: {'Front-Heavy': np.int64(120), 'Balanced': np.int64(76), 'Back-Heavy': np.int64(4)}



Token distribution analysis completed successfully


In [None]:
def create_comprehensive_heatmap_analysis(linguistic_data):

    print("\nCREATING COMPREHENSIVE HEATMAP ANALYSIS")
    print("-" * 41)

    # Prepare linguistic features for heatmap analysis
    feature_columns = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Total_Tokens', 'Sentiment_Polarity', 'Sentiment_Subjectivity',
        'Syntactic_Complexity'
    ]

    # Create working dataset with valid numeric data
    heatmap_dataset = linguistic_data[
        ['Dataset', 'Category_Framework'] + feature_columns
    ].copy()

    # Remove rows with missing critical data
    analysis_data = heatmap_dataset.dropna(subset=feature_columns[:4])  # Require core metrics

    print(f"Prepared heatmap data: {len(analysis_data)} complete records")

    if len(analysis_data) == 0:
        print("Warning: Insufficient data for heatmap analysis")
        return None, None

    # Calculate aggregated metrics by dataset and category
    aggregated_metrics = analysis_data.groupby(['Dataset', 'Category_Framework']).agg({
        col: ['mean', 'std', 'count'] for col in feature_columns
    }).round(3)

    # Flatten column names for easier handling
    aggregated_metrics.columns = [f"{col[0]}_{col[1]}" for col in aggregated_metrics.columns]
    category_combinations = aggregated_metrics.reset_index()

    # Create comprehensive heatmap visualization
    heatmap_figure = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Feature Correlation Matrix (All Datasets)',
            'Dataset-Category Feature Intensity',
            'Joyce vs BNC Pattern Comparison',
            'Feature Variance Analysis'
        ),
        specs=[[{"type": "heatmap"}, {"type": "heatmap"}],
               [{"type": "heatmap"}, {"type": "heatmap"}]]
    )

    # 1. Overall feature correlation matrix
    correlation_data = analysis_data[feature_columns].corr()

    heatmap_figure.add_trace(go.Heatmap(
        z=correlation_data.values,
        x=[col.replace('_', ' ').title() for col in correlation_data.columns],
        y=[col.replace('_', ' ').title() for col in correlation_data.index],
        colorscale='RdBu_r',
        zmid=0,
        text=correlation_data.round(2).values,
        texttemplate="%{text}",
        textfont={"size": 10},
        hovertemplate="<b>%{y} vs %{x}</b><br>" +
                     "Correlation: %{z:.3f}<br>" +
                     "<extra></extra>",
        colorbar=dict(title="Correlation", x=0.48, len=0.4)
    ), row=1, col=1)

    # 2. Dataset-Category feature intensity heatmap
    # Create matrix for mean values
    intensity_matrix = []
    row_labels = []

    for _, combo_row in category_combinations.iterrows():
        dataset_name = combo_row['Dataset']
        category_name = combo_row['Category_Framework']

        # Extract mean values for each feature
        feature_means = []
        for feature in feature_columns:
            mean_col = f"{feature}_mean"
            if mean_col in combo_row and not pd.isna(combo_row[mean_col]):
                feature_means.append(combo_row[mean_col])
            else:
                feature_means.append(0)

        intensity_matrix.append(feature_means)
        row_labels.append(f"{dataset_name}_{category_name}")

    if intensity_matrix:
        intensity_array = np.array(intensity_matrix)

        # Normalize each column (feature) separately for better comparison
        feature_scaler = StandardScaler()
        normalized_intensity = feature_scaler.fit_transform(intensity_array)

        heatmap_figure.add_trace(go.Heatmap(
            z=normalized_intensity,
            x=[col.replace('_', ' ').title() for col in feature_columns],
            y=row_labels,
            colorscale='Viridis',
            text=intensity_array.round(2),
            texttemplate="%{text}",
            textfont={"size": 8},
            hovertemplate="<b>%{y}</b><br>" +
                         "%{x}: %{text}<br>" +
                         "Normalized: %{z:.2f}<br>" +
                         "<extra></extra>",
            colorbar=dict(title="Normalized Intensity", x=1.02, len=0.4)
        ), row=1, col=2)

    # 3. Joyce vs BNC comparison matrix
    joyce_patterns = analysis_data[analysis_data['Dataset'].isin(['manual', 'computational'])]
    bnc_patterns = analysis_data[analysis_data['Dataset'] == 'bnc']

    if len(joyce_patterns) > 0 and len(bnc_patterns) > 0:
        # Calculate mean differences (Joyce - BNC)
        joyce_means = joyce_patterns[feature_columns].mean()
        bnc_means = bnc_patterns[feature_columns].mean()
        difference_vector = joyce_means - bnc_means

        # Create comparison matrix (reshape for heatmap)
        comparison_matrix = difference_vector.values.reshape(1, -1)

        heatmap_figure.add_trace(go.Heatmap(
            z=comparison_matrix,
            x=[col.replace('_', ' ').title() for col in feature_columns],
            y=['Joyce - BNC Difference'],
            colorscale='RdBu_r',
            zmid=0,
            text=comparison_matrix.round(2),
            texttemplate="%{text}",
            textfont={"size": 11},
            hovertemplate="<b>%{x}</b><br>" +
                         "Joyce avg: %{customdata[0]:.2f}<br>" +
                         "BNC avg: %{customdata[1]:.2f}<br>" +
                         "Difference: %{z:.2f}<br>" +
                         "<extra></extra>",
            customdata=np.array([joyce_means.values, bnc_means.values]).T.reshape(1, -1, 2),
            colorbar=dict(title="Difference", x=0.48, y=0.15, len=0.3)
        ), row=2, col=1)

    # 4. Feature variance analysis by dataset
    variance_data = []
    dataset_list = ['manual', 'computational', 'bnc']

    for dataset_name in dataset_list:
        dataset_subset = analysis_data[analysis_data['Dataset'] == dataset_name]
        if len(dataset_subset) > 0:
            dataset_variances = dataset_subset[feature_columns].var()
            variance_data.append(dataset_variances.values)
        else:
            variance_data.append([0] * len(feature_columns))

    if variance_data:
        variance_matrix = np.array(variance_data)

        heatmap_figure.add_trace(go.Heatmap(
            z=variance_matrix,
            x=[col.replace('_', ' ').title() for col in feature_columns],
            y=[name.title() for name in dataset_list],
            colorscale='Oranges',
            text=variance_matrix.round(2),
            texttemplate="%{text}",
            textfont={"size": 10},
            hovertemplate="<b>%{y} Dataset</b><br>" +
                         "%{x} Variance: %{z:.2f}<br>" +
                         "Interpretation: %{customdata}<br>" +
                         "<extra></extra>",
            customdata=[['High Variability' if v > 10 else 'Low Variability' for v in row] for row in variance_matrix],
            colorbar=dict(title="Variance", x=1.02, y=0.15, len=0.3)
        ), row=2, col=2)

    # Configure layout with proper spacing
    heatmap_figure.update_layout(
        title={
            'text': "Multi-Dimensional Heatmap: Linguistic Pattern Analysis Across Joyce & BNC",
            'x': 0.5,
            'font': {'size': 16}
        },
        height=1000,
        showlegend=False,
        font=dict(size=10)
    )

    # Generate interpretive analysis
    print("\nHeatmap Analysis Interpretation:")

    # Correlation insights
    strongest_correlations = []
    for i in range(len(correlation_data.columns)):
        for j in range(i+1, len(correlation_data.columns)):
            corr_value = correlation_data.iloc[i, j]
            if abs(corr_value) > 0.5:  # Strong correlation threshold
                feature_pair = (correlation_data.columns[i], correlation_data.columns[j])
                strongest_correlations.append((feature_pair, corr_value))

    if strongest_correlations:
        print("\nStrongest Feature Correlations:")
        for (feat1, feat2), correlation in sorted(strongest_correlations, key=lambda x: abs(x[1]), reverse=True)[:3]:
            direction = "positive" if correlation > 0 else "negative"
            print(f"  {feat1} ↔ {feat2}: {correlation:.3f} ({direction})")

    # Joyce vs BNC differences
    if len(joyce_patterns) > 0 and len(bnc_patterns) > 0:
        print(f"\nJoyce vs BNC Pattern Differences:")
        for feature in feature_columns:
            joyce_avg = joyce_means[feature]
            bnc_avg = bnc_means[feature]
            difference = joyce_avg - bnc_avg

            if abs(difference) > 0.5:  # Meaningful difference threshold
                direction = "higher" if difference > 0 else "lower"
                print(f"  {feature}: Joyce {direction} by {abs(difference):.2f}")

    heatmap_figure.write_html("joyce_comprehensive_heatmap.html")
    print("HTML file saved: joyce_comprehensive_heatmap.html")
    heatmap_figure.show()

    print("\nMulti-dimensional heatmap analysis completed successfully")
    return heatmap_figure, category_combinations

# Execute comprehensive heatmap analysis
heatmap_results, pattern_metrics = create_comprehensive_heatmap_analysis(df)


CREATING COMPREHENSIVE HEATMAP ANALYSIS
-----------------------------------------
Prepared heatmap data: 601 complete records

Heatmap Analysis Interpretation:

Strongest Feature Correlations:
  Post_Comparator_Tokens ↔ Total_Tokens: 0.849 (positive)
  Pre_Comparator_Tokens ↔ Total_Tokens: 0.632 (positive)
  Total_Tokens ↔ Syntactic_Complexity: 0.562 (positive)

Joyce vs BNC Pattern Differences:
  Pre_Comparator_Tokens: Joyce higher by 0.54
  Post_Comparator_Tokens: Joyce higher by 2.20
  Pre_Post_Ratio: Joyce higher by 1.06
  Total_Tokens: Joyce higher by 2.81
HTML file saved: joyce_comprehensive_heatmap.html



Multi-dimensional heatmap analysis completed successfully


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

# Load the dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_correlation_heatmap(data):
    """
    Creates a clean correlation matrix heatmap for linguistic features.
    """
    print("Creating correlation matrix heatmap...")

    # Define the linguistic features to analyze
    feature_columns = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Total_Tokens', 'Sentiment_Polarity', 'Sentiment_Subjectivity',
        'Syntactic_Complexity'
    ]

    # Create shorter, cleaner labels
    feature_labels = [
        'Pre-Comp\nTokens', 'Post-Comp\nTokens', 'Token\nRatio',
        'Total\nLength', 'Sentiment\nPolarity', 'Sentiment\nSubjectivity',
        'Syntactic\nComplexity'
    ]

    # Filter data to only include rows with complete feature data
    clean_data = data[feature_columns].dropna()
    print(f"Using {len(clean_data)} complete records for correlation analysis")

    if len(clean_data) < 2:
        print("Error: Need at least 2 complete records for correlation")
        return None

    # Calculate correlation matrix
    correlation_matrix = clean_data.corr()

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=correlation_matrix.values,
        x=feature_labels,
        y=feature_labels,
        colorscale='RdBu_r',
        zmid=0,
        zmin=-1,
        zmax=1,
        text=np.round(correlation_matrix.values, 3),
        texttemplate="%{text}",
        textfont={"size": 12, "color": "black"},
        hovertemplate="<b>%{y} × %{x}</b><br>" +
                     "Correlation: %{z:.3f}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="Correlation<br>Coefficient",
            titleside="right",
            thickness=20,
            len=0.8
        )
    ))

    # Configure layout
    fig.update_layout(
        title={
            'text': "Feature Correlation Matrix: Linguistic Analysis",
            'x': 0.5,
            'font': {'size': 18, 'family': 'Arial'}
        },
        width=800,
        height=700,
        font=dict(size=12, family='Arial'),
        margin=dict(l=80, r=120, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    # Style the axes
    fig.update_xaxes(tickangle=0, side='bottom')
    fig.update_yaxes(tickangle=0)

    return fig, correlation_matrix

# Generate the correlation heatmap
correlation_fig, corr_matrix = create_correlation_heatmap(corpus_dataset)

if correlation_fig is not None:
    # Save the visualization
    correlation_fig.write_html("correlation_heatmap.html")
    print("✓ Correlation heatmap saved as: correlation_heatmap.html")

    # Show the plot
    correlation_fig.show()

    # Print key findings
    print("\nKey Correlation Findings:")
    print("-" * 30)

    # Find strongest correlations (excluding diagonal)
    correlations = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_val = corr_matrix.iloc[i, j]
            feat1 = corr_matrix.columns[i]
            feat2 = corr_matrix.columns[j]
            correlations.append((feat1, feat2, corr_val))

    # Sort by absolute correlation value
    correlations.sort(key=lambda x: abs(x[2]), reverse=True)

    # Show top 5 correlations
    for feat1, feat2, corr_val in correlations[:5]:
        direction = "positive" if corr_val > 0 else "negative"
        strength = "strong" if abs(corr_val) > 0.7 else "moderate" if abs(corr_val) > 0.4 else "weak"
        print(f"• {feat1} ↔ {feat2}: r = {corr_val:.3f} ({strength} {direction})")

    print(f"\nAnalysis complete. HTML file ready for download.")

else:
    print("Failed to generate correlation heatmap.")

Dataset loaded: 612 records
Creating correlation matrix heatmap...
Using 602 complete records for correlation analysis
✓ Correlation heatmap saved as: correlation_heatmap.html



Key Correlation Findings:
------------------------------
• Post_Comparator_Tokens ↔ Total_Tokens: r = 0.849 (strong positive)
• Pre_Comparator_Tokens ↔ Total_Tokens: r = 0.632 (moderate positive)
• Total_Tokens ↔ Syntactic_Complexity: r = 0.562 (moderate positive)
• Post_Comparator_Tokens ↔ Syntactic_Complexity: r = 0.449 (moderate positive)
• Pre_Comparator_Tokens ↔ Syntactic_Complexity: r = 0.395 (weak positive)

Analysis complete. HTML file ready for download.


In [None]:
def generate_linguistic_heatmap_visualization(corpus_data):
    """
    Creates multi-panel heatmap visualization to explore relationships between
    linguistic features across different text corpora and categorization schemes.

    Returns interactive plotly figure with correlation analysis, feature intensity
    mapping, comparative analysis, and variance exploration.
    """
    print("\nGenerating comprehensive linguistic feature heatmaps...")
    print("=" * 55)

    # Define core linguistic measurement variables
    linguistic_features = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Total_Tokens', 'Sentiment_Polarity', 'Sentiment_Subjectivity',
        'Syntactic_Complexity'
    ]

    # Create readable display labels for visualization
    display_labels = [
        'Pre-Comp Tokens', 'Post-Comp Tokens', 'Token Ratio',
        'Total Length', 'Sentiment Pol.', 'Sentiment Sub.',
        'Syntax Score'
    ]

    # Prepare clean dataset for analysis - map to correct column names
    # Use 'Dataset_Source' or 'Original_Dataset' as the dataset identifier
    dataset_col = 'Dataset_Source' if 'Dataset_Source' in corpus_data.columns else 'Original_Dataset'

    analysis_df = corpus_data[
        [dataset_col, 'Category_Framework'] + linguistic_features
    ].copy()

    # Rename the dataset column for consistency in the rest of the code
    analysis_df = analysis_df.rename(columns={dataset_col: 'Dataset'})

    # Filter out incomplete records - keep only rows with core measurements
    complete_records = analysis_df.dropna(subset=linguistic_features[:4])

    print(f"Processing {len(complete_records)} complete linguistic records")

    if len(complete_records) == 0:
        print("Error: No complete records found for heatmap generation")
        return None, None

    # Calculate summary statistics by corpus and category
    grouped_stats = complete_records.groupby(['Dataset', 'Category_Framework']).agg({
        feature: ['mean', 'std', 'count'] for feature in linguistic_features
    }).round(3)

    # Flatten the multi-level column structure
    grouped_stats.columns = [f"{metric}_{stat}" for metric, stat in grouped_stats.columns]
    summary_table = grouped_stats.reset_index()

    # Initialize multi-panel figure with appropriate subplot configuration
    multi_panel_fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Inter-Feature Correlation Matrix',
            'Corpus-Category Feature Intensity',
            'Literary vs Reference Corpus Contrast',
            'Feature Variability by Corpus Type'
        ),
        specs=[[{"type": "heatmap"}, {"type": "heatmap"}],
               [{"type": "heatmap"}, {"type": "heatmap"}]],
        horizontal_spacing=0.18,
        vertical_spacing=0.15
    )

    # Panel 1: Calculate and display feature correlation matrix
    correlation_matrix = complete_records[linguistic_features].corr()

    multi_panel_fig.add_trace(go.Heatmap(
        z=correlation_matrix.values,
        x=display_labels,
        y=display_labels,
        colorscale='RdBu_r',
        zmid=0,
        text=correlation_matrix.round(3).values,
        texttemplate="%{text}",
        textfont={"size": 9},
        hovertemplate="<b>%{y} × %{x}</b><br>" +
                     "Correlation: %{z:.3f}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="r-value",
            x=0.40,
            y=0.78,
            len=0.35,
            thickness=12
        )
    ), row=1, col=1)

    # Panel 2: Feature intensity across corpus-category combinations
    intensity_data = []
    combination_labels = []

    for _, record in summary_table.iterrows():
        corpus_name = record['Dataset'].title()[:5]  # Truncate for display
        category_type = record['Category_Framework']

        # Shorten long category names for better display
        if len(category_type) > 12:
            category_type = category_type[:9] + "..."

        # Extract mean values for each linguistic feature
        mean_values = []
        for feature in linguistic_features:
            mean_column = f"{feature}_mean"
            if mean_column in record and not pd.isna(record[mean_column]):
                mean_values.append(record[mean_column])
            else:
                mean_values.append(0)

        intensity_data.append(mean_values)
        combination_labels.append(f"{corpus_name}_{category_type}")

    if intensity_data:
        raw_intensity = np.array(intensity_data)

        # Apply z-score normalization for cross-feature comparison
        from sklearn.preprocessing import StandardScaler
        normalizer = StandardScaler()
        normalized_intensity = normalizer.fit_transform(raw_intensity)

        multi_panel_fig.add_trace(go.Heatmap(
            z=normalized_intensity,
            x=display_labels,
            y=combination_labels,
            colorscale='Plasma',
            text=raw_intensity.round(2),
            texttemplate="%{text}",
            textfont={"size": 6},
            hovertemplate="<b>%{y}</b><br>" +
                         "%{x}: %{text}<br>" +
                         "Z-score: %{z:.2f}<br>" +
                         "<extra></extra>",
            colorbar=dict(
                title="Z-score",
                x=1.05,
                y=0.78,
                len=0.35,
                thickness=12
            )
        ), row=1, col=2)

    # Panel 3: Direct comparison between literary and reference corpora
    literary_corpus = complete_records[complete_records['Dataset'].isin(['manual', 'computational'])]
    reference_corpus = complete_records[complete_records['Dataset'] == 'bnc']

    if len(literary_corpus) > 0 and len(reference_corpus) > 0:
        # Calculate mean differences between corpus types
        literary_means = literary_corpus[linguistic_features].mean()
        reference_means = reference_corpus[linguistic_features].mean()
        difference_scores = literary_means - reference_means

        # Reshape for heatmap display
        comparison_array = difference_scores.values.reshape(1, -1)

        multi_panel_fig.add_trace(go.Heatmap(
            z=comparison_array,
            x=display_labels,
            y=['Literary - Reference'],
            colorscale='RdBu_r',
            zmid=0,
            text=comparison_array.round(2),
            texttemplate="%{text}",
            textfont={"size": 11},
            hovertemplate="<b>%{x}</b><br>" +
                         "Literary mean: %{customdata[0]:.2f}<br>" +
                         "Reference mean: %{customdata[1]:.2f}<br>" +
                         "Difference: %{z:.2f}<br>" +
                         "<extra></extra>",
            customdata=np.array([literary_means.values, reference_means.values]).T.reshape(1, -1, 2),
            colorbar=dict(
                title="Δ Score",
                x=0.40,
                y=0.22,
                len=0.35,
                thickness=12
            )
        ), row=2, col=1)

    # Panel 4: Variance analysis across different corpus types
    variance_matrix = []
    corpus_types = ['Manual', 'Computational', 'Reference']
    corpus_keys = ['manual', 'computational', 'bnc']

    for corpus_key in corpus_keys:
        corpus_subset = complete_records[complete_records['Dataset'] == corpus_key]
        if len(corpus_subset) > 0:
            feature_variances = corpus_subset[linguistic_features].var()
            variance_matrix.append(feature_variances.values)
        else:
            variance_matrix.append([0] * len(linguistic_features))

    if variance_matrix:
        variance_array = np.array(variance_matrix)

        multi_panel_fig.add_trace(go.Heatmap(
            z=variance_array,
            x=display_labels,
            y=corpus_types,
            colorscale='YlOrRd',
            text=variance_array.round(2),
            texttemplate="%{text}",
            textfont={"size": 10},
            hovertemplate="<b>%{y} Corpus</b><br>" +
                         "%{x} Variance: %{z:.2f}<br>" +
                         "Variability: %{customdata}<br>" +
                         "<extra></extra>",
            customdata=[['High' if v > 10 else 'Moderate' if v > 5 else 'Low' for v in row] for row in variance_array],
            colorbar=dict(
                title="Variance",
                x=1.05,
                y=0.22,
                len=0.35,
                thickness=12
            )
        ), row=2, col=2)

    # Configure overall figure layout and styling
    multi_panel_fig.update_layout(
        title={
            'text': "Linguistic Feature Analysis: Multi-Dimensional Heatmap Exploration",
            'x': 0.5,
            'font': {'size': 20, 'family': 'Times New Roman'},
            'y': 0.97
        },
        height=1200,
        width=1500,
        showlegend=False,
        font=dict(size=12, family='Times New Roman'),
        margin=dict(l=90, r=90, t=130, b=90),
        plot_bgcolor='#fafafa',
        paper_bgcolor='white'
    )

    # Style subplot titles with academic formatting
    multi_panel_fig.update_annotations(
        font_size=15,
        font_family="Times New Roman",
        font_color="#2c3e50"
    )

    # Configure axis properties to prevent label overlap
    for row_idx in range(1, 3):
        for col_idx in range(1, 3):
            multi_panel_fig.update_xaxes(
                tickangle=35,
                tickfont=dict(size=10),
                row=row_idx, col=col_idx
            )
            multi_panel_fig.update_yaxes(
                tickfont=dict(size=10),
                row=row_idx, col=col_idx
            )

    # Special formatting for the intensity panel with potentially long labels
    multi_panel_fig.update_yaxes(
        tickfont=dict(size=8),
        row=1, col=2
    )

    # Generate analytical insights from the visualization
    print("\nKey findings from heatmap analysis:")

    # Identify strongest correlations between features
    significant_correlations = []
    for i in range(len(correlation_matrix.columns)):
        for j in range(i+1, len(correlation_matrix.columns)):
            correlation_value = correlation_matrix.iloc[i, j]
            if abs(correlation_value) > 0.4:  # Threshold for meaningful correlation
                feature_pair = (display_labels[i], display_labels[j])
                significant_correlations.append((feature_pair, correlation_value))

    if significant_correlations:
        print("\nNotable feature correlations identified:")
        for (feat_a, feat_b), corr_val in sorted(significant_correlations,
                                                 key=lambda x: abs(x[1]), reverse=True)[:4]:
            relationship = "positive" if corr_val > 0 else "negative"
            print(f"  • {feat_a} ↔ {feat_b}: r = {corr_val:.3f} ({relationship})")

    # Analyze differences between literary and reference texts
    if len(literary_corpus) > 0 and len(reference_corpus) > 0:
        print(f"\nDistinctive patterns in literary vs reference texts:")
        for i, feature in enumerate(linguistic_features):
            lit_mean = literary_means[feature]
            ref_mean = reference_means[feature]
            difference = lit_mean - ref_mean

            if abs(difference) > 0.8:  # Threshold for notable difference
                trend = "elevated" if difference > 0 else "reduced"
                print(f"  • {display_labels[i]}: {trend} in literary texts (Δ = {difference:.2f})")

    # Save visualization with descriptive filename
    output_filename = "linguistic_feature_heatmap_analysis.html"
    multi_panel_fig.write_html(output_filename)
    print(f"\nVisualization saved as: {output_filename}")
    multi_panel_fig.show()

    print("Multi-dimensional heatmap analysis completed.")
    return multi_panel_fig, summary_table

def create_single_feature_heatmap(data_matrix, feature_names, title_text, color_scheme='Viridis'):
    """
    Generates a focused heatmap for detailed examination of specific feature relationships.
    Useful for creating publication-ready figures with clean formatting.
    """
    if len(feature_names) != data_matrix.shape[1]:
        feature_names = [f"Feature_{i+1}" for i in range(data_matrix.shape[1])]

    focused_fig = go.Figure(data=go.Heatmap(
        z=data_matrix,
        x=feature_names,
        y=feature_names if data_matrix.shape[0] == data_matrix.shape[1] else [f"Obs_{i+1}" for i in range(data_matrix.shape[0])],
        colorscale=color_scheme,
        text=np.round(data_matrix, 3),
        texttemplate="%{text}",
        textfont={"size": 13},
        hovertemplate="<b>%{y} × %{x}</b><br>Value: %{z:.4f}<extra></extra>"
    ))

    focused_fig.update_layout(
        title={
            'text': title_text,
            'x': 0.5,
            'font': {'size': 18, 'family': 'Times New Roman'}
        },
        width=900,
        height=700,
        xaxis_tickangle=40,
        font=dict(size=13, family='Times New Roman'),
        margin=dict(l=120, r=120, t=120, b=120),
        plot_bgcolor='white'
    )

    return focused_fig

# Load and prepare the dataset
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler

# Load the linguistic analysis dataset
try:
    corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
    print(f"Dataset loaded successfully: {len(corpus_dataset)} records")
    print(f"Columns available: {list(corpus_dataset.columns)}")
except FileNotFoundError:
    print("ERROR: File '/comprehensive_linguistic_analysis.csv' not found")
    print("Please ensure the file is in the correct path")
    corpus_dataset = None

# Execute the heatmap analysis
if __name__ == "__main__" and corpus_dataset is not None:
    # Run the comprehensive heatmap analysis
    try:
        visualization_results, feature_summary = generate_linguistic_heatmap_visualization(corpus_dataset)

        if visualization_results is not None:
            print("\n" + "="*60)
            print("HEATMAP ANALYSIS COMPLETED SUCCESSFULLY")
            print("="*60)
            print(f"✓ Interactive visualization generated")
            print(f"✓ HTML file saved for download")
            print(f"✓ Feature summary table created")
            print(f"✓ Statistical analysis completed")

            # Display the summary statistics
            if feature_summary is not None and len(feature_summary) > 0:
                print(f"\nSummary: Analyzed {len(feature_summary)} corpus-category combinations")
                print("\nDataset distribution:")
                dataset_counts = feature_summary['Dataset'].value_counts()
                for dataset, count in dataset_counts.items():
                    print(f"  - {dataset.title()}: {count} combinations")
        else:
            print("ERROR: Failed to generate heatmap visualization")

    except Exception as e:
        print(f"ERROR: Analysis failed - {str(e)}")
        print("Please check that 'simile_dataset' variable exists and contains the required columns")

Dataset loaded successfully: 612 records
Columns available: ['ID', 'Story', 'Page_Number', 'Sentence_Context', 'Comparator_Type', 'Category_Framework', 'Additional Notes', 'CLAWS', 'Dataset_Source', 'Total_Tokens', 'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio', 'Lemmatized_Text', 'POS_Tags', 'POS_Distribution', 'Sentiment_Polarity', 'Sentiment_Subjectivity', 'Comparative_Structure', 'Syntactic_Complexity', 'Topic_Label', 'Original_Dataset', 'Page No.', 'Confidence_Score', 'Extraction_Method', 'Index', 'Left', 'Node', 'Right', 'Genre']

Generating comprehensive linguistic feature heatmaps...
Processing 602 complete linguistic records

Key findings from heatmap analysis:

Notable feature correlations identified:
  • Post-Comp Tokens ↔ Total Length: r = 0.849 (positive)
  • Pre-Comp Tokens ↔ Total Length: r = 0.632 (positive)
  • Total Length ↔ Syntax Score: r = 0.562 (positive)
  • Post-Comp Tokens ↔ Syntax Score: r = 0.449 (positive)

Visualization saved as: lingui

Multi-dimensional heatmap analysis completed.

HEATMAP ANALYSIS COMPLETED SUCCESSFULLY
✓ Interactive visualization generated
✓ HTML file saved for download
✓ Feature summary table created
✓ Statistical analysis completed

Summary: Analyzed 11 corpus-category combinations

Dataset distribution:
  - Computational_Extraction: 5 combinations
  - Manual_Annotation: 5 combinations
  - Bnc_Baseline: 1 combinations


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Load the dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_simple_comparison_heatmap(data):
    """
    Creates a simple, interpretable comparison of datasets using percentage differences.
    """
    print("Creating simple dataset comparison...")

    # Focus on meaningful features that are on similar scales
    feature_columns = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Sentiment_Polarity', 'Sentiment_Subjectivity', 'Syntactic_Complexity'
    ]

    feature_labels = [
        'Pre-Comparator\nTokens', 'Post-Comparator\nTokens', 'Pre/Post\nRatio',
        'Sentiment\nPolarity', 'Sentiment\nSubjectivity', 'Syntactic\nComplexity'
    ]

    # Use the correct dataset column
    dataset_col = 'Dataset_Source' if 'Dataset_Source' in data.columns else 'Original_Dataset'

    # Clean the data
    analysis_data = data[[dataset_col] + feature_columns].dropna()
    print(f"Using {len(analysis_data)} complete records")

    # Calculate means by dataset
    dataset_means = analysis_data.groupby(dataset_col)[feature_columns].mean()

    # Create simple dataset labels
    dataset_labels = []
    for dataset in dataset_means.index:
        if 'manual' in str(dataset).lower():
            dataset_labels.append('Manual')
        elif 'computational' in str(dataset).lower():
            dataset_labels.append('Computational')
        elif 'bnc' in str(dataset).lower():
            dataset_labels.append('BNC Reference')
        else:
            dataset_labels.append(str(dataset)[:15])

    # Calculate percentage differences from overall mean
    overall_means = analysis_data[feature_columns].mean()
    percentage_diff = ((dataset_means - overall_means) / overall_means * 100)

    # Create the heatmap with percentage differences
    fig = go.Figure(data=go.Heatmap(
        z=percentage_diff.values,
        x=feature_labels,
        y=dataset_labels,
        colorscale='RdBu_r',
        zmid=0,
        zmin=-50,
        zmax=50,
        text=np.round(percentage_diff.values, 1),
        texttemplate="%{text}%",
        textfont={"size": 11, "color": "black"},
        hovertemplate="<b>%{y}</b><br>" +
                     "%{x}<br>" +
                     "Difference from average: %{z:.1f}%<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="% Difference<br>from Average",
            titleside="right",
            thickness=20,
            len=0.8,
            ticksuffix="%"
        )
    ))

    # Configure layout
    fig.update_layout(
        title={
            'text': "Dataset Comparison: How Each Source Differs from Average",
            'x': 0.5,
            'font': {'size': 16, 'family': 'Arial'}
        },
        width=900,
        height=500,
        font=dict(size=12, family='Arial'),
        margin=dict(l=100, r=120, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    # Style the axes
    fig.update_xaxes(tickangle=0)
    fig.update_yaxes(tickangle=0)

    return fig, dataset_means, percentage_diff

# Generate the simple comparison heatmap
comparison_fig, raw_means, percent_diffs = create_simple_comparison_heatmap(corpus_dataset)

if comparison_fig is not None:
    # Save the visualization
    comparison_fig.write_html("simple_dataset_comparison.html")
    print("✓ Simple comparison heatmap saved as: simple_dataset_comparison.html")

    # Show the plot
    comparison_fig.show()

    # Print clear interpretation
    print("\nDataset Comparison Results:")
    print("=" * 35)
    print("(Showing % difference from overall average)")
    print()

    for i, dataset in enumerate(percent_diffs.index):
        print(f"{dataset.upper()}:")
        dataset_row = percent_diffs.iloc[i]

        # Show biggest differences
        biggest_positive = dataset_row.idxmax()
        biggest_negative = dataset_row.idxmin()

        print(f"  • Highest feature: {biggest_positive} (+{dataset_row[biggest_positive]:.1f}%)")
        print(f"  • Lowest feature: {biggest_negative} ({dataset_row[biggest_negative]:.1f}%)")

        # Show any notable patterns
        high_features = dataset_row[dataset_row > 10].index.tolist()
        low_features = dataset_row[dataset_row < -10].index.tolist()

        if high_features:
            print(f"  • Well above average: {', '.join(high_features)}")
        if low_features:
            print(f"  • Well below average: {', '.join(low_features)}")
        print()

    print("Interpretation Guide:")
    print("• Red = Above average for that feature")
    print("• Blue = Below average for that feature")
    print("• White = Close to average")

else:
    print("Failed to generate comparison heatmap.")

Dataset loaded: 612 records
Creating simple dataset comparison...
Using 602 complete records
✓ Simple comparison heatmap saved as: simple_dataset_comparison.html



Dataset Comparison Results:
(Showing % difference from overall average)

BNC_BASELINE:
  • Highest feature: Sentiment_Polarity (+101.2%)
  • Lowest feature: Pre_Post_Ratio (-36.2%)
  • Well above average: Sentiment_Polarity
  • Well below average: Post_Comparator_Tokens, Pre_Post_Ratio

COMPUTATIONAL_EXTRACTION:
  • Highest feature: Pre_Post_Ratio (+11.0%)
  • Lowest feature: Sentiment_Polarity (-41.7%)
  • Well above average: Pre_Post_Ratio
  • Well below average: Sentiment_Polarity

MANUAL_ANNOTATION:
  • Highest feature: Pre_Post_Ratio (+26.4%)
  • Lowest feature: Sentiment_Polarity (-60.6%)
  • Well above average: Post_Comparator_Tokens, Pre_Post_Ratio
  • Well below average: Sentiment_Polarity

Interpretation Guide:
• Red = Above average for that feature
• Blue = Below average for that feature
• White = Close to average


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

# Load the dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_simile_distribution_heatmap(data):
    """
    Creates a heatmap showing the distribution of similes across pages and stories in Dubliners.
    """
    print("Creating simile distribution heatmap...")

    # Clean up page numbers - handle different column names
    page_col = None
    for col in ['Page_Number', 'Page No.', 'Page_No', 'page_number']:
        if col in data.columns:
            page_col = col
            break

    if page_col is None:
        print("Error: No page number column found")
        return None

    # Clean the data
    clean_data = data[['Story', page_col]].copy()

    # Convert page numbers to numeric, handling any text
    clean_data['Page_Numeric'] = pd.to_numeric(clean_data[page_col], errors='coerce')

    # Remove rows with missing page numbers or stories
    clean_data = clean_data.dropna(subset=['Story', 'Page_Numeric'])
    clean_data['Page_Numeric'] = clean_data['Page_Numeric'].astype(int)

    print(f"Analyzing {len(clean_data)} similes with valid page numbers")
    print(f"Page range: {clean_data['Page_Numeric'].min()} to {clean_data['Page_Numeric'].max()}")
    print(f"Stories: {clean_data['Story'].unique()}")

    # Count similes per page per story
    simile_counts = clean_data.groupby(['Story', 'Page_Numeric']).size().reset_index(name='Simile_Count')

    # Create a complete grid of all stories and all pages
    all_stories = sorted(clean_data['Story'].unique())
    min_page = clean_data['Page_Numeric'].min()
    max_page = clean_data['Page_Numeric'].max()
    all_pages = list(range(min_page, max_page + 1))

    # Create the full grid
    story_page_grid = []
    for story in all_stories:
        for page in all_pages:
            story_page_grid.append({'Story': story, 'Page_Numeric': page})

    full_grid = pd.DataFrame(story_page_grid)

    # Merge with actual counts (filling missing with 0)
    heatmap_data = full_grid.merge(simile_counts, on=['Story', 'Page_Numeric'], how='left')
    heatmap_data['Simile_Count'] = heatmap_data['Simile_Count'].fillna(0)

    # Pivot to create matrix for heatmap
    heatmap_matrix = heatmap_data.pivot(index='Story', columns='Page_Numeric', values='Simile_Count')

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_matrix.values,
        x=heatmap_matrix.columns,
        y=heatmap_matrix.index,
        colorscale='YlOrRd',
        text=heatmap_matrix.values.astype(int),
        texttemplate="%{text}",
        textfont={"size": 8},
        hovertemplate="<b>%{y}</b><br>" +
                     "Page %{x}<br>" +
                     "Similes: %{z}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="Number of<br>Similes",
            titleside="right",
            thickness=20,
            len=0.8
        )
    ))

    # Configure layout
    fig.update_layout(
        title={
            'text': "Simile Distribution Across Dubliners: By Story and Page",
            'x': 0.5,
            'font': {'size': 16, 'family': 'Arial'}
        },
        width=1200,
        height=600,
        font=dict(size=10, family='Arial'),
        margin=dict(l=150, r=120, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white',
        xaxis_title="Page Number",
        yaxis_title="Story"
    )

    # Style the axes
    fig.update_xaxes(tickangle=0, dtick=5)  # Show every 5th page
    fig.update_yaxes(tickangle=0)

    return fig, heatmap_data, simile_counts

def create_story_density_chart(simile_counts):
    """
    Creates a supplementary chart showing simile density by story.
    """
    story_totals = simile_counts.groupby('Story')['Simile_Count'].sum().reset_index()
    story_totals = story_totals.sort_values('Simile_Count', ascending=False)

    fig = px.bar(
        story_totals,
        x='Simile_Count',
        y='Story',
        orientation='h',
        title="Total Similes by Story",
        labels={'Simile_Count': 'Number of Similes', 'Story': 'Story'},
        color='Simile_Count',
        color_continuous_scale='YlOrRd'
    )

    fig.update_layout(
        height=400,
        width=800,
        font=dict(size=11, family='Arial'),
        showlegend=False
    )

    return fig, story_totals

# Generate the distribution heatmap
distribution_fig, grid_data, count_data = create_simile_distribution_heatmap(corpus_dataset)

if distribution_fig is not None:
    # Save the main heatmap
    distribution_fig.write_html("simile_distribution_heatmap.html")
    print("✓ Distribution heatmap saved as: simile_distribution_heatmap.html")

    # Show the plot
    distribution_fig.show()

    # Create supplementary density chart
    density_fig, story_stats = create_story_density_chart(count_data)
    density_fig.write_html("story_simile_density.html")
    print("✓ Story density chart saved as: story_simile_density.html")
    density_fig.show()

    # Print analysis
    print("\nSimile Distribution Analysis:")
    print("=" * 35)

    # Overall statistics
    total_similes = count_data['Simile_Count'].sum()
    pages_with_similes = len(count_data[count_data['Simile_Count'] > 0])
    max_similes_page = count_data.loc[count_data['Simile_Count'].idxmax()]

    print(f"Total similes analyzed: {total_similes}")
    print(f"Pages with similes: {pages_with_similes}")
    print(f"Highest concentration: {max_similes_page['Simile_Count']} similes on page {max_similes_page['Page_Numeric']} ({max_similes_page['Story']})")

    # Story rankings
    print(f"\nStory Rankings (by total similes):")
    for i, row in story_stats.iterrows():
        print(f"  {i+1}. {row['Story']}: {row['Simile_Count']} similes")

    # Page clustering analysis
    high_density_pages = count_data[count_data['Simile_Count'] >= 3]
    if len(high_density_pages) > 0:
        print(f"\nHigh-density pages (3+ similes):")
        for _, page in high_density_pages.iterrows():
            print(f"  Page {page['Page_Numeric']} ({page['Story']}): {page['Simile_Count']} similes")

    print(f"\nHTML files ready for download and further analysis.")

else:
    print("Failed to generate distribution heatmap.")

Dataset loaded: 612 records
Creating simile distribution heatmap...
Analyzing 172 similes with valid page numbers
Page range: 7 to 256
Stories: ['The Sisters' 'An Encounter' 'Araby' 'Eveline' 'After The Race'
 'Two Gallants' 'The Boarding House' 'A Little Cloud' 'Counterparts'
 'Clay' 'A Painful Case' 'Ive Day In The Committee Room' 'A Mother'
 'Grace' 'The Dead']
✓ Distribution heatmap saved as: simile_distribution_heatmap.html


✓ Story density chart saved as: story_simile_density.html



Simile Distribution Analysis:
Total similes analyzed: 172
Pages with similes: 126
Highest concentration: 4 similes on page 229 (The Dead)

Story Rankings (by total similes):
  13. The Dead: 45 similes
  10. Grace: 19 similes
  1. A Little Cloud: 18 similes
  14. The Sisters: 17 similes
  11. Ive Day In The Committee Room: 11 similes
  15. Two Gallants: 11 similes
  5. An Encounter: 10 similes
  8. Counterparts: 9 similes
  2. A Mother: 6 similes
  9. Eveline: 6 similes
  3. A Painful Case: 5 similes
  6. Araby: 5 similes
  12. The Boarding House: 4 similes
  7. Clay: 3 similes
  4. After The Race: 3 similes

High-density pages (3+ similes):
  Page 83 (A Little Cloud): 3 similes
  Page 25 (An Encounter): 3 similes
  Page 98 (Counterparts): 3 similes
  Page 70 (The Boarding House): 3 similes
  Page 229 (The Dead): 4 similes
  Page 244 (The Dead): 3 similes
  Page 252 (The Dead): 3 similes
  Page 7 (The Sisters): 3 similes
  Page 17 (The Sisters): 3 similes
  Page 60 (Two Gallants): 3 si

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Load the dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_feature_variability_heatmap(data):
    """
    Creates a heatmap showing how much each linguistic feature varies within each corpus type.
    Higher values indicate more variability/inconsistency within that corpus.
    """
    print("Creating feature variability heatmap...")

    # Define linguistic features to analyze
    feature_columns = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Total_Tokens', 'Sentiment_Polarity', 'Sentiment_Subjectivity',
        'Syntactic_Complexity'
    ]

    feature_labels = [
        'Pre-Comp\nTokens', 'Post-Comp\nTokens', 'Token\nRatio',
        'Total\nTokens', 'Sentiment\nPolarity', 'Sentiment\nSubjectivity',
        'Syntactic\nComplexity'
    ]

    # Use the correct dataset column
    dataset_col = 'Dataset_Source' if 'Dataset_Source' in data.columns else 'Original_Dataset'

    # Clean the data
    analysis_data = data[[dataset_col] + feature_columns].dropna()
    print(f"Using {len(analysis_data)} complete records")

    # Calculate standard deviation (variability) for each feature by corpus
    corpus_variability = analysis_data.groupby(dataset_col)[feature_columns].std()

    # Create cleaner corpus labels
    corpus_labels = []
    for corpus in corpus_variability.index:
        if 'manual' in str(corpus).lower():
            corpus_labels.append('Manual\nExtraction')
        elif 'computational' in str(corpus).lower():
            corpus_labels.append('Computational\nExtraction')
        elif 'bnc' in str(corpus).lower():
            corpus_labels.append('BNC\nReference')
        else:
            corpus_labels.append(str(corpus)[:12])

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=corpus_variability.values,
        x=feature_labels,
        y=corpus_labels,
        colorscale='Reds',
        text=np.round(corpus_variability.values, 2),
        texttemplate="%{text}",
        textfont={"size": 10, "color": "black"},
        hovertemplate="<b>%{y}</b><br>" +
                     "%{x}<br>" +
                     "Standard Deviation: %{z:.3f}<br>" +
                     "Interpretation: %{customdata}<br>" +
                     "<extra></extra>",
        # Add interpretation in customdata
        customdata=np.where(corpus_variability.values > corpus_variability.values.mean(),
                           'High Variability', 'Low Variability'),
        colorbar=dict(
            title="Standard<br>Deviation<br>(Variability)",
            titleside="right",
            thickness=20,
            len=0.8
        )
    ))

    # Configure layout
    fig.update_layout(
        title={
            'text': "Feature Variability by Corpus Type: Internal Consistency Analysis",
            'x': 0.5,
            'font': {'size': 16, 'family': 'Arial'}
        },
        width=900,
        height=500,
        font=dict(size=12, family='Arial'),
        margin=dict(l=120, r=120, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    # Style the axes
    fig.update_xaxes(tickangle=0)
    fig.update_yaxes(tickangle=0)

    return fig, corpus_variability

# Generate the variability heatmap
variability_fig, variability_data = create_feature_variability_heatmap(corpus_dataset)

if variability_fig is not None:
    # Save the visualization
    variability_fig.write_html("feature_variability_heatmap.html")
    print("✓ Feature variability heatmap saved as: feature_variability_heatmap.html")

    # Show the plot
    variability_fig.show()

    # Print detailed analysis
    print("\nFeature Variability Analysis:")
    print("=" * 35)
    print("(Higher values = more inconsistent/variable within corpus)")
    print()

    for i, corpus in enumerate(variability_data.index):
        print(f"{corpus.upper()}:")
        corpus_row = variability_data.iloc[i]

        # Find most and least variable features
        most_variable = corpus_row.idxmax()
        least_variable = corpus_row.idxmin()

        print(f"  • Most variable: {most_variable} (SD = {corpus_row[most_variable]:.3f})")
        print(f"  • Most consistent: {least_variable} (SD = {corpus_row[least_variable]:.3f})")

        # Calculate overall variability score
        avg_variability = corpus_row.mean()
        print(f"  • Average variability: {avg_variability:.3f}")
        print()

    # Cross-corpus comparison
    print("Cross-Corpus Variability Comparison:")
    print("-" * 35)

    for feature in variability_data.columns:
        feature_variability = variability_data[feature]
        most_variable_corpus = feature_variability.idxmax()
        least_variable_corpus = feature_variability.idxmin()

        print(f"{feature}:")
        print(f"  Most variable in: {most_variable_corpus} ({feature_variability[most_variable_corpus]:.3f})")
        print(f"  Most consistent in: {least_variable_corpus} ({feature_variability[least_variable_corpus]:.3f})")
        print()

    print("Interpretation Guide:")
    print("• Dark red = High variability (inconsistent patterns)")
    print("• Light red = Low variability (consistent patterns)")
    print("• This shows internal consistency within each corpus type")

else:
    print("Failed to generate variability heatmap.")

Dataset loaded: 612 records
Creating feature variability heatmap...
Using 602 complete records
✓ Feature variability heatmap saved as: feature_variability_heatmap.html



Feature Variability Analysis:
(Higher values = more inconsistent/variable within corpus)

BNC_BASELINE:
  • Most variable: Post_Comparator_Tokens (SD = 1.756)
  • Most consistent: Pre_Post_Ratio (SD = 0.234)
  • Average variability: 0.927

COMPUTATIONAL_EXTRACTION:
  • Most variable: Total_Tokens (SD = 14.565)
  • Most consistent: Sentiment_Polarity (SD = 0.274)
  • Average variability: 6.073

MANUAL_ANNOTATION:
  • Most variable: Total_Tokens (SD = 21.554)
  • Most consistent: Sentiment_Polarity (SD = 0.256)
  • Average variability: 7.866

Cross-Corpus Variability Comparison:
-----------------------------------
Pre_Comparator_Tokens:
  Most variable in: Manual_Annotation (10.032)
  Most consistent in: BNC_Baseline (1.232)

Post_Comparator_Tokens:
  Most variable in: Manual_Annotation (16.562)
  Most consistent in: BNC_Baseline (1.756)

Pre_Post_Ratio:
  Most variable in: Manual_Annotation (4.219)
  Most consistent in: BNC_Baseline (0.234)

Total_Tokens:
  Most variable in: Manual_Ann

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

# Load the dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_category_intensity_heatmap(data):
    """
    Creates a heatmap showing feature intensity across different category frameworks.
    Shows how linguistic features vary by simile categorization type.
    """
    print("Creating category-feature intensity heatmap...")

    # Define linguistic features to analyze
    feature_columns = [
        'Pre_Comparator_Tokens', 'Post_Comparator_Tokens', 'Pre_Post_Ratio',
        'Sentiment_Polarity', 'Sentiment_Subjectivity', 'Syntactic_Complexity'
    ]

    feature_labels = [
        'Pre-Comp\nTokens', 'Post-Comp\nTokens', 'Token\nRatio',
        'Sentiment\nPolarity', 'Sentiment\nSubjectivity', 'Syntactic\nComplexity'
    ]

    # Clean the data
    analysis_data = data[['Category_Framework'] + feature_columns].dropna()
    print(f"Using {len(analysis_data)} complete records")

    # Check available categories
    available_categories = analysis_data['Category_Framework'].unique()
    print(f"Available categories: {list(available_categories)}")

    # Calculate mean values for each category
    category_means = analysis_data.groupby('Category_Framework')[feature_columns].mean()

    # Clean up category names for display
    clean_category_names = []
    for category in category_means.index:
        # Truncate long category names and clean up
        clean_name = str(category)
        if len(clean_name) > 20:
            clean_name = clean_name[:17] + "..."
        # Replace underscores with spaces
        clean_name = clean_name.replace('_', ' ').title()
        clean_category_names.append(clean_name)

    # Apply z-score normalization for better comparison across features
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(category_means.values)

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=normalized_data,
        x=feature_labels,
        y=clean_category_names,
        colorscale='RdYlBu_r',
        zmid=0,
        text=np.round(category_means.values, 2),  # Show original values
        texttemplate="%{text}",
        textfont={"size": 9, "color": "black"},
        hovertemplate="<b>%{y}</b><br>" +
                     "%{x}<br>" +
                     "Mean Value: %{text}<br>" +
                     "Z-score: %{z:.2f}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="Normalized<br>Intensity<br>(Z-score)",
            titleside="right",
            thickness=20,
            len=0.8
        )
    ))

    # Configure layout
    fig.update_layout(
        title={
            'text': "Category Framework Feature Intensity: Linguistic Patterns by Simile Type",
            'x': 0.5,
            'font': {'size': 16, 'family': 'Arial'}
        },
        width=1000,
        height=max(400, len(clean_category_names) * 40),  # Dynamic height based on categories
        font=dict(size=11, family='Arial'),
        margin=dict(l=150, r=120, t=80, b=80),
        plot_bgcolor='white',
        paper_bgcolor='white'
    )

    # Style the axes
    fig.update_xaxes(tickangle=0)
    fig.update_yaxes(tickangle=0)

    return fig, category_means, normalized_data

def create_category_comparison_chart(category_means):
    """
    Creates a supplementary chart showing the most distinctive features for each category.
    """
    # Calculate which feature is highest for each category
    distinctive_features = {}
    for category in category_means.index:
        category_row = category_means.loc[category]
        # Find the feature with highest z-score for this category
        max_feature = category_row.idxmax()
        max_value = category_row[max_feature]
        distinctive_features[category] = (max_feature, max_value)

    return distinctive_features

# Generate the category intensity heatmap
intensity_fig, category_data, normalized_matrix = create_category_intensity_heatmap(corpus_dataset)

if intensity_fig is not None:
    # Save the visualization
    intensity_fig.write_html("category_intensity_heatmap.html")
    print("✓ Category intensity heatmap saved as: category_intensity_heatmap.html")

    # Show the plot
    intensity_fig.show()

    # Generate distinctive features analysis
    distinctive_features = create_category_comparison_chart(category_data)

    # Print detailed analysis
    print("\nCategory Framework Analysis:")
    print("=" * 35)

    # Show statistics for each category
    for i, category in enumerate(category_data.index):
        print(f"\n{category.upper()}:")
        print("-" * 25)

        category_row = category_data.loc[category]

        # Show all feature values
        for feature, value in category_row.items():
            print(f"  {feature}: {value:.2f}")

        # Show distinctive feature
        if category in distinctive_features:
            feature, value = distinctive_features[category]
            print(f"  → Most prominent: {feature} ({value:.2f})")

    # Cross-category feature comparison
    print(f"\nFeature Prominence by Category:")
    print("-" * 35)

    for feature in category_data.columns:
        feature_values = category_data[feature]
        highest_category = feature_values.idxmax()
        lowest_category = feature_values.idxmin()

        print(f"\n{feature}:")
        print(f"  Highest in: {highest_category} ({feature_values[highest_category]:.2f})")
        print(f"  Lowest in: {lowest_category} ({feature_values[lowest_category]:.2f})")
        print(f"  Range: {feature_values.max() - feature_values.min():.2f}")

    # Overall patterns
    print(f"\nOverall Patterns:")
    print("-" * 20)

    # Calculate category diversity (how much each category differs from others)
    category_diversity = normalized_matrix.std(axis=1)
    most_distinctive_idx = category_diversity.argmax()
    most_similar_idx = category_diversity.argmin()

    print(f"Most distinctive category: {list(category_data.index)[most_distinctive_idx]}")
    print(f"Most similar to others: {list(category_data.index)[most_similar_idx]}")

    print(f"\nInterpretation Guide:")
    print("• Red = Above average for that feature")
    print("• Blue = Below average for that feature")
    print("• This shows which linguistic patterns characterize each simile category")

else:
    print("Failed to generate category intensity heatmap.")

Dataset loaded: 612 records
Creating category-feature intensity heatmap...
Using 601 complete records
Available categories: ['Joycean_Silent', 'Joycean_Framed', 'Joycean_Quasi', 'Standard', 'Joycean_Quasi_Fuzzy']
✓ Category intensity heatmap saved as: category_intensity_heatmap.html



Category Framework Analysis:

JOYCEAN_FRAMED:
-------------------------
  Pre_Comparator_Tokens: 23.27
  Post_Comparator_Tokens: 33.00
  Pre_Post_Ratio: 1.41
  Sentiment_Polarity: -0.08
  Sentiment_Subjectivity: 0.54
  Syntactic_Complexity: 6.91
  → Most prominent: Post_Comparator_Tokens (33.00)

JOYCEAN_QUASI:
-------------------------
  Pre_Comparator_Tokens: 11.62
  Post_Comparator_Tokens: 13.94
  Pre_Post_Ratio: 2.21
  Sentiment_Polarity: 0.00
  Sentiment_Subjectivity: 0.41
  Syntactic_Complexity: 5.97
  → Most prominent: Post_Comparator_Tokens (13.94)

JOYCEAN_QUASI_FUZZY:
-------------------------
  Pre_Comparator_Tokens: 11.04
  Post_Comparator_Tokens: 11.89
  Pre_Post_Ratio: 1.73
  Sentiment_Polarity: 0.01
  Sentiment_Subjectivity: 0.37
  Syntactic_Complexity: 5.37
  → Most prominent: Post_Comparator_Tokens (11.89)

JOYCEAN_SILENT:
-------------------------
  Pre_Comparator_Tokens: 13.44
  Post_Comparator_Tokens: 16.89
  Pre_Post_Ratio: 3.09
  Sentiment_Polarity: 0.12
  Sentim

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Load the comprehensive dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Dataset loaded: {len(corpus_dataset)} records")

def create_comprehensive_distribution_analysis(data):
    """
    Creates multiple distribution visualizations showing simile patterns across Dubliners.
    """
    print("Creating comprehensive simile distribution analysis...")

    # Clean and prepare the data
    # Handle page numbers from multiple possible columns
    page_col = None
    for col in ['Page_Number', 'Page No.', 'Page_No']:
        if col in data.columns and not data[col].isna().all():
            page_col = col
            break

    if page_col is None:
        print("Error: No valid page number column found")
        return None

    # Clean the analysis data
    analysis_data = data[['Story', page_col, 'Category_Framework', 'Dataset_Source', 'Comparator_Type']].copy()

    # Convert page numbers to numeric
    analysis_data['Page_Numeric'] = pd.to_numeric(analysis_data[page_col], errors='coerce')
    analysis_data = analysis_data.dropna(subset=['Story', 'Page_Numeric', 'Category_Framework'])

    print(f"Analyzing {len(analysis_data)} similes with complete data")
    print(f"Page range: {analysis_data['Page_Numeric'].min():.0f} to {analysis_data['Page_Numeric'].max():.0f}")
    print(f"Stories: {sorted(analysis_data['Story'].unique())}")
    print(f"Categories: {sorted(analysis_data['Category_Framework'].unique())}")

    return analysis_data

def create_story_page_heatmap(data):
    """
    Creates a heatmap showing simile distribution across stories and pages.
    """
    # Count similes per page per story
    page_story_counts = data.groupby(['Story', 'Page_Numeric']).size().reset_index(name='Simile_Count')

    # Create complete grid for all stories and pages
    stories = sorted(data['Story'].unique())
    min_page = int(data['Page_Numeric'].min())
    max_page = int(data['Page_Numeric'].max())

    # Create a more manageable page range (group by 5-page intervals)
    page_ranges = list(range(min_page, max_page + 1, 5))

    # Assign each page to a range
    data_with_ranges = data.copy()
    data_with_ranges['Page_Range'] = data_with_ranges['Page_Numeric'].apply(
        lambda x: f"{int(x//5)*5}-{int(x//5)*5+4}"
    )

    # Count similes by story and page range
    heatmap_data = data_with_ranges.groupby(['Story', 'Page_Range']).size().reset_index(name='Simile_Count')
    heatmap_matrix = heatmap_data.pivot(index='Story', columns='Page_Range', values='Simile_Count').fillna(0)

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_matrix.values,
        x=heatmap_matrix.columns,
        y=heatmap_matrix.index,
        colorscale='YlOrRd',
        text=heatmap_matrix.values.astype(int),
        texttemplate="%{text}",
        textfont={"size": 8},
        hovertemplate="<b>%{y}</b><br>" +
                     "Pages %{x}<br>" +
                     "Similes: %{z}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="Similes per<br>Page Range",
            titleside="right",
            thickness=15,
            len=0.8
        )
    ))

    fig.update_layout(
        title="Simile Distribution: By Story and Page Range",
        width=1200,
        height=600,
        xaxis_title="Page Range",
        yaxis_title="Story",
        font=dict(size=11)
    )

    return fig, heatmap_matrix

def create_category_distribution_heatmap(data):
    """
    Creates a heatmap showing category distribution across stories.
    """
    # Count categories by story
    category_story = data.groupby(['Story', 'Category_Framework']).size().reset_index(name='Count')
    category_matrix = category_story.pivot(index='Story', columns='Category_Framework', values='Count').fillna(0)

    # Create the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=category_matrix.values,
        x=category_matrix.columns,
        y=category_matrix.index,
        colorscale='Viridis',
        text=category_matrix.values.astype(int),
        texttemplate="%{text}",
        textfont={"size": 10},
        hovertemplate="<b>%{y}</b><br>" +
                     "%{x}<br>" +
                     "Count: %{z}<br>" +
                     "<extra></extra>",
        colorbar=dict(
            title="Simile<br>Count",
            titleside="right",
            thickness=15,
            len=0.8
        )
    ))

    fig.update_layout(
        title="Simile Categories: Distribution Across Stories",
        width=1000,
        height=600,
        xaxis_title="Category Framework",
        yaxis_title="Story",
        font=dict(size=11),
        xaxis_tickangle=45
    )

    return fig, category_matrix

def create_dataset_source_comparison(data):
    """
    Creates visualization comparing different dataset sources.
    """
    # Count by dataset source and story
    dataset_story = data.groupby(['Dataset_Source', 'Story']).size().reset_index(name='Count')

    # Create stacked bar chart
    fig = px.bar(
        dataset_story,
        x='Story',
        y='Count',
        color='Dataset_Source',
        title="Simile Detection by Source: Manual vs Computational vs BNC",
        labels={'Count': 'Number of Similes', 'Story': 'Dubliners Stories'},
        color_discrete_map={
            'MANUAL_ANNOTATION': '#e74c3c',
            'COMPUTATIONAL_EXTRACTION': '#3498db',
            'BNC_BASELINE': '#95a5a6'
        }
    )

    fig.update_layout(
        width=1200,
        height=500,
        xaxis_tickangle=45,
        font=dict(size=11),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    return fig

def create_simile_density_by_story(data):
    """
    Creates a density analysis showing simile concentration patterns.
    """
    # Calculate pages per story and simile density
    story_stats = []

    for story in data['Story'].unique():
        story_data = data[data['Story'] == story]

        min_page = story_data['Page_Numeric'].min()
        max_page = story_data['Page_Numeric'].max()
        page_span = max_page - min_page + 1
        total_similes = len(story_data)
        density = total_similes / page_span if page_span > 0 else 0

        story_stats.append({
            'Story': story,
            'Total_Similes': total_similes,
            'Page_Span': page_span,
            'Density': density,
            'Start_Page': min_page,
            'End_Page': max_page
        })

    density_df = pd.DataFrame(story_stats)
    density_df = density_df.sort_values('Density', ascending=True)

    # Create horizontal bar chart
    fig = px.bar(
        density_df,
        x='Density',
        y='Story',
        orientation='h',
        title="Simile Density by Story (Similes per Page)",
        labels={'Density': 'Similes per Page', 'Story': 'Story'},
        color='Density',
        color_continuous_scale='YlOrRd',
        text='Total_Similes'
    )

    fig.update_traces(texttemplate='%{text} total', textposition='outside')
    fig.update_layout(
        width=800,
        height=600,
        font=dict(size=11),
        showlegend=False
    )

    return fig, density_df

def create_progressive_accumulation_chart(data):
    """
    Shows how similes accumulate throughout the book.
    """
    # Sort by page number
    sorted_data = data.sort_values('Page_Numeric')
    sorted_data['Cumulative_Count'] = range(1, len(sorted_data) + 1)

    # Create line chart
    fig = px.line(
        sorted_data,
        x='Page_Numeric',
        y='Cumulative_Count',
        title="Simile Accumulation Throughout Dubliners",
        labels={'Page_Numeric': 'Page Number', 'Cumulative_Count': 'Cumulative Simile Count'},
        hover_data=['Story', 'Category_Framework']
    )

    # Add story boundaries as vertical lines
    story_boundaries = sorted_data.groupby('Story')['Page_Numeric'].min().sort_values()

    for story, start_page in story_boundaries.items():
        fig.add_vline(
            x=start_page,
            line_dash="dash",
            line_color="gray",
            opacity=0.5,
            annotation_text=story[:10] + "...",
            annotation_position="top"
        )

    fig.update_layout(
        width=1200,
        height=500,
        font=dict(size=11)
    )

    return fig

# Execute the comprehensive analysis
def run_complete_distribution_analysis():
    """
    Runs all distribution analyses and saves multiple visualizations.
    """
    # Prepare data
    clean_data = create_comprehensive_distribution_analysis(corpus_dataset)

    if clean_data is None:
        return

    print("\n" + "="*60)
    print("CREATING COMPREHENSIVE DISTRIBUTION VISUALIZATIONS")
    print("="*60)

    # 1. Story-Page Heatmap
    print("1. Creating story-page distribution heatmap...")
    story_page_fig, story_page_matrix = create_story_page_heatmap(clean_data)
    story_page_fig.write_html("story_page_distribution.html")
    story_page_fig.show()

    # 2. Category Distribution Heatmap
    print("2. Creating category distribution heatmap...")
    category_fig, category_matrix = create_category_distribution_heatmap(clean_data)
    category_fig.write_html("category_distribution.html")
    category_fig.show()

    # 3. Dataset Source Comparison
    print("3. Creating dataset source comparison...")
    source_fig = create_dataset_source_comparison(clean_data)
    source_fig.write_html("dataset_source_comparison.html")
    source_fig.show()

    # 4. Simile Density Analysis
    print("4. Creating simile density analysis...")
    density_fig, density_stats = create_simile_density_by_story(clean_data)
    density_fig.write_html("simile_density_analysis.html")
    density_fig.show()

    # 5. Progressive Accumulation
    print("5. Creating progressive accumulation chart...")
    accumulation_fig = create_progressive_accumulation_chart(clean_data)
    accumulation_fig.write_html("simile_accumulation.html")
    accumulation_fig.show()

    # Summary Analysis
    print("\n" + "="*50)
    print("DISTRIBUTION ANALYSIS SUMMARY")
    print("="*50)

    # Overall statistics
    total_similes = len(clean_data)
    total_stories = clean_data['Story'].nunique()
    page_range = clean_data['Page_Numeric'].max() - clean_data['Page_Numeric'].min()
    overall_density = total_similes / page_range

    print(f" Total similes analyzed: {total_similes}")
    print(f" Stories covered: {total_stories}")
    print(f" Page range: {clean_data['Page_Numeric'].min():.0f}-{clean_data['Page_Numeric'].max():.0f}")
    print(f" Overall density: {overall_density:.2f} similes per page")

    # Category breakdown
    print(f"\n Category Distribution:")
    category_counts = clean_data['Category_Framework'].value_counts()
    for category, count in category_counts.items():
        percentage = (count / total_similes) * 100
        print(f"  • {category}: {count} ({percentage:.1f}%)")

    # Story with highest/lowest density
    highest_density_story = density_stats.loc[density_stats['Density'].idxmax()]
    lowest_density_story = density_stats.loc[density_stats['Density'].idxmin()]

    print(f"\n Density Extremes:")
    print(f"  • Highest: {highest_density_story['Story']} ({highest_density_story['Density']:.2f} similes/page)")
    print(f"  • Lowest: {lowest_density_story['Story']} ({lowest_density_story['Density']:.2f} similes/page)")

    # Dataset source breakdown
    print(f"\n Detection Source Breakdown:")
    source_counts = clean_data['Dataset_Source'].value_counts()
    for source, count in source_counts.items():
        percentage = (count / total_similes) * 100
        print(f"  • {source}: {count} ({percentage:.1f}%)")

    print(f"\n All visualizations saved as HTML files!")
    print(f" Files created:")
    print(f"  • story_page_distribution.html")
    print(f"  • category_distribution.html")
    print(f"  • dataset_source_comparison.html")
    print(f"  • simile_density_analysis.html")
    print(f"  • simile_accumulation.html")

# Run the complete analysis
run_complete_distribution_analysis()

Dataset loaded: 612 records
Creating comprehensive simile distribution analysis...
Analyzing 171 similes with complete data
Page range: 7 to 256
Stories: ['A Little Cloud', 'A Mother', 'A Painful Case', 'After The Race', 'An Encounter', 'Araby', 'Clay', 'Counterparts', 'Eveline', 'Grace', 'Ive Day In The Committee Room', 'The Boarding House', 'The Dead', 'The Sisters', 'Two Gallants']
Categories: ['Joycean_Framed', 'Joycean_Quasi', 'Joycean_Quasi_Fuzzy', 'Joycean_Silent', 'Standard']

CREATING COMPREHENSIVE DISTRIBUTION VISUALIZATIONS
1. Creating story-page distribution heatmap...


2. Creating category distribution heatmap...


3. Creating dataset source comparison...


4. Creating simile density analysis...


5. Creating progressive accumulation chart...



DISTRIBUTION ANALYSIS SUMMARY
 Total similes analyzed: 171
 Stories covered: 15
 Page range: 7-256
 Overall density: 0.69 similes per page

 Category Distribution:
  • Standard: 89 (52.0%)
  • Joycean_Quasi: 49 (28.7%)
  • Joycean_Framed: 16 (9.4%)
  • Joycean_Quasi_Fuzzy: 12 (7.0%)
  • Joycean_Silent: 5 (2.9%)

 Density Extremes:
  • Highest: The Sisters (1.55 similes/page)
  • Lowest: A Painful Case (0.38 similes/page)

 Detection Source Breakdown:
  • Manual_Annotation: 171 (100.0%)

 All visualizations saved as HTML files!
 Files created:
  • story_page_distribution.html
  • category_distribution.html
  • dataset_source_comparison.html
  • simile_density_analysis.html
  • simile_accumulation.html


## Visualization Outputs

The execution of this notebook generates:

1. **`joyce_simile_dashboard.html`** - Complete interactive dashboard
2. **Individual visualization files** - Separate HTML files for each analysis
3. **Statistical validation** - Wilson Score intervals and significance testing
4. **Publication-ready figures** - Academic quality visualizations

## Usage for Thesis

- Use the dashboard for comprehensive presentation of findings
- Individual plots can be embedded in specific thesis sections
- All visualizations demonstrate Joyce's stylistic innovations with statistical evidence
- Interactive elements allow detailed exploration of patterns

In [None]:
import pandas as pd
import numpy as np
import requests
import re
from pathlib import Path
from IPython.display import HTML, display
import os

# Load the comprehensive simile dataset
corpus_dataset = pd.read_csv('/comprehensive_linguistic_analysis.csv')
print(f"Simile dataset loaded: {len(corpus_dataset)} records")

def download_dubliners_text():
    """Download Dubliners from Project Gutenberg."""
    url = "https://www.gutenberg.org/files/2814/2814-0.txt"

    try:
        print("Downloading Dubliners from Project Gutenberg...")
        response = requests.get(url)
        response.raise_for_status()
        text = response.text

        # Clean the text
        start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
        end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
        end_marker_alt = "*** END OF THIS PROJECT GUTENBERG EBOOK"

        if start_marker in text:
            text = text.split(start_marker)[1]
        if end_marker in text:
            text = text.split(end_marker)[0]
        elif end_marker_alt in text:
            text = text.split(end_marker_alt)[0]

        print(f"Downloaded {len(text):,} characters")
        return text

    except Exception as e:
        print(f"Error downloading text: {e}")
        return None

def get_highlight_color(category, confidence=0.8):
    """Get CSS color for highlighting based on category and confidence."""
    # Base colors by category complexity
    color_map = {
        'Standard': {'hue': 60, 'name': 'Standard (Yellow-Green)'},
        'Joycean_Quasi_Fuzzy': {'hue': 45, 'name': 'Quasi-Fuzzy (Orange)'},
        'Joycean_Quasi': {'hue': 30, 'name': 'Quasi (Orange-Red)'},
        'Joycean_Silent': {'hue': 15, 'name': 'Silent (Red-Orange)'},
        'Joycean_Hybrid': {'hue': 0, 'name': 'Hybrid (Red)'},
        'Joycean_Complex': {'hue': 330, 'name': 'Complex (Magenta)'},
        'Joycean_Framed': {'hue': 280, 'name': 'Framed (Purple)'}
    }

    category_str = str(category) if not pd.isna(category) else 'Unknown'

    if category_str not in color_map:
        return {'background': 'rgba(128, 128, 128, 0.3)', 'border': 'gray', 'name': 'Unknown'}

    hue = color_map[category_str]['hue']
    saturation = int(50 + (confidence * 30))
    lightness = int(85 - (confidence * 15))
    alpha = 0.3 + (confidence * 0.4)

    background = f"hsla({hue}, {saturation}%, {lightness}%, {alpha})"
    border_color = f"hsl({hue}, {saturation + 20}%, {lightness - 20}%)"

    return {
        'background': background,
        'border': border_color,
        'name': color_map[category_str]['name'],
        'confidence': confidence
    }

def extract_main_content(text):
    """Extract the main story content, excluding table of contents and prefaces."""
    # Look for the start of the first story "THE SISTERS"
    patterns_to_try = [
        r'\n\s*THE SISTERS\s*\n\s*\n',  # THE SISTERS followed by double newline
        r'\n\s*THE SISTERS\s*\n[^\n]*\n',  # THE SISTERS followed by any content then newline
        r'THE SISTERS\s*\n\s*There was no hope',  # THE SISTERS followed by the actual story text
        r'THE SISTERS\s*\n\s*\n\s*There was no hope'  # With extra whitespace
    ]

    for pattern in patterns_to_try:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # Find where "THE SISTERS" actually starts in the match
            sisters_pos = text.lower().find("the sisters", match.start())
            if sisters_pos != -1:
                print(f"Main content starts at position {sisters_pos} using pattern: {pattern}")
                return text[sisters_pos:], sisters_pos

    # Fallback: look for "THE SISTERS" that's followed by story content
    sisters_matches = []
    for match in re.finditer(r'THE SISTERS', text, re.IGNORECASE):
        start_pos = match.start()
        # Check what comes after - should be story content, not more titles
        following_text = text[start_pos:start_pos + 200].lower()
        if 'there was no hope' in following_text or 'night after night' in following_text:
            sisters_matches.append(start_pos)

    if sisters_matches:
        start_pos = sisters_matches[-1]  # Use the last match (likely the actual story)
        print(f"Main content starts at position {start_pos} (fallback method)")
        return text[start_pos:], start_pos

    print("Could not find 'THE SISTERS' story start - using full text")
    return text, 0

def match_similes_to_text(simile_data, full_text):
    """Match similes from the dataset to their positions in the full text."""
    print("Matching manually annotated similes to text positions...")
    print(f"Total similes to match: {len(simile_data)}")

    # Extract main content (stories only, not table of contents)
    main_content, content_offset = extract_main_content(full_text)

    matched_similes = []
    text_lower = main_content.lower()
    text_clean = re.sub(r'\s+', ' ', text_lower)

    successful_matches = 0

    for idx, row in simile_data.iterrows():
        if pd.isna(row.get('Sentence_Context')):
            continue

        sentence = str(row['Sentence_Context']).strip()
        sentence_clean = re.sub(r'\s+', ' ', sentence).strip()

        if len(sentence_clean) < 5:
            continue

        sentence_lower = sentence_clean.lower()
        start_pos = -1
        match_method = "none"

        # Strategy 1: Exact match
        start_pos = text_lower.find(sentence_lower)
        if start_pos != -1:
            match_method = "exact"

        # Strategy 2: Exact match on cleaned text
        if start_pos == -1:
            start_pos = text_clean.find(sentence_lower)
            if start_pos != -1:
                match_method = "cleaned"

        # Strategy 3: Remove punctuation and try again
        if start_pos == -1:
            sentence_no_punct = re.sub(r'[^\w\s]', '', sentence_lower)
            text_no_punct = re.sub(r'[^\w\s]', '', text_lower)
            match_pos = text_no_punct.find(sentence_no_punct)
            if match_pos != -1:
                # Approximate position mapping
                words_before = len(text_no_punct[:match_pos].split())
                original_words = text_lower.split()
                if words_before < len(original_words):
                    word_positions = []
                    current_pos = 0
                    for word in original_words:
                        word_start = text_lower.find(word, current_pos)
                        if word_start != -1:
                            word_positions.append(word_start)
                            current_pos = word_start + len(word)

                    if words_before < len(word_positions):
                        start_pos = word_positions[words_before]
                        match_method = "no_punct"

        # Strategy 4: Try first few words
        if start_pos == -1:
            words = sentence_lower.split()
            if len(words) >= 3:
                for word_count in [min(15, len(words)), min(10, len(words)), min(7, len(words)), min(5, len(words))]:
                    partial_sentence = ' '.join(words[:word_count])
                    partial_pos = text_lower.find(partial_sentence)
                    if partial_pos != -1:
                        start_pos = partial_pos
                        match_method = f"first_{word_count}_words"
                        break

        # Strategy 5: Try substring search with key phrases
        if start_pos == -1:
            words = sentence_lower.split()
            if len(words) >= 5:
                # Try 5-word sliding window
                for i in range(len(words) - 4):
                    phrase = ' '.join(words[i:i+5])
                    phrase_pos = text_lower.find(phrase)
                    if phrase_pos != -1:
                        start_pos = phrase_pos
                        match_method = "sliding_window"
                        break

        # Strategy 6: Try without leading/trailing words
        if start_pos == -1:
            words = sentence_lower.split()
            if len(words) > 6:
                # Try removing first and last word
                middle_sentence = ' '.join(words[1:-1])
                middle_pos = text_lower.find(middle_sentence)
                if middle_pos != -1:
                    start_pos = middle_pos
                    match_method = "middle_words"

        if start_pos != -1:
            successful_matches += 1

            # Calculate end position - use original sentence length but adjust to word boundaries
            end_pos = start_pos + len(sentence_clean)

            # Adjust to word boundaries
            while start_pos > 0 and main_content[start_pos - 1].isalnum():
                start_pos -= 1

            while end_pos < len(main_content) and main_content[end_pos].isalnum():
                end_pos += 1

            # Ensure we don't go beyond text length
            end_pos = min(end_pos, len(main_content))

            # Add content offset to get position in full text
            final_start = start_pos + content_offset
            final_end = end_pos + content_offset

            matched_similes.append({
                'start': final_start,
                'end': final_end,
                'sentence': sentence,
                'category': row.get('Category_Framework', 'Unknown'),
                'story': row.get('Story', 'Unknown'),
                'page': row.get('Page_Number', 'Unknown'),
                'comparator': row.get('Comparator_Type', 'Unknown'),
                'color_info': get_highlight_color(row.get('Category_Framework', 'Unknown')),
                'dataset_source': row.get('Dataset_Source', 'Unknown'),
                'match_method': match_method
            })

    # Sort by position in text
    matched_similes.sort(key=lambda x: x['start'])

    print(f"Successfully matched {successful_matches} out of {len(simile_data)} similes")
    if len(simile_data) > 0:
        print(f"Match rate: {successful_matches/len(simile_data)*100:.1f}%")

    # Show matching strategy breakdown
    if matched_similes:
        from collections import Counter
        strategy_counts = Counter([s['match_method'] for s in matched_similes])
        print("Matching strategies used:")
        for strategy, count in strategy_counts.most_common():
            print(f"  {strategy}: {count}")

    return matched_similes

def split_into_stories_with_positions(text):
    """Split text into stories and track their positions."""
    story_titles = [
        "THE SISTERS", "AN ENCOUNTER", "ARABY", "EVELINE", "AFTER THE RACE",
        "TWO GALLANTS", "THE BOARDING HOUSE", "A LITTLE CLOUD", "COUNTERPARTS",
        "CLAY", "A PAINFUL CASE", "IVY DAY IN THE COMMITTEE ROOM",
        "A MOTHER", "GRACE", "THE DEAD"
    ]

    stories = []
    current_pos = 0

    for i, title in enumerate(story_titles):
        match = re.search(re.escape(title), text[current_pos:], re.IGNORECASE)

        if match:
            start_pos = current_pos + match.start()
            end_pos = len(text)

            if i + 1 < len(story_titles):
                next_title = story_titles[i + 1]
                next_match = re.search(re.escape(next_title), text[start_pos + len(title):], re.IGNORECASE)
                if next_match:
                    end_pos = start_pos + len(title) + next_match.start()

            stories.append({
                'title': title,
                'start': start_pos,
                'end': end_pos,
                'text': text[start_pos:end_pos]
            })
            current_pos = start_pos + len(title)

    return stories

def apply_simile_highlighting(text, matched_similes):
    """Apply highlighting to text using a more robust approach."""
    if not matched_similes:
        return text

    # Sort similes by start position to ensure correct processing
    sorted_similes = sorted(matched_similes, key=lambda x: x['start'])

    result = ""
    last_pos = 0

    for simile in sorted_similes:
        start = simile['start']
        end = simile['end']

        # Skip overlapping similes
        if start < last_pos:
            continue

        # Add text before this simile
        if start > last_pos:
            result += text[last_pos:start]

        # Add the highlighted simile
        simile_text = text[start:end]
        category_class = str(simile.get('category', 'Unknown')).replace('_', '-').lower()
        color_info = simile['color_info']

        tooltip_text = f"Category: {simile.get('category', 'N/A')}<br>Comparator: {simile.get('comparator', 'N/A')}<br>Dataset: {simile.get('dataset_source', 'N/A')}"

        # Escape HTML in the simile text
        escaped_text = simile_text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

        result += f'<span class="simile-highlight simile-{category_class}" style="background: {color_info["background"]}; border-left-color: {color_info["border"]};" data-category="{simile.get("category", "Unknown")}">{escaped_text}<div class="tooltip">{tooltip_text}</div></span>'

        last_pos = end

    # Add remaining text
    if last_pos < len(text):
        result += text[last_pos:]

    return result

def create_interactive_html(text, matched_similes):
    """Create an interactive HTML page with the full text and simile highlighting."""
    print("Creating interactive HTML visualization...")

    # Calculate statistics
    simile_count = len(matched_similes)
    category_stats = {}
    for simile in matched_similes:
        category = simile.get('category', 'Unknown')
        category_stats[category] = category_stats.get(category, 0) + 1

    # Split into stories for navigation
    stories = split_into_stories_with_positions(text)

    # Apply highlighting
    highlighted_text = apply_simile_highlighting(text, matched_similes)

    # Add HTML anchors to story titles in the text
    story_id_map = {
        "THE SISTERS": "sisters", "AN ENCOUNTER": "an-encounter", "ARABY": "araby",
        "EVELINE": "eveline", "AFTER THE RACE": "after-race", "TWO GALLANTS": "two-gallants",
        "THE BOARDING HOUSE": "boarding-house", "A LITTLE CLOUD": "a-little-cloud",
        "COUNTERPARTS": "counterparts", "CLAY": "clay", "A PAINFUL CASE": "a-painful-case",
        "IVY DAY IN THE COMMITTEE ROOM": "ivy-day-in-committee-room", "A MOTHER": "a-mother",
        "GRACE": "grace", "THE DEAD": "dead"
    }

    text_with_anchors = highlighted_text # Start with highlighted text

    # Iterate through stories to insert anchors
    # Process in reverse order to avoid position shifts affecting subsequent replacements
    for story in reversed(stories):
        title = story['title']
        story_id = story_id_map.get(title, title.lower().replace(' ', '-'))
        start_pos = story['start'] # Use the start position from split_into_stories_with_positions

        # Find the actual text of the title in the highlighted text
        # We need to be careful because highlighting might have added spans
        # A robust way is to find the raw title text and insert the anchor before it

        # Find the position of the raw title text in the original full text
        raw_title_match = re.search(re.escape(title), text, re.IGNORECASE)

        if raw_title_match:
             raw_title_start_in_full_text = raw_title_match.start()

             # Now find this position in the highlighted text. This is tricky with spans.
             # A simpler approach is to recreate the highlighted text with anchors inserted during the segment processing.
             # However, given the current structure, let's try inserting directly, being mindful of potential issues.

             # Let's assume the highlighted text still closely maps to original positions for now.
             # This might require more complex logic if overlaps or complex HTML are within titles.
             # For a simple insertion, find the title text in the highlighted string.
             # This is risky but a quick fix attempt. A better approach is to build the HTML differently.

             # Let's try a simpler strategy: find the first occurrence of the title text in the *highlighted* text
             # and insert the anchor there. This might fail if the title itself is highlighted.

             # Find the title in the highlighted text
             title_in_highlighted_match = highlighted_text.find(title) # Case-sensitive might be better here

             if title_in_highlighted_match != -1:
                 anchor_html = f'<div id="{story_id}" class="story-title-anchor"></div>'
                 # Insert the anchor *before* the title text in the highlighted content
                 text_with_anchors = text_with_anchors[:title_in_highlighted_match] + anchor_html + text_with_anchors[title_in_highlighted_match:]
                 print(f"Inserted anchor for '{title}' at position {title_in_highlighted_match}")
             else:
                 print(f"Warning: Could not find '{title}' in highlighted text to insert anchor.")


    # Build HTML content
    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Dubliners: Interactive Simile Visualization</title>
    <style>
        body {{
            font-family: 'Georgia', serif;
            line-height: 1.7;
            max-width: 1000px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f8f9fa;
            color: #333;
        }}

        .header {{
            text-align: center;
            margin-bottom: 30px;
            background: white;
            padding: 30px;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
        }}

        .header h1 {{
            color: #2c3e50;
            margin-bottom: 10px;
            font-size: 2.5em;
        }}

        .header p {{
            color: #666;
            font-style: italic;
            font-size: 1.1em;
            margin-bottom: 20px;
        }}

        .stats {{
            display: flex;
            justify-content: center;
            gap: 30px;
            margin-top: 20px;
            flex-wrap: wrap;
        }}

        .stat-item {{
            text-align: center;
        }}

        .stat-number {{
            font-size: 2em;
            font-weight: bold;
            color: #e74c3c;
            display: block;
        }}

        .stat-label {{
            font-size: 0.9em;
            color: #666;
        }}

        .controls {{
            background: white;
            padding: 25px;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
            margin-bottom: 30px;
            position: sticky;
            top: 10px;
            z-index: 100;
        }}

        .legend {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 15px;
            margin-bottom: 20px;
        }}

        .legend-item {{
            display: flex;
            align-items: center;
            gap: 10px;
            padding: 10px 15px;
            border-radius: 25px;
            background: #f8f9fa;
            border: 2px solid #dee2e6;
            transition: transform 0.2s ease;
        }}

        .legend-item:hover {{
            transform: translateY(-2px);
            box-shadow: 0 4px 10px rgba(0,0,0,0.1);
        }}

        .legend-color {{
            width: 25px;
            height: 25px;
            border-radius: 50%;
            border: 2px solid #333;
            flex-shrink: 0;
        }}

        .legend-text {{
            font-weight: 500;
            color: #333;
            flex-grow: 1;
            word-break: break-word;
        }}

        .legend-count {{
            margin-left: auto;
            background: #e9ecef;
            padding: 3px 8px;
            border-radius: 12px;
            font-size: 0.85em;
            font-weight: bold;
            flex-shrink: 0;
        }}

        .filter-controls {{
            display: flex;
            justify-content: center;
            gap: 10px;
            flex-wrap: wrap;
        }}

        .filter-button {{
            padding: 10px 20px;
            border: 2px solid #007bff;
            background: white;
            color: #007bff;
            border-radius: 25px;
            cursor: pointer;
            font-weight: 500;
            transition: all 0.3s ease;
            user-select: none;
        }}

        .filter-button.active {{
            background: #007bff;
            color: white;
            transform: scale(1.05);
        }}

        .filter-button:hover {{
            transform: translateY(-2px);
            box-shadow: 0 4px 10px rgba(0,123,255,0.3);
        }}

        .story-navigation {{
            background: white;
            padding: 20px;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
            margin-bottom: 30px;
        }}

        .story-links {{
            display: flex;
            flex-wrap: wrap;
            gap: 10px;
            justify-content: center;
        }}

        .story-link {{
            padding: 8px 15px;
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 20px;
            text-decoration: none;
            color: #495057;
            font-size: 0.9em;
            transition: all 0.2s ease;
        }}

        .story-link:hover {{
            background: #e9ecef;
            transform: translateY(-1px);
        }}

        .text-container {{
            background: white;
            padding: 40px;
            border-radius: 15px;
            box-shadow: 0 4px 20px rgba(0,0,0,0.1);
            font-size: 1.1em;
            line-height: 1.8;
            white-space: pre-wrap;
            word-wrap: break-word;
        }}

        .story-title {{
            font-size: 1.5em;
            font-weight: bold;
            text-align: center;
            margin: 40px 0 20px 0;
            color: #2c3e50;
            border-bottom: 2px solid #3498db;
            padding-bottom: 10px;
            /* Add scroll-margin-top to prevent sticky header from covering title */
            scroll-margin-top: 100px; /* Adjust based on your sticky header height */
        }}

        .story-title-anchor {{
             display: block;
             position: relative;
             top: -90px; /* Adjust based on sticky header height */
             visibility: hidden;
        }}


        .simile-highlight {{
            padding: 2px 4px;
            border-radius: 4px;
            border-left: 3px solid;
            display: inline;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
            transition: all 0.3s ease;
            cursor: pointer;
            position: relative;
        }}

        .simile-highlight:hover {{
            transform: scale(1.02);
            box-shadow: 0 2px 8px rgba(0,0,0,0.2);
            z-index: 10;
        }}

        .tooltip {{
            position: absolute;
            bottom: 100%;
            left: 50%;
            transform: translateX(-50%);
            background: #333;
            color: white;
            padding: 8px 12px;
            border-radius: 6px;
            font-size: 0.85em;
            white-space: nowrap;
            opacity: 0;
            visibility: hidden;
            pointer-events: none;
            transition: opacity 0.3s ease, visibility 0.3s ease;
            z-index: 1000;
            min-width: 150px;
            text-align: center;
        }}

        .simile-highlight:hover .tooltip {{
            opacity: 1;
            visibility: visible;
        }}

        .tooltip::after {{
            content: '';
            position: absolute;
            top: 100%;
            left: 50%;
            transform: translateX(-50%);
            border: 5px solid transparent;
            border-top-color: #333;
        }}

        .dimmed {{
            opacity: 0.4;
            filter: grayscale(50%);
        }}

        @media (max-width: 768px) {{
            body {{
                padding: 10px;
            }}

            .header h1 {{
                font-size: 2em;
            }}

            .stats {{
                gap: 15px;
            }}

            .legend {{
                grid-template-columns: 1fr;
            }}

            .text-container {{
                padding: 20px;
            }}

            .filter-button {{
                padding: 8px 15px;
                font-size: 0.9em;
            }}

            .story-link {{
                font-size: 0.8em;
            }}
        }}
    </style>
</head>
<body>
    <div class="header">
        <h1>Dubliners: Interactive Simile Analysis</h1>
        <p>
            Explore Joyce's innovative use of similes through manual annotation and visual highlighting.
            Each simile is color-coded by stylistic complexity and theoretical category.
        </p>
        <div class="stats">
            <div class="stat-item">
                <span class="stat-number">{simile_count:,}</span>
                <span class="stat-label">Similes Identified</span>
            </div>
            <div class="stat-item">
                <span class="stat-number">{len(category_stats)}</span>
                <span class="stat-label">Categories</span>
            </div>
            <div class="stat-item">
                <span class="stat-number">{len(stories)}</span>
                <span class="stat-label">Stories</span>
            </div>
        </div>
    </div>

    <div class="controls">
        <div class="legend">"""

    # Add legend items
    sorted_categories = sorted([str(k) for k in category_stats.keys()])
    for category in sorted_categories:
        count = category_stats.get(category, 0)
        color_info = get_highlight_color(category, 0.8)

        html_content += f"""
            <div class="legend-item">
                <div class="legend-color" style="background: {color_info['background']}; border-color: {color_info['border']};"></div>
                <span class="legend-text">{color_info['name']}</span>
                <span class="legend-count">{count}</span>
            </div>"""

    html_content += f"""
        </div>

        <div class="filter-controls">
            <button class="filter-button active" data-filter="all">Show All</button>
            <button class="filter-button" data-filter="Standard">Standard Only</button>
            <button class="filter-button" data-filter="Joycean">Joycean Only</button>
            <button class="filter-button" data-filter="none">Hide All Similes</button>
        </div>
    </div>

    <div class="story-navigation">
        <div class="story-links">"""

    # Add story navigation links - use data-story attribute for JS
    story_id_map = {
        "THE SISTERS": "sisters", "AN ENCOUNTER": "an-encounter", "ARABY": "araby",
        "EVELINE": "eveline", "AFTER THE RACE": "after-race", "TWO GALLANTS": "two-gallants",
        "THE BOARDING HOUSE": "boarding-house", "A LITTLE CLOUD": "a-little-cloud",
        "COUNTERPARTS": "counterparts", "CLAY": "clay", "A PAINFUL CASE": "a-painful-case",
        "IVY DAY IN THE COMMITTEE ROOM": "ivy-day-in-committee-room", "A MOTHER": "a-mother",
        "GRACE": "grace", "THE DEAD": "dead"
    }
    for story in stories:
         story_id = story_id_map.get(story['title'], story['title'].lower().replace(' ', '-')) # Fallback ID
         html_content += f'<span class="story-link" data-story="{story_id}">{story["title"].replace("THE ", "").title()}</span>'


    html_content += f"""
        </div>
    </div>

    <div class="text-container">
        {text_with_anchors}
    </div>

    <script>
        document.addEventListener('DOMContentLoaded', function() {{
            const filterButtons = document.querySelectorAll('.filter-button');
            const similes = document.querySelectorAll('.simile-highlight');

            filterButtons.forEach(button => {{
                button.addEventListener('click', function() {{
                    const filterCategory = this.getAttribute('data-filter');

                    // Update button states
                    filterButtons.forEach(btn => btn.classList.remove('active'));
                    this.classList.add('active');

                    // Apply filter
                    similes.forEach(simile => {{
                        const simileCategory = simile.getAttribute('data-category');
                        const isJoycean = simileCategory && simileCategory.startsWith('Joycean');

                        if (filterCategory === 'all') {{
                            simile.classList.remove('dimmed');
                        }} else if (filterCategory === 'none') {{
                            simile.classList.add('dimmed');
                        }} else if (filterCategory === 'Joycean' && isJoycean) {{
                            simile.classList.remove('dimmed');
                        }} else if (simileCategory === filterCategory) {{
                            simile.classList.remove('dimmed');
                        }} else {{
                            simile.classList.add('dimmed');
                        }}
                    }});
                }});
            }});

            // Smooth scrolling for story navigation
            document.querySelectorAll('.story-link').forEach(link => {{
                link.addEventListener('click', function(e) {{
                    e.preventDefault();
                    const targetId = this.getAttribute('data-story');
                    const targetElement = document.getElementById(targetId);
                    if (targetElement) {{
                        targetElement.scrollIntoView({{ behavior: 'smooth', block: 'start' }});
                    }} else {{
                         console.warn('Target element not found:', targetId);
                    }}
                }});
            }});
        }});
    </script>
</body>
</html>"""

    return html_content

def create_and_save_visualization():
    """Main function to create and save the Dubliners simile visualization."""
    print("Creating Dubliners Interactive Simile Text Visualization...")
    print("=" * 60)

    # Use only manually annotated similes
    dubliners_data = corpus_dataset[
        corpus_dataset['Dataset_Source'] == 'Manual_Annotation'
    ].copy()

    print(f"Using {len(dubliners_data)} manually annotated similes")

    if len(dubliners_data) == 0:
        print("No manual annotations found - using all non-BNC data")
        dubliners_data = corpus_dataset[
            corpus_dataset['Original_Dataset'] != 'bnc'
        ].copy()
        print(f"Using {len(dubliners_data)} similes from non-BNC sources")

    # Download the full text
    full_text = download_dubliners_text()
    if not full_text:
        print("Failed to download text")
        return None

    # Match similes to text positions
    matched_similes = match_similes_to_text(dubliners_data, full_text)

    if not matched_similes:
        print("No similes could be matched to the text")
        return None

    # Create the interactive HTML
    html_content = create_interactive_html(full_text, matched_similes)

    # Save to file
    output_file = "dubliners_simile_highlighting_final.html"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(html_content)

    print(f"Interactive text visualization saved to: {output_file}")

    # Print summary statistics
    print("\nVisualization Summary:")
    print(f"  Dubliners similes in dataset: {len(dubliners_data)}")
    print(f"  Successfully matched to text: {len(matched_similes)}")
    if len(dubliners_data) > 0:
        print(f"  Match rate: {len(matched_similes)/len(dubliners_data)*100:.1f}%")
    print(f"  Full text length: {len(full_text):,} characters")

    category_counts = {}
    for simile in matched_similes:
        cat = str(simile.get('category', 'Unknown')) if not pd.isna(simile.get('category')) else 'Unknown'
        category_counts[cat] = category_counts.get(cat, 0) + 1

    print("  Categories visualized:")
    sorted_categories = sorted(category_counts.items(), key=lambda item: str(item[0]))

    for category, count in sorted_categories:
        print(f"    {category}: {count} instances")

    print("\nKey improvements applied:")
    features_list = [
        "Enhanced main content extraction (excludes table of contents)",
        "Multiple fallback matching strategies for better coverage",
        "Improved word boundary detection",
        "Robust HTML text processing",
        "Clean simile highlighting without text flow disruption",
        "Better handling of overlapping matches",
        "Working story navigation links" # Added this feature
    ]
    for feature in features_list:
        print(f"  - {feature}")

    return output_file

# Execute the visualization
if __name__ == "__main__" or True:
    print("STARTING FINAL DUBLINERS VISUALIZATION")
    print("=" * 50)

    try:
        output_filename = create_and_save_visualization()

        if output_filename and os.path.exists(output_filename):
            print(f"\nSUCCESS: Final HTML file created!")
            print(f"File location: {os.path.abspath(output_filename)}")
            print(f"File size: {os.path.getsize(output_filename):,} bytes")

            # Force download in Colab/Jupyter environment
            try:
                from google.colab import files
                print("\nInitiating automatic download...")
                files.download(output_filename)
                print("Download initiated - check your browser downloads folder")
            except ImportError:
                print("\nFile saved successfully - you can:")
                print("1. Check your file browser panel")
                print("2. Download manually from the file list")

        else:
            print("ERROR: Failed to create HTML file")

    except Exception as e:
        print(f"ERROR during visualization creation: {e}")
        import traceback
        traceback.print_exc()

Simile dataset loaded: 612 records
STARTING FINAL DUBLINERS VISUALIZATION
Creating Dubliners Interactive Simile Text Visualization...
Using 194 manually annotated similes
Downloading Dubliners from Project Gutenberg...
Downloaded 377,717 characters
Matching manually annotated similes to text positions...
Total similes to match: 194
Main content starts at position 304 using pattern: \n\s*THE SISTERS\s*\n\s*\n
Successfully matched 183 out of 194 similes
Match rate: 94.3%
Matching strategies used:
  cleaned: 108
  sliding_window: 25
  exact: 20
  first_10_words: 15
  first_7_words: 7
  no_punct: 4
  first_5_words: 4
Creating interactive HTML visualization...
Inserted anchor for 'THE DEAD' at position 328185
Inserted anchor for 'GRACE' at position 279113
Inserted anchor for 'A MOTHER' at position 250549
Inserted anchor for 'IVY DAY IN THE COMMITTEE ROOM' at position 217772
Inserted anchor for 'A PAINFUL CASE' at position 196337
Inserted anchor for 'CLAY' at position 180930
Inserted anchor 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download initiated - check your browser downloads folder


In [10]:
# Check the root directory
!ls /

# Check the content directory where files are often uploaded
!ls /content/

bin			    kaggle		      opt		 sys
boot			    lib			      proc		 tmp
content			    lib32		      python-apt	 tools
cuda-keyring_1.1-1_all.deb  lib64		      python-apt.tar.xz  usr
datalab			    libx32		      root		 var
dev			    media		      run
etc			    mnt			      sbin
home			    NGC-DL-CONTAINER-LICENSE  srv
comprehensive_linguistic_analysis.csv  sample_data
