# Emotion Visualization in Austrian Parliament Speeches

Load the emotion classifier, then load the base data and calculate emotion scores for each speech.

In [8]:
# Import necessary libraries for emotion analysis
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm # Import tqdm for progress bar
tqdm.pandas() # Enable progress_apply for pandas

# Initialize the emotion classification pipeline.
# NOTE(review): the checkpoint loaded below is an English-only emotion model, so the
# 'Text' column is presumably English (e.g. machine-translated) — TODO confirm.
# For German originals, swap in a multilingual emotion model instead, e.g.
#   model="MilaNLProc/xlm-roberta-base-sentiment-analysis" (check labels)
#   model="bert-base-multilingual-cased" (needs fine-tuning for emotions)
try:
    # Choose the device at runtime instead of hard-coding device=0, which asks for
    # CUDA device 0 even on machines that have no GPU.
    try:
        import torch
        pipeline_device = 0 if torch.cuda.is_available() else -1  # -1 means CPU
    except ImportError:
        pipeline_device = -1  # PyTorch not importable: stay on CPU

    # top_k=None returns the score of every label (replaces the deprecated
    # return_all_scores=True).
    emotion_classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
        device=pipeline_device,
    )
    print("Emotion classification pipeline loaded successfully.")
except Exception as e:
    # Best effort: later cells check for None and skip the emotion calculation.
    print(f"Error loading emotion classification pipeline: {e}")
    emotion_classifier = None

# Function to get emotion scores for a given text
def get_emotion_scores(text, classifier=None):
    """Return the six tracked emotion scores for ``text``.

    Parameters
    ----------
    text : object
        Text to classify. Anything that is not a non-blank ``str`` yields
        all-zero scores without calling the classifier.
    classifier : callable, optional
        Classifier to use instead of the notebook-level ``emotion_classifier``.
        It is called as ``classifier(text, truncation=True, max_length=512)`` and
        must return either a list of ``{'label': ..., 'score': ...}`` dicts or a
        list wrapping one such list.

    Returns
    -------
    dict
        Scores keyed by 'joy', 'sadness', 'fear', 'disgust', 'anger' and
        'surprise'. Labels outside this set are ignored and missing labels
        default to 0.0. All values are 0.0 when no classifier is available, the
        text is invalid, or classification raises (a ``RuntimeWarning`` is
        issued in that last case).
    """
    import warnings

    expected_emotions = ('joy', 'sadness', 'fear', 'disgust', 'anger', 'surprise')
    zero_scores = dict.fromkeys(expected_emotions, 0.0)

    if classifier is None:
        # Fall back to the notebook-level pipeline; tolerate it being undefined
        # (e.g. the setup cell has not been run yet).
        classifier = globals().get('emotion_classifier')

    if classifier is None or not isinstance(text, str) or not text.strip():
        return zero_scores

    try:
        results = classifier(text, truncation=True, max_length=512)
        # Depending on the library version the output for a single string is
        # either [[{...}, ...]] or [{...}, ...]; unwrap the nested form.
        if isinstance(results, list) and len(results) > 0 and isinstance(results[0], list):
            scores_list = results[0]
        else:
            scores_list = results
        scores = {item['label']: item['score'] for item in scores_list}
        return {emo: scores.get(emo, 0.0) for emo in expected_emotions}
    except Exception as e:
        # Keep the batch running, but do not hide the failure: a zero row caused
        # by an error is otherwise indistinguishable from a genuine result.
        warnings.warn(
            f"Emotion classification failed ({type(e).__name__}: {e}); returning zero scores.",
            RuntimeWarning,
            stacklevel=2,
        )
        return zero_scores

Device set to use cpu


Emotion classification pipeline loaded successfully.


In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw speeches (no emotion scores yet). The file must provide a 'Text'
# column next to the speaker/party metadata; on any handled problem base_df
# becomes an empty frame so the following steps can skip cleanly.
try:
    # low_memory=False avoids the mixed-dtype DtypeWarning on this file
    base_df = pd.read_csv('AT_NC.csv', low_memory=False)
    print(f"Loaded AT_NC.csv with shape: {base_df.shape}")
except FileNotFoundError:
    print("Error: AT_NC.csv not found. Please ensure it's in the correct directory.")
    base_df = pd.DataFrame()
except ValueError as ve:
    print(ve)
    base_df = pd.DataFrame()
else:
    # The analysis cannot run without the speech text itself
    if 'Text' not in base_df.columns:
        print("Error: 'Text' column not found in AT_NC.csv.")
        base_df = pd.DataFrame()

# --- Calculate Emotion Scores ---
# Builds final_df_extended = base_df plus one column per emotion. Falls back to a
# copy of base_df when the classifier is unavailable, and to an empty frame when
# there is no data.
final_df_extended = pd.DataFrame() # Initialize final dataframe
if not base_df.empty and emotion_classifier:
    print("Calculating emotion scores... This may take a while.")
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into the
    # literal text 'nan', which fillna('') can no longer detect and which would
    # then be classified as if it were a speech.
    base_df['Text'] = base_df['Text'].fillna('').astype(str)
    # One classifier call per row, with a progress bar
    emotion_results = base_df['Text'].progress_apply(get_emotion_scores)

    # Convert the series of dictionaries into a DataFrame aligned with base_df
    emotion_df = pd.DataFrame(emotion_results.tolist(), index=base_df.index)

    # Concatenate the emotion scores with the original DataFrame
    final_df_extended = pd.concat([base_df, emotion_df], axis=1)
    print(f"Emotion scores calculated. DataFrame shape: {final_df_extended.shape}")
elif not base_df.empty:
    print("Emotion classifier not loaded. Skipping emotion calculation.")
    # Copy so later in-place edits of final_df_extended do not leak into base_df
    final_df_extended = base_df.copy()
else:
    print("Base DataFrame is empty. Cannot calculate emotions.")

# --- Post-processing (Date parsing, etc.) ---
# Reports missing metadata columns, parses 'Date' to datetime and removes rows whose
# date cannot be parsed. Works on new frames rather than mutating in place, so a
# frame that shares memory with base_df is left untouched.
required_cols = ['Date', 'Party_orientation', 'Speaker_party', 'Party_status']
if not final_df_extended.empty:
    missing_cols = [col for col in required_cols if col not in final_df_extended.columns]
    if missing_cols:
        print(f"Warning: Missing required metadata columns: {missing_cols}")
    else:
        print("Required metadata columns found.")

    if 'Date' in final_df_extended.columns:
        original_rows = len(final_df_extended)
        # errors='coerce' maps unparseable dates to NaT, which dropna then removes
        final_df_extended = (
            final_df_extended
            .assign(Date=pd.to_datetime(final_df_extended['Date'], errors='coerce'))
            .dropna(subset=['Date'])
        )
        dropped_rows = original_rows - len(final_df_extended)
        if dropped_rows > 0:
            print(f"Dropped {dropped_rows} rows due to invalid date format.")
    else:
        print("Warning: 'Date' column not found for parsing.")

Loaded AT_NC.csv with shape: (106762, 24)
Calculating emotion scores... This may take a while.


  0%|          | 0/106762 [00:00<?, ?it/s]

KeyboardInterrupt: 

Optional data cleaning step (e.g., dropping specific rows if identified as problematic).

In [2]:
# Optional clean-up: remove one specific row label if it is present.
if final_df_extended.empty:
    print("DataFrame is empty, skipping optional row drop.")
else:
    # Index label of the row to discard (example value)
    problematic_index = 26745
    if problematic_index not in final_df_extended.index:
        print(f"Row with index {problematic_index} not found, skipping drop.")
    else:
        print(f"Dropping row with index {problematic_index}.")
        final_df_extended = final_df_extended.drop(index=problematic_index)

DataFrame is empty, skipping optional row drop.


## Visualization by Political Orientation

In [3]:
# Plot, per political orientation, the yearly share (%) of each emotion.
# Also defines `emotion_columns` and `emotion_colors`, and adds a 'Year' column to
# final_df_extended; the party-status cell below relies on all three.
if (
    not final_df_extended.empty
    and 'Date' in final_df_extended.columns  # guard: avoids KeyError when Date is absent
    and not final_df_extended['Date'].isnull().all()
):
    print("\nGenerating plot by Political Orientation...")
    # Extract year
    final_df_extended['Year'] = final_df_extended['Date'].dt.year

    # Define party orientation order
    party_order = [
        "Centre-left to left",
        "Centre-left",
        "Centre",
        "Centre-right to right",
        "Right to far-right"
    ]
    # Filter out orientations not present in the data to avoid errors
    party_order = [p for p in party_order if p in final_df_extended['Party_orientation'].unique()]

    # Keep only emotion columns that exist; coerce them to numeric and fill NaN with 0
    # for aggregation. The list is built fresh rather than removing items from a list
    # while iterating over it, which silently skips the element after each removal.
    emotion_columns = []
    for col in ['joy', 'sadness', 'fear', 'disgust', 'anger', 'surprise']:
        if col not in final_df_extended.columns:
            print(f"Warning: Emotion column '{col}' not found. Skipping.")
            continue
        final_df_extended[col] = pd.to_numeric(final_df_extended[col], errors='coerce').fillna(0)
        emotion_columns.append(col)

    if emotion_columns:
        # Group by Year and Party Orientation, then sum emotions
        emotion_scores_grouped = final_df_extended.groupby(['Year', 'Party_orientation'])[emotion_columns].sum()

        # Normalize within each year-orientation group so the emotions add up to 100 %.
        group_sums = emotion_scores_grouped.sum(axis=1)
        # Replace 0 sums with 1 to avoid NaN, the result will be 0 anyway
        emotion_scores_normalized = emotion_scores_grouped.div(group_sums.replace(0, 1), axis=0) * 100

        # Reset index for Seaborn
        df_plot = emotion_scores_normalized.reset_index()

        # Map each orientation to the sorted, comma-separated parties that carry it.
        # Missing party values are dropped first: sorting NaN together with strings raises.
        party_abbreviations = final_df_extended.groupby("Party_orientation")["Speaker_party"].unique().apply(
            lambda x: ", ".join(sorted({str(p) for p in x if pd.notna(p)}))
        )
        df_plot["Party_abbr"] = df_plot["Party_orientation"].map(party_abbreviations).fillna('')

        # Append abbreviations to party names
        df_plot["Party_label"] = df_plot["Party_orientation"] + df_plot["Party_abbr"].apply(lambda x: f" ({x})" if x else "")

        # Define emotion colors
        emotion_colors = {
            'joy': 'gold', 'sadness': 'blue', 'fear': 'purple',
            'disgust': 'green', 'anger': 'red', 'surprise': 'cyan'
        }

        # Custom layout: one panel per orientation plus one spare cell for the legend
        num_parties = len(party_order)
        ncols = 3
        nrows = (num_parties + ncols) // ncols # == ceil((num_parties + 1) / ncols)
        fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 4 * nrows), sharex=True, sharey=True)
        axes = axes.flatten()

        # The top-right cell is reserved for the legend; all others are plot slots
        legend_pos_index = ncols - 1
        plot_indices = [i for i in range(nrows * ncols) if i != legend_pos_index]

        # Same y-limit for every panel, derived from the data (loop-invariant)
        y_upper = max(10, df_plot[emotion_columns].max().max() * 1.1)

        # Plot each party
        for i, party in enumerate(party_order):
            if i >= len(plot_indices): break # Avoid index error if fewer plot slots than parties
            ax = axes[plot_indices[i]]
            party_data = df_plot[df_plot["Party_orientation"] == party]
            party_label = party_data["Party_label"].iloc[0] if not party_data.empty else party

            if party_data.empty:
                ax.set_title(f"{party_label}\n(No Data)")
                ax.set_xlabel("Year")
                ax.set_ylabel("Emotion %")
                continue

            for emotion in emotion_columns:
                if emotion in party_data.columns:
                    sns.lineplot(data=party_data, x="Year", y=emotion, color=emotion_colors.get(emotion, 'black'), ax=ax, legend=False)
                    ax.scatter(party_data['Year'], party_data[emotion], color=emotion_colors.get(emotion, 'black'), s=20)

            ax.set_title(party_label)
            ax.set_xlabel("Year")
            ax.set_ylabel("Emotion %")
            ax.set_ylim(0, y_upper)

        # Remove unused axes
        for i in range(len(party_order), len(plot_indices)):
            fig.delaxes(axes[plot_indices[i]])
        if legend_pos_index < len(axes): # Ensure legend index is valid
            fig.delaxes(axes[legend_pos_index]) # Remove axis designated for legend

        # Add a single legend in the designated empty space
        handles = [plt.Line2D([0], [0], color=color, lw=3, label=emotion) for emotion, color in emotion_colors.items() if emotion in emotion_columns]
        fig.legend(
            handles=handles,
            bbox_to_anchor=(0.98, 0.98), # Adjust anchor close to top-right of the figure
            loc="upper right",
            fontsize=12,
            title="Emotions",
            title_fontsize=14,
            frameon=True
        )

        plt.tight_layout(rect=[0, 0, 0.9, 1]) # Adjust layout to make space for legend
        plt.suptitle('Emotion Trends by Political Orientation', fontsize=16, y=1.02)
        plt.show()
    else:
        print("Skipping plot: No valid emotion columns found.")
elif final_df_extended.empty:
    print("Skipping plot: DataFrame is empty.")
else:
    print("Skipping plot: Date column is missing or empty after parsing.")

Skipping plot: DataFrame is empty.


## Visualization by Party Status (Coalition/Opposition)

In [4]:
# Plot, per party, the yearly share (%) of each emotion, shading each year by whether
# the party's speeches were mostly labelled 'Coalition' or 'Opposition'.
# Relies on `emotion_columns`, `emotion_colors` and the 'Year' column created by the
# political-orientation plotting cell.
if not final_df_extended.empty and 'Year' in final_df_extended.columns and emotion_columns:
    print("\nGenerating plot by Party Status...")
    # Exclude rows where Speaker_party is "-" (partyless)
    df_filtered = final_df_extended[final_df_extended['Speaker_party'] != '-'].copy()

    # Order parties by number of speeches (value_counts ignores missing party values)
    party_order_status = df_filtered['Speaker_party'].value_counts().index.tolist()

    # Also skip when no party name remains: plt.subplots cannot build a 0-row grid
    if df_filtered.empty or not party_order_status:
        print("Skipping plot: No data remaining after filtering out partyless speakers.")
    else:
        # Group by Year and Party, then sum emotions
        emotion_scores_grouped_party = df_filtered.groupby(['Year', 'Speaker_party'])[emotion_columns].sum()

        # Normalize within each year-party group (0 sums replaced by 1 to avoid NaN)
        group_sums_party = emotion_scores_grouped_party.sum(axis=1)
        emotion_scores_normalized_party = emotion_scores_grouped_party.div(group_sums_party.replace(0, 1), axis=0) * 100

        # Reset index for Seaborn
        df_plot_party = emotion_scores_normalized_party.reset_index()

        # Custom layout: Adjust based on number of parties
        num_parties_status = len(party_order_status)
        ncols_status = 3
        nrows_status = (num_parties_status + ncols_status - 1) // ncols_status
        fig_status, axes_status = plt.subplots(nrows_status, ncols_status, figsize=(18, 5 * nrows_status), sharey=True)
        axes_status = axes_status.flatten()

        # Common x-range for all panels (loop-invariant)
        start_year = int(df_plot_party['Year'].min())
        end_year = int(df_plot_party['Year'].max())

        # Plot each party
        for i, party in enumerate(party_order_status):
            if i >= len(axes_status): break # Should not happen if layout is correct
            ax = axes_status[i]
            party_data = df_plot_party[df_plot_party["Speaker_party"] == party]

            if party_data.empty:
                ax.set_title(f'{party}\n(No Data)')
                ax.set_xlabel("Year")
                ax.set_ylabel("Emotion %")
                continue

            # All raw rows of this party, selected once and reused below
            party_rows = df_filtered[df_filtered["Speaker_party"] == party]

            # Retrieve party orientation (most frequent value)
            party_orientation = party_rows["Party_orientation"].mode()
            orientation_label = party_orientation.iloc[0] if not party_orientation.empty else 'N/A'

            # Plot each emotion over time
            for emotion in emotion_columns:
                if emotion in party_data.columns:
                    sns.lineplot(data=party_data, x="Year", y=emotion, color=emotion_colors.get(emotion, 'black'), ax=ax, legend=False)
                    ax.scatter(party_data['Year'], party_data[emotion], color=emotion_colors.get(emotion, 'black'), s=10)

            # Faded background per year: green when 'Coalition' rows are at least as
            # frequent as 'Opposition' rows, red when 'Opposition' rows are in the majority.
            # Grouping the party's rows by year replaces re-filtering the whole frame
            # for every (party, year) pair.
            for year, statuses in party_rows.groupby('Year')['Party_status']:
                status_counts = statuses.value_counts()
                n_coalition = status_counts.get('Coalition', 0)
                n_opposition = status_counts.get('Opposition', 0)
                if 'Coalition' in status_counts and n_coalition >= n_opposition:
                    ax.axvspan(year - 0.5, year + 0.5, color='lightgreen', alpha=0.2, zorder=0)
                elif 'Opposition' in status_counts and n_opposition > n_coalition:
                    ax.axvspan(year - 0.5, year + 0.5, color='lightcoral', alpha=0.2, zorder=0)
                # Years with neither status present stay unshaded

            ax.set_title(f'{party} ({orientation_label})')
            ax.set_xlabel("Year")
            ax.set_ylabel("Emotion %")
            ax.set_ylim(0, 100) # Set fixed y-axis
            ax.set_yticks(range(0, 101, 10))
            ax.set_xlim(df_plot_party['Year'].min() - 0.5, df_plot_party['Year'].max() + 0.5)
            # Set x-ticks every 5 years
            ax.set_xticks(range(start_year, end_year + 1, 5))
            ax.tick_params(axis='x', rotation=45)

        # Remove unused axes
        for i in range(num_parties_status, len(axes_status)):
            fig_status.delaxes(axes_status[i])

        # Add legends
        emotion_handles = [plt.Line2D([0], [0], color=color, lw=3, label=emotion) for emotion, color in emotion_colors.items() if emotion in emotion_columns]
        background_handles = [
            plt.Rectangle((0, 0), 1, 1, fc='lightgreen', alpha=0.3, label='Coalition (>= Opposition)'),
            plt.Rectangle((0, 0), 1, 1, fc='lightcoral', alpha=0.3, label='Opposition (> Coalition)')
        ]

        # Figure-level legends accumulate (unlike Axes.legend, a second call does not
        # replace the first), so re-adding leg1 via add_artist would draw it twice.
        leg1 = fig_status.legend(handles=emotion_handles, loc='upper left', bbox_to_anchor=(0.01, 0.98), title="Emotion", fontsize=10, title_fontsize=12)
        fig_status.legend(handles=background_handles, loc='upper left', bbox_to_anchor=(0.15, 0.98), title="Party Status", fontsize=10, title_fontsize=12)

        plt.tight_layout(rect=[0, 0, 1, 0.95]) # Adjust layout
        fig_status.suptitle('Average Emotions Over Time by Party and Status', fontsize=18, y=0.99)
        plt.show()
else:
     print("Skipping plot by Party Status: DataFrame empty, 'Year' missing, or no emotion columns.")

Skipping plot by Party Status: DataFrame empty, 'Year' missing, or no emotion columns.


### Check Party Orientation Distribution

In [5]:
# Show how many rows fall under each political orientation, if that column is available.
if final_df_extended.empty or 'Party_orientation' not in final_df_extended.columns:
    print("\nCannot show Party Orientation counts: DataFrame empty or column missing.")
else:
    print("\nParty Orientation Value Counts:")
    print(final_df_extended.loc[:, 'Party_orientation'].value_counts())


Cannot show Party Orientation counts: DataFrame empty or column missing.
