# Polarization in Congress
**Author:** Jeff Lewis  
**Description:** Provides updated versions of the classic Voteview plots of party mean DW-NOMINATE scores over time. Figures and the underlying data are available for download.

**Python conversion**: [Nick](https://github.com/peppapig450)

This notebook reproduces the [Rmarkdown](https://voteview.com/articles/party_polarization) work that creates and plots polarization data from Voteview. It downloads the NOMINATE dataset, aggregates member-year data into chamber-year polarization metrics, and produces plots similar to the original Voteview plots.

 ## Setup

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text  # Replace ggrepel functionality
from pathlib import Path

# Set the visual style for every figure in this notebook:
# a white-grid matplotlib style sheet plus seaborn's "notebook" context.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context("notebook")

%matplotlib inline

## Data Loading

We begin by loading the current DW-NOMINATE data from https://voteview.com

In [None]:
# Download the member-level NOMINATE file straight from Voteview
nom_data_url = "https://voteview.com/static/data/out/members/HSall_members.csv"
nom_data = pd.read_csv(nom_data_url)

# Preview the columns this notebook relies on to verify the download
preview_columns = [
    "congress",
    "chamber",
    "party_code",
    "state_icpsr",
    "nominate_dim1",
    "nominate_dim2",
]
nom_data[preview_columns].head()

## Data Processing

To calculate the mean location of the Northern and Southern Democratic and Republican delegations, we follow *Congressional Quarterly* in defining the "Southern states" as the 11 states of the Confederacy plus Oklahoma and Kentucky. 

Note: Democrats and Republicans are ICPSR party codes 100 and 200 respectively.

In [None]:
# State codes (state_icpsr) treated as "the South": 40-49 plus 51 and 53.
# NOTE(review): the text above describes 13 states but this list holds 12
# codes -- Tennessee looks to be missing; confirm against the ICPSR state-code
# table before relying on the North/South series.
# XXX: maybe switch to using something like MAGA vs RINO?
south = [*range(40, 50), 51, 53]

# Keep congresses after the 45th and drop the rows for presidents
after_45th = nom_data["congress"] > 45
not_president = nom_data["chamber"] != "President"
filtered_data = nom_data.loc[after_45th & not_president].copy()

# Map each congress number to a year: congress 1 -> 1789, two years apart
filtered_data["year"] = 1789 + 2 * (filtered_data["congress"] - 1)

# Show the size and the first rows of the filtered data as a sanity check
print(f"Filtered data shape: {filtered_data.shape}")
filtered_data.head()

Now we'll transform the member-year NOMINATE data into chamber-year polarization data.

Note: This loop is not optimized; for better performance, it should be vectorized:

In [None]:
# Members whose |first-dimension score| is below this cut-off count as moderates
MODERATE_CUTOFF = 0.25


def _prop_moderate(scores):
    """Return the share of non-missing scores with absolute value below the cut-off.

    Missing scores are dropped first. Comparing NaN with the cut-off yields
    False, so leaving them in would silently count unscored members as
    non-moderates and deflate the proportion. Returns NaN when no score is
    available.
    """
    scored = scores.dropna()
    return (scored.abs() < MODERATE_CUTOFF).mean()


def _party_overlap(dem_d1, rep_d1):
    """Return the share of scored Democrats and Republicans that overlap the other party.

    A Republican overlaps when scoring below the highest Democratic score; a
    Democrat overlaps when scoring above the lowest Republican score. The
    denominator counts only members with a non-missing score.
    """
    n_scored = dem_d1.count() + rep_d1.count()
    if n_scored == 0:
        return np.nan
    reps_below_top_dem = (rep_d1 < dem_d1.max()).sum()
    dems_above_bottom_rep = (dem_d1 > rep_d1.min()).sum()
    return (reps_below_top_dem + dems_above_bottom_rep) / n_scored


def _regional_means(party_members, party, south_states):
    """Return north/south mean scores on both dimensions for one party.

    Keys follow the output column names, e.g. ``north_rep_mean_d1``. The mean
    of an empty selection is NaN, so regions without members need no special
    casing.
    """
    in_south = party_members["state_icpsr"].isin(south_states)
    regions = (("north", party_members[~in_south]), ("south", party_members[in_south]))
    means = {}
    for region, members in regions:
        for dim in ("1", "2"):
            means[f"{region}_{party}_mean_d{dim}"] = members[f"nominate_dim{dim}"].mean()
    return means


# One record per (chamber, congress) in which both parties have members
polar_data = []

for (chamber, congress), group in filtered_data.groupby(["chamber", "congress"]):
    dem_group = group[group["party_code"] == 100]
    rep_group = group[group["party_code"] == 200]
    # XXX: independent?

    # Skip if either party has no members
    if dem_group.empty or rep_group.empty:
        continue

    dem_d1 = dem_group["nominate_dim1"]
    rep_d1 = rep_group["nominate_dim1"]

    record = {
        "chamber": chamber,
        "congress": congress,
        "year": 2 * (congress - 1) + 1789,
        "party_mean_diff_d1": rep_d1.mean() - dem_d1.mean(),
        "prop_moderate_d1": _prop_moderate(group["nominate_dim1"]),
        "prop_moderate_dem_d1": _prop_moderate(dem_d1),
        "prop_moderate_rep_d1": _prop_moderate(rep_d1),
        "overlap": _party_overlap(dem_d1, rep_d1),
        "chamber_mean_d1": group["nominate_dim1"].mean(),
        "chamber_mean_d2": group["nominate_dim2"].mean(),
        "dem_mean_d1": dem_d1.mean(),
        "dem_mean_d2": dem_group["nominate_dim2"].mean(),
        "rep_mean_d1": rep_d1.mean(),
        "rep_mean_d2": rep_group["nominate_dim2"].mean(),
    }
    # Republicans first, then Democrats, to keep the original column order
    record.update(_regional_means(rep_group, "rep", south))
    record.update(_regional_means(dem_group, "dem", south))
    polar_data.append(record)

# Convert to DataFrame
polar_df = pd.DataFrame.from_records(polar_data)

### Explore Polarization data

Let's examine the resulting dataset:


In [None]:
# Report the size of the chamber-year table and preview its first rows
print("Polarization dataset shape:", polar_df.shape)
display(polar_df.head())

# Write the table to data/processed two directory levels above the current
# working directory, creating the folder (and any missing parents) if needed
polarization_csv_path = Path.cwd().parent.parent / "data" / "processed"
polarization_csv_path.mkdir(parents=True, exist_ok=True)
csv_file = polarization_csv_path / "polarization_data.csv"
polar_df.to_csv(csv_file, index=False)
print("Data saved to 'polarization_data.csv'")

### Data Preparation for Visualization

We need to convert the data to long format for easier plotting:

In [None]:
# Reshape the wide chamber-year table into one row per metric
def to_long_format(df: pd.DataFrame) -> pd.DataFrame:
    """Melt ``df`` so each metric column becomes rows of (score, value).

    ``chamber``, ``year`` and ``congress`` are kept as identifier columns.
    Every other column is melted, in sorted column-name order: its name goes
    into ``score`` and its contents into ``value``.
    """
    key_columns = ['chamber', 'year', 'congress']
    metric_columns = df.columns.difference(key_columns).tolist()

    return df.melt(
        id_vars=key_columns,
        value_vars=metric_columns,
        var_name='score',
        value_name='value',
    )

# Reshape the chamber-year table to long form (one row per score)
polar_df_long = to_long_format(polar_df)

# Line colour for each plotted series
colors = {
    "REP": "#c70828",
    "DEM": "#1372ad",
    "N. DEM": "#6194F4",
    "S. DEM": "#81c4e4",
}

# Display label for each score column shown in the party-means plots
labels = {
    "dem_mean_d1": "DEM",
    "rep_mean_d1": "REP",
    "north_dem_mean_d1": "N. DEM",
    "south_dem_mean_d1": "S. DEM",
}

# Show the first rows of the long table
display(polar_df_long.head())

## Visualization Functions

### Party Means Plot Function

In [None]:
def polarized_plot(chamb: str):
    """Plot first-dimension party means over time for one chamber.

    Reads the module-level ``polar_df_long`` table, draws one line per series
    (DEM, REP, N. DEM, S. DEM), labels each line next to its last plotted
    point, and saves the figure as PNG and PDF in an ``outputs`` directory two
    levels above the current working directory.

    Parameters
    ----------
    chamb : str
        Chamber name exactly as stored in the ``chamber`` column
        (e.g. ``"House"`` or ``"Senate"``).

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair of the created figure.

    Raises
    ------
    ValueError
        If ``polar_df_long`` has no rows for ``chamb``.
    """
    # Filter data for the given chamber and scores of interest
    plot_scores = ("dem_mean_d1", "rep_mean_d1", "north_dem_mean_d1", "south_dem_mean_d1")
    plot_data = polar_df_long[
        (polar_df_long["chamber"] == chamb)
        & (polar_df_long["score"].isin(plot_scores))
    ].copy()

    # Fail early and clearly, before any figure is created
    if plot_data.empty:
        raise ValueError(f"No polarization data found for chamber {chamb!r}")

    # Map score to party labels for readability
    plot_data["party"] = plot_data["score"].map(labels)

    fig, ax = plt.subplots(figsize=(10, 6))

    # Draw each series and label it just right of its last plotted point
    texts = []
    for party_name, party_data in plot_data.groupby("party"):
        party_name = str(party_name)
        sorted_data = party_data.sort_values("year")
        ax.plot(
            sorted_data["year"],
            sorted_data["value"],
            marker="o",
            markersize=3,
            label=party_name,
            color=colors.get(party_name),
        )

        # Anchor the label on the last point that has a value, so a missing
        # latest value does not place the text at a NaN coordinate
        drawn = sorted_data.dropna(subset=["value"])
        if drawn.empty:
            continue
        last_point = drawn.iloc[-1]
        texts.append(
            ax.text(
                last_point["year"] + 1,
                last_point["value"],
                party_name,
                color=colors.get(party_name),
                ha="left",
                va="center",
                fontsize=9,
            )
        )

    # Adjust text positions to avoid overlaps
    if texts:
        adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    # Configure the axis labels
    ax.set_xlabel("Year")
    ax.set_ylabel("Liberal-Conservative")

    # Set x-axis ticks from 1880 to the maximum year in the data, with a step of 8
    max_year = int(plot_data['year'].max())
    ax.set_xticks(list(range(1880, max_year + 8, 8)))
    # Rotate through tick_params so the labels stay formatter-driven
    ax.tick_params(axis="x", labelrotation=45)
    ax.margins(x=0.1)

    ax.grid(visible=True, linestyle='--', alpha=0.6)

    # Remove top and right borders for cleaner look
    sns.despine(ax=ax)

    fig.tight_layout()

    # Save the figure in both PNG and PDF formats
    output_dir = Path.cwd().parent.parent / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_dir / f"voteview_{chamb.lower()}_party_means.png", dpi=300)
    fig.savefig(output_dir / f"voteview_{chamb.lower()}_party_means.pdf")

    return fig, ax

### Party Difference Plot Function

In [None]:
def party_diff_plot():
    """Plot the distance between party means over time, one line per chamber.

    Reads the ``party_mean_diff_d1`` rows of the module-level
    ``polar_df_long`` table, labels each chamber's line near the left edge of
    the plot at the height of its first data point, and saves the figure as
    PNG and PDF in an ``outputs`` directory two levels above the current
    working directory.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair of the created figure.

    Raises
    ------
    ValueError
        If ``polar_df_long`` contains no ``party_mean_diff_d1`` rows.
    """
    # Filter data for the party mean difference score
    diff_data = polar_df_long[polar_df_long['score'] == 'party_mean_diff_d1']

    # Fail early and clearly, before any figure is created
    if diff_data.empty:
        raise ValueError("No 'party_mean_diff_d1' rows found in polar_df_long")

    # Create figure and axis objects
    fig, ax = plt.subplots(figsize=(10, 6))

    # Define colors for each chamber
    chamber_colors = {"House": "#c70828", "Senate": "#1372ad"}

    # Plot each chamber and remember the value at its earliest year
    first_values = {}
    for chamber_name, chamber_data in diff_data.groupby('chamber'):
        chamber_name = str(chamber_name)
        sorted_data = chamber_data.sort_values('year')
        ax.plot(sorted_data['year'], sorted_data['value'],
                marker='o', markersize=3, label=chamber_name,
                color=chamber_colors.get(chamber_name))
        first_values[chamber_name] = sorted_data['value'].iloc[0]

    # Reserve extra space on the x-axis by setting margins
    ax.margins(x=0.1)
    # Force a draw so the new margins update the x-axis limits
    fig.canvas.draw()
    # Get the leftmost x-limit (margin start)
    left_lim = ax.get_xlim()[0]
    # Determine an offset so the text isn't flush against the edge
    offset = 5

    # Annotate each chamber at the height of its first data point
    texts = [
        ax.text(left_lim + offset, first_value, chamber_name,
                color=chamber_colors.get(chamber_name),
                ha='right', va='center', fontsize=9)
        for chamber_name, first_value in first_values.items()
    ]

    # Adjust text positions to avoid overlapping annotations
    adjust_text(texts, ax=ax, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

    # Configure axis labels
    ax.set_xlabel('Year')
    ax.set_ylabel('Distance between party means')

    # Set x-axis ticks: from 1880 to the maximum year in the dataset (step 8)
    max_year = int(diff_data['year'].max())
    ax.set_xticks(list(range(1880, max_year + 8, 8)))
    # Rotate through tick_params so the labels stay formatter-driven
    ax.tick_params(axis='x', labelrotation=45)

    ax.grid(True, linestyle='--', alpha=0.6)
    ax.set_title('Liberal-Conservative Partisan Polarization by Chamber')

    # Remove top and right borders
    sns.despine(ax=ax)

    output_dir = Path.cwd().parent.parent / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)
    # Save the figure to the specified output directory in both PNG and PDF formats
    fig.savefig(output_dir / "voteview_party_mean_diff.png", dpi=300)
    fig.savefig(output_dir / "voteview_party_mean_diff.pdf")

    return fig, ax

### House Party Means Plot

In [None]:
# Generate and display House plot; house_plot holds the returned (fig, ax) pair.
# The call also writes the PNG and PDF versions of the figure to disk.
house_plot = polarized_plot("House")
plt.show()

### Senate Party Means Plot

In [None]:
# Generate and display Senate plot; senate_plot holds the returned (fig, ax) pair.
# The call also writes the PNG and PDF versions of the figure to disk.
senate_plot = polarized_plot("Senate")
plt.show()

### Party Difference Plot

In [None]:
# Generate and display party difference plot; diff_plot holds the returned
# (fig, ax) pair. The call also writes the PNG and PDF versions to disk.
diff_plot = party_diff_plot()
plt.show()