In [1]:
import os
import glob
import re
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from collections import Counter

# nltk.download('stopwords')
# ================================
# Define Classes for Speaker and Speech
# ================================
    
class Speaker:
    """
    A speaker record built from one row of a speaker-map file.

    All constructor arguments are stored unchanged as attributes of the same
    name. `speeches` starts empty and collects the speech_id values passed to
    add_speech().
    """

    def __init__(self, speakerid, last_name, first_name, chamber, state, gender, party, district, nonvoting):
        self.speakerid = speakerid
        self.last_name = last_name
        self.first_name = first_name
        self.chamber = chamber
        self.state = state
        # gender was previously accepted by the constructor but never stored,
        # so the value read from the speaker map was silently lost.
        self.gender = gender
        self.district = district
        self.party = party
        self.nonvoting = nonvoting
        self.speeches = []  # Will hold speech_id's

    def add_speech(self, speech_id):
        """Record that this speaker delivered the speech with the given id."""
        self.speeches.append(speech_id)

class Speech:
    """
    A single speech: its metadata, its text, and four similarity scores that
    start at 0.0 and are filled in by later processing.
    """

    def __init__(self, speech_id, chamber, date, word_count, text):
        # Dates are parsed from compact YYYYMMDD strings (e.g. "19810105");
        # any other layout makes strptime raise ValueError.
        self.date = datetime.strptime(date, "%Y%m%d")
        self.word_count = int(word_count)

        self.speech_id = speech_id
        self.chamber = chamber
        self.text = text

        # Score placeholders, overwritten once similarities are computed.
        self.D_sim_av = self.D_sim = 0.0
        self.R_sim_av = self.R_sim = 0.0


# ================================
# Parsing Functions
# ================================

def parse_speaker_map(filename):
    """
    Reads a pipe-delimited speaker map file whose rows have either:
      - 11 fields: speakerid|speech_id|lastname|firstname|chamber|state|gender|party|district||nonvoting
      - 10 fields: speakerid|speech_id|lastname|firstname|chamber|state|gender|party|district|nonvoting
    and returns a dict mapping speakerid -> Speaker object.

    The first line is treated as a header and skipped; a completely empty
    file yields an empty dict. Blank lines are ignored, and rows with fewer
    than 10 fields are reported on stdout and skipped. An empty district
    field is stored as "at-large". Only the first row seen for a speakerid
    supplies that Speaker's attributes; every row adds its speech_id to the
    speaker's `speeches` list.
    """
    speakers = {}
    with open(filename, 'r', encoding='latin-1') as f:
        # Skip the header line. The default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            parts = line.split('|')
            # Ensure we have at least 10 parts (positions 0-9)
            if len(parts) < 10:
                print(f"Skipping unexpected format: {line}")
                continue

            (speakerid, speech_id, lastname, firstname,
             chamber, state, gender, party, district) = parts[:9]
            if not district:
                district = "at-large"

            # In the 11-field layout an extra empty field precedes nonvoting.
            nonvoting = parts[10] if len(parts) == 11 else parts[9]

            # Create the Speaker the first time this id appears.
            if speakerid not in speakers:
                speakers[speakerid] = Speaker(speakerid, lastname, firstname, chamber, state, gender, party, district, nonvoting)
            speakers[speakerid].add_speech(speech_id)
    return speakers


def parse_speeches(filename):
    """
    Reads a speeches file with format:
    speech_id|speech
    and returns a dict mapping speech_id -> speech text.

    The first line is treated as a header and skipped; a completely empty
    file yields an empty dict. Each remaining non-blank line is split on its
    first "|" only, so the speech text itself may contain "|". A line with no
    "|" at all cannot be attributed to a speech_id; it is reported on stdout
    and skipped instead of aborting the whole file.
    (Note: every line is treated as one complete record. If your files hold
    speeches that span several lines, this parsing logic needs adjusting.)
    """
    speeches = {}
    with open(filename, 'r', encoding='utf-8') as f:
        # Skip the header line. The default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                continue
            # partition splits on the first delimiter only and never raises.
            speech_id, delimiter, speech_text = line.partition('|')
            if not delimiter:
                print(f"Skipping unexpected format: {line}")
                continue
            speeches[speech_id] = speech_text
    return speeches

def parse_description(filename):
    """
    Reads a description file with format:
    speech_id|chamber|date|number_within_file|speaker|first_name|last_name|state|gender|line_start|line_end|file|char_count|word_count
    Returns a dict mapping speech_id -> {'chamber', 'date', 'word_count'},
    with every value kept as the raw string read from the file.

    The first line is treated as a header and skipped; a completely empty
    file yields an empty dict. Blank lines are ignored, and rows with fewer
    than 14 fields are reported on stdout and skipped rather than raising
    IndexError.
    """
    descriptions = {}
    with open(filename, 'r', encoding='latin-1') as f:
        # Skip the header line. The default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split('|')
            # word_count is the 14th field (index 13), so shorter rows are unusable.
            if len(parts) < 14:
                print(f"Skipping unexpected format: {line}")
                continue
            descriptions[parts[0]] = {
                'chamber': parts[1],
                'date': parts[2],
                'word_count': parts[13],
            }
    return descriptions

def create_speech_objects(speeches_dict, descriptions):
    """
    Pair every speech text with its description record and wrap the two in a
    Speech object.

    Speeches without a description are reported and left out, as are speeches
    whose Speech construction raises. Returns a dict mapping
    speech_id -> Speech object.
    """
    built = {}
    for speech_id, text in speeches_dict.items():
        if speech_id not in descriptions:
            print(f"Warning: No description found for speech_id {speech_id}")
            continue
        meta = descriptions[speech_id]
        try:
            built[speech_id] = Speech(speech_id, meta['chamber'], meta['date'], meta['word_count'], text)
        except Exception as e:
            # Report the bad record and carry on with the remaining speeches.
            print(f"Error creating Speech object for speech_id {speech_id}: {e}")
    return built

def get_top_words(bigram_file, n):
    """
    Reads a bigram score file with format:
    phrase|score
    Returns two lists:
      - democrat_words: the n phrases with the lowest scores, lowest first
        (negative scores are associated with Democrats)
      - republican_words: the n phrases with the highest scores, highest first
        (positive scores are associated with Republicans)

    Phrases are lowercased and their spaces replaced by underscores. The
    first line is treated as a header and skipped; a completely empty file
    yields two empty lists. Each row is split on its last "|", so a phrase
    containing the delimiter is still read. Rows without a delimiter or with
    a non-numeric score are reported on stdout and skipped.
    """
    phrases = []
    with open(bigram_file, 'r', encoding='latin-1') as f:
        # Skip the header line. The default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                continue
            phrase, delimiter, score_str = line.rpartition('|')
            if not delimiter:
                print(f"Skipping unexpected format: {line}")
                continue
            try:
                score = float(score_str)
            except ValueError:
                print(f"Skipping unexpected format: {line}")
                continue
            # Convert spaces to underscores and lowercase the phrase.
            phrases.append((phrase.lower().replace(" ", "_"), score))

    def by_score(item):
        return item[1]

    # Both sorts are stable, so tied scores keep their order from the file.
    democrat_words = [phrase for phrase, _ in sorted(phrases, key=by_score)[:n]]
    republican_words = [phrase for phrase, _ in sorted(phrases, key=by_score, reverse=True)[:n]]
    return democrat_words, republican_words

# ================================
# Word2Vec Functions
# ================================

def build_corpus(speech_objects):
    """
    Turn every Speech in `speech_objects` into a list of tokens and return
    the list of those token lists, in the dict's iteration order.

    Tokenization is a plain whitespace split; swap in something like
    nltk.word_tokenize if finer tokenization is needed.
    """
    with_progress = tqdm(speech_objects.values(), desc="Building corpus")
    return [speech.text.split() for speech in with_progress]

def train_word2vec(corpus, vector_size=100, window=5, min_count=5, workers=4):
    """
    Fit a gensim Word2Vec model on `corpus` (a list of token lists) and
    return it. The keyword arguments are forwarded to Word2Vec unchanged.
    """
    hyperparameters = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    return Word2Vec(sentences=corpus, **hyperparameters)

def average_vector(model, words):
    """
    Return the element-wise mean of the vectors of those `words` that are
    present in `model.wv`.

    Words missing from the vocabulary are ignored. If none of the words is
    known, a zero vector of length `model.vector_size` is returned instead.
    """
    known_vectors = [model.wv[word] for word in words if word in model.wv]
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)


def compute_similarity_for_speech(speech, model, target_vec, bigram_transformer, p_threshold):
    """
    Score one speech against a prototype vector.

    The speech text is split on whitespace and passed through
    `bigram_transformer`; every resulting token found in `model.wv` gets a
    cosine similarity to `target_vec`.

    Parameters:
      - speech: object with a `text` attribute.
      - model: object exposing word vectors through `model.wv`.
      - target_vec: the prototype vector to compare against.
      - bigram_transformer: indexed with the token list to merge bigrams.
      - p_threshold: fraction (e.g. 0.1 for the top decile) of the
        highest-scoring tokens used for the thresholded average.

    Returns (raw_avg, thresh_avg, sim_scores):
      - raw_avg: mean similarity over all in-vocabulary tokens.
      - thresh_avg: mean of the top `p_threshold` fraction of the scores,
        always using at least one score.
      - sim_scores: all token similarities, sorted in descending order.
    If no token is in the vocabulary, (0.0, 0.0, []) is returned.
    """
    # Tokenize and transform to bigrams.
    tokens = bigram_transformer[speech.text.split()]

    # The prototype's norm is the same for every token, so compute it once.
    target_norm = np.linalg.norm(target_vec)

    sim_scores = []
    for token in tokens:
        if token in model.wv:
            word_vec = model.wv[token]
            # Cosine similarity; the epsilon guards against a zero norm.
            cos_sim = np.dot(word_vec, target_vec) / (np.linalg.norm(word_vec) * target_norm + 1e-9)
            sim_scores.append(cos_sim)
    if not sim_scores:
        return 0.0, 0.0, sim_scores  # Return zeros if no tokens found.

    # Sort the similarity scores in descending order.
    sim_scores.sort(reverse=True)

    # Average the top p_threshold fraction of the scores. The earlier
    # np.percentile call was removed: its result was never used, and it
    # treated the fractional p_threshold as a percentile on a 0-100 scale.
    n_top = max(1, int(len(sim_scores) * p_threshold))
    top_scores = sim_scores[:n_top]
    thresh_avg = sum(top_scores) / len(top_scores)

    # Raw average: average of all token similarity scores.
    raw_avg = np.mean(sim_scores)

    return raw_avg, thresh_avg, sim_scores
    
def compute_similarity_for_phrases(model, input_token, target_token):
    """
    Return the cosine similarity between the vectors of two tokens.

    Both tokens are looked up in `model.wv`. A small epsilon is added to the
    denominator to guard against a zero norm.
    """
    vectors = model.wv
    source_vec, goal_vec = vectors[input_token], vectors[target_token]
    norm_product = np.linalg.norm(source_vec) * np.linalg.norm(goal_vec)
    return np.dot(source_vec, goal_vec) / (norm_product + 1e-9)


# ================================
# Panel Dataset Aggregation
# ================================

def create_panel_dataframe(speech_objects, speech_to_speaker):
    """
    Constructs a DataFrame with one row per speech that includes:
      - date of speech
      - district (the speaker's state and district joined as "STATE-DISTRICT")
      - D_sim, R_sim scores
      - word_count
    The speech_to_speaker is a dict mapping speech_id -> Speaker object.

    Speeches whose speech_id is missing from speech_to_speaker are reported
    on stdout and left out. If no speech can be matched, an empty DataFrame
    with the same five columns is returned (previously this case raised
    KeyError on the 'date' column).
    """
    columns = ["date", "district", "D_sim", "R_sim", "word_count"]
    data = []
    for speech in tqdm(speech_objects.values(), desc="Creating Panel Dataframe"):
        speaker = speech_to_speaker.get(speech.speech_id)
        if speaker is None:
            print(f"Warning: Speaker not found for speech_id {speech.speech_id}")
            continue
        data.append({
            "date": speech.date,
            # District identifier, e.g. "NY-12".
            "district": f"{speaker.state}-{speaker.district}",
            "D_sim": speech.D_sim,
            "R_sim": speech.R_sim,
            "word_count": speech.word_count
        })
    # Naming the columns explicitly keeps them present when `data` is empty.
    df = pd.DataFrame(data, columns=columns)
    df['date'] = pd.to_datetime(df['date'])
    return df

def compute_weighted_rolling_average(df, window='60D'):
    """
    For each congressional district, compute a weighted rolling average over a
    time window (default 60 days) of the D_sim and R_sim scores, using
    word_count as weights.

    `df` needs the columns district, date, D_sim, R_sim and word_count, with
    `date` holding datetimes so it can serve as the index of a time-based
    rolling window.

    Returns a dictionary mapping district -> DataFrame sorted and indexed by
    date, with the added columns D_sim_avg and R_sim_avg. A window whose
    word counts sum to zero gets NaN.

    The average is computed as rolling_sum(score * weight) / rolling_sum(weight).
    The earlier implementation looked the weights up by date label inside
    rolling.apply; when several speeches share a date, that lookup can return
    more weights than there are scores in the window, which breaks np.average.
    """
    panel_data = {}
    for district, group in tqdm(df.groupby("district"), desc="Creating Rolling Averages For Districts"):
        # Time-based rolling windows need a sorted datetime index.
        group = group.sort_values("date").set_index("date")

        weights = group['word_count'].astype(float)
        weight_sum = weights.rolling(window, min_periods=1).sum()
        # Mask zero totals so the division yields NaN instead of inf.
        weight_sum = weight_sum.where(weight_sum > 0)

        for score_col, avg_col in (('D_sim', 'D_sim_avg'), ('R_sim', 'R_sim_avg')):
            weighted_sum = (group[score_col] * weights).rolling(window, min_periods=1).sum()
            group[avg_col] = weighted_sum / weight_sum

        panel_data[district] = group
    return panel_data

# ================================
# Main Processing Pipeline
# ================================

# Dictionaries to hold all objects across sessions
all_speakers = {}
all_speech_objects = {}
speech_to_speaker = {}  # Maps speech_id to Speaker object, accumulated over every session

# Speech objects grouped by session string ("097", ...). The scoring loop
# below reads each session's speeches from here. Previously it reused a single
# `session_speeches` variable left over from the loading loop, so every session
# was scored and saved using only the last session's speeches.
speeches_by_session = {}

# Bigram word accumulators (not filled by this script; kept so the names stay defined)
democrat_words_all = []
republican_words_all = []

#Global dataframe rows
rows = []

#Hyperparameter: number of top words used in analysis
n = 100

#Parameter: fraction of top-scoring tokens to average (for top decile, use 0.1).
p_threshold = 0.1

#Collection of Similarity Scores (not filled by this script; kept so the name stays defined)
sim_scores = []

#Session dataframe array
session_dfs = []

# Base directories (adjust paths if needed)
hein_dir = "C:\\Users\\Maxfield Evers\\Desktop\\Thesis\\Data\\gentzkow_work\\hein-daily\\hein-daily"
phrase_dir = "C:\\Users\\Maxfield Evers\\Desktop\\Thesis\\Data\\gentzkow_work\\phrase_partisanship"
output_dir = "C:\\Users\\Maxfield Evers\\Desktop\\Thesis\\Data\\Polarization_Data"

# Column layout shared by the per-session and combined panels. Naming the
# columns explicitly keeps them present even when a session yields no rows.
panel_columns = [
    "date", "district",
    "D_sim_raw", "D_sim_thresh",
    "R_sim_raw", "R_sim_thresh",
    "word_count", "session",
]

# Load sessions 097 to 114 (inclusive). Each file is parsed once.
for session in tqdm(range(97, 115), desc="Processing sessions"):
    session_str = f"{session:03d}"  # e.g., "097", "098", ... "114"

    speaker_map_file = f"{hein_dir}\\{session_str}_SpeakerMap.txt"
    speeches_file = f"{hein_dir}\\cleaned_speeches_{session_str}.txt"
    descr_file = f"{hein_dir}\\descr_{session_str}.txt"

    # Parse speaker map and update our global speaker dictionary
    if os.path.exists(speaker_map_file):
        speakers = parse_speaker_map(speaker_map_file)
        all_speakers.update(speakers)
        # Extend the global mapping from speech_id to speaker object
        for sp in speakers.values():
            for s_id in sp.speeches:
                speech_to_speaker[s_id] = sp
    else:
        print(f"File not found: {speaker_map_file}")

    # Parse speeches and their descriptions; then create Speech objects
    if os.path.exists(speeches_file) and os.path.exists(descr_file):
        speeches_dict = parse_speeches(speeches_file)
        descriptions = parse_description(descr_file)
        speech_objs = create_speech_objects(speeches_dict, descriptions)
        all_speech_objects.update(speech_objs)
        speeches_by_session[session_str] = speech_objs
    else:
        print(f"Missing speeches or description file for session {session_str}")


# Build the corpus from all speeches and train a Word2Vec model
corpus = build_corpus(all_speech_objects)

# Generate bigrams using gensim's Phrases
print("Generating Bigrams...")
phrases = Phrases(corpus, min_count=5, threshold=10)  # Tune min_count and threshold as needed
bigram_transformer = Phraser(phrases)
corpus_bigrams = [bigram_transformer[sentence] for sentence in corpus]
print("Bigrams Complete")

# Train the Word2Vec model
print("Training Word2VecModel...")
word2vec_model = train_word2vec(corpus_bigrams)
print("Training Complete")


# Compute similarity scores for sessions 097 to 114 (inclusive)
for session in tqdm(range(97, 115), desc="Computing Similarity Scores:"):
    session_str = f"{session:03d}"  # e.g., "097", "098", ... "114"

    session_speeches = speeches_by_session.get(session_str)
    if not session_speeches:
        print(f"No speeches loaded for session {session_str}; skipping.")
        continue

    bigram_file = f"{phrase_dir}\\partisan_phrases_{session_str}.txt"
    if not os.path.exists(bigram_file):
        print(f"File not found: {bigram_file}")
        continue

    # Get top words for this session from the bigram file.
    dem_words, rep_words = get_top_words(bigram_file, n)

    # Deduplicate (keeping first occurrences) and keep only the top n words.
    dem_words = list(dict.fromkeys(dem_words))[:n]
    rep_words = list(dict.fromkeys(rep_words))[:n]
    print(f"Top Democrat words for session {session_str}:", dem_words)
    print(f"Top Republican words for session {session_str}:", rep_words)

    # Compute session-specific prototype vectors using the global word2vec model.
    D_vector_session = average_vector(word2vec_model, dem_words)
    R_vector_session = average_vector(word2vec_model, rep_words)
    print("D_vector norm:", np.linalg.norm(D_vector_session))
    print("R_vector norm:", np.linalg.norm(R_vector_session))

    # Frequency of token-level similarity scores, rounded to 2 decimals.
    # Counting as we go avoids holding every token score of the session in memory.
    D_freq = Counter()
    R_freq = Counter()

    # Score each speech of this session against the session-specific
    # prototypes and collect its panel row.
    print(f"Computing similarity scores for session {session_str}...")
    data = []
    for speech in session_speeches.values():
        # Compute D_sim scores (for Democrat prototype)
        d_raw, d_thresh, d_tokens = compute_similarity_for_speech(speech, word2vec_model, D_vector_session, bigram_transformer, p_threshold)
        # Compute R_sim scores (for Republican prototype)
        r_raw, r_thresh, r_tokens = compute_similarity_for_speech(speech, word2vec_model, R_vector_session, bigram_transformer, p_threshold)

        # Save the raw averages into the speech object.
        speech.D_sim_av = d_raw  # Raw average similarity for D
        speech.R_sim_av = r_raw  # Raw average similarity for R
        # Save the threshold-based averages.
        speech.D_sim = d_thresh
        speech.R_sim = r_thresh

        D_freq.update(round(score, 2) for score in d_tokens)
        R_freq.update(round(score, 2) for score in r_tokens)

        # Speeches without a known speaker have no district and are left out of the panel.
        speaker = speech_to_speaker.get(speech.speech_id)
        if speaker is None:
            continue
        data.append({
            "date": speech.date,
            "district": f"{speaker.state}-{speaker.district}",
            "D_sim_raw": speech.D_sim_av,
            "D_sim_thresh": speech.D_sim,
            "R_sim_raw": speech.R_sim_av,
            "R_sim_thresh": speech.R_sim,
            "word_count": speech.word_count,
            "session": session_str
        })

    # The same rows feed the global dataframe.
    rows.extend(data)

    # Build the session panel DataFrame.
    print(f"Building .csv for session {session_str}...")
    session_df = pd.DataFrame(data, columns=panel_columns)
    session_df['date'] = pd.to_datetime(session_df['date'])
    session_dfs.append(session_df)

    # Save the session-specific panel to CSV.
    session_csv = f"{output_dir}\\panel_session_{session_str}_n{n}.csv"
    session_df.to_csv(session_csv, index=False)
    print(f"Saved session {session_str} panel data to {session_csv}")

    # Build a DataFrame for the score distribution: one row per distinct
    # rounded score, with how often it occurred for each prototype.
    unique_scores = sorted(set(D_freq).union(R_freq))
    dist_rows = [
        {
            "similarity_score": score,
            "D_freq": D_freq.get(score, 0),
            "R_freq": R_freq.get(score, 0)
        }
        for score in unique_scores
    ]
    df_scores = pd.DataFrame(dist_rows)

    # Save to a CSV file specific to this session.
    output_filename = f"{output_dir}\\similarity_score_distribution_session_{session_str}_n_{n}.csv"
    df_scores.to_csv(output_filename, index=False)
    print(f"Session {session_str} processed and saved to {output_filename}")


# Global dataframe over every processed session, built once after the loop
# instead of being rebuilt on every iteration.
df = pd.DataFrame(rows, columns=panel_columns)

# Combine all sessions into a single panel file.
if session_dfs:
    combined_panel = pd.concat(session_dfs, ignore_index=True)
    combined_csv = f"{output_dir}\\congressional_panel_all_sessions_n{n}.csv"
    combined_panel.to_csv(combined_csv, index=False)
    print(f"Saved combined panel data to {combined_csv}")
    # The earlier message named 'congressional_panel_data.csv', a file this
    # script never writes, and was printed even when nothing had been saved.
    print(f"Global panel dataset created and saved as '{combined_csv}'")
else:
    print("No session data available.")

Processing sessions: 100%|████████████████████████████████████████████████████████| 18/18 [01:59<00:00,  6.64s/it]
Building corpus: 100%|███████████████████████████████████████████████| 3866165/3866165 [05:32<00:00, 11635.25it/s]


Generating Bigrams...
Bigrams Complete
Training Word2VecModel...
Training Complete


Computing Similarity Scores::   0%|                                                        | 0/18 [00:00<?, ?it/s]

Top Democrat words for session 097: ['interest_rate', 'high_interest', 'feder_reserv', 'human_right', 'administr_propos', 'el_salvador', 'reserv_board', 'school_lunch', 'oil_compani', 'nuclear_war', 'unemploy_rate', 'minimum_benefit', 'base_mode', 'senior_citizen', 'million_american', 'secur_benefit', 'mx_missil', 'lunch_program', 'saudi_arabia', 'defens_spend', 'nurs_home', 'oil_gas', 'econom_polici', 'farm_incom', 'safeti_net', 'increas_defens', 'budget_propos', 'econom_assumpt', 'make_sens', 'great_depress', 'monetari_polici', 'billion_dollar', 'world_war', 'equal_right', 'medic_care', 'militari_budget', 'habea_corpus', 'vietnam_veteran', 'organ_crime', 'persian_gulf', 'peopl_countri', 'tax_expenditur', 'militari_aid', 'dens_pack', 'war_ii', 'nation_guard', 'militari_spend', 'billion_deficit', 'defens_budget', 'nuclear_arm', 'unemploy_benefit', 'administr_budget', 'arm_control', 'wast_fraud', 'loan_program', 'sinc_great', 'oil_industri', 'tax_break', 'farmer_home', 'hazard_wast', 'i

Computing Similarity Scores::   6%|██▌                                          | 1/18 [06:25<1:49:14, 385.57s/it]

Saved session 097 panel data to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\panel_session_097_n100.csv
Session 097 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_097.csv
Top Democrat words for session 098: ['arm_control', 'nuclear_war', 'mx_missil', 'nuclear_arm', 'year_ago', 'human_right', 'el_salvador', 'hazard_wast', 'bargain_chip', 'unemploy_rate', 'base_mode', 'million_american', 'weapon_system', 'militari_aid', 'nuclear_freez', 'mutual_verifi', 'public_health', 'first_strike', 'militari_spend', 'state_soviet', 'world_war', 'nation_debt', 'air_forc', 'arm_forc', 'econom_develop', 'central_america', 'feder_reserv', 'depart_agricultur', 'feder_deficit', 'administr_propos', 'vietnam_veteran', 'extend_benefit', 'administr_polici', 'trade_deficit', 'minuteman_silo', 'chief_staff', 'reserv_board', 'cruis_missil', 'million_ton', 'rural_america', 'educ_program', 'soviet_govern', 'star_war', 'agen

Computing Similarity Scores::  11%|█████                                        | 2/18 [11:41<1:31:50, 344.40s/it]

Session 098 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_098.csv
Top Democrat words for session 099: ['arm_control', 'star_war', 'trade_deficit', 'mx_missil', 'nuclear_war', 'aid_contra', 'year_ago', 'job_corp', 'nuclear_arm', 'salt_ii', 'defens_depart', 'world_war', 'administr_propos', 'bargain_chip', 'militari_spend', 'latin_america', 'control_agreement', 'feder_reserv', 'nation_guard', 'nuclear_test', 'joint_chief', 'construct_engag', 'chief_staff', 'rural_america', 'central_america', 'american_worker', 'presid_budget', 'weapon_system', 'state_soviet', 'militari_aid', 'civil_rico', 'budget_deficit', 'nation_defens', 'war_ii', 'peopl_countri', 'farm_incom', 'debtor_nation', 'foreign_languag', 'armenian_genocid', 'reserv_board', 'chief_justic', 'defens_contractor', 'million_american', 'distinguish_friend', 'trade_polici', 'base_mode', 'overthrow_govern', 'abm_treati', 'depart_agricultur', 'reduc_deficit', 's

Computing Similarity Scores::  17%|███████▌                                     | 3/18 [16:49<1:21:59, 327.97s/it]

Session 099 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_099.csv
Top Democrat words for session 100: ['persian_gulf', 'contra_aid', 'star_war', 'central_american', 'aid_contra', 'nuclear_wast', 'american_peopl', 'interest_rate', 'presid_budget', 'feder_reserv', 'agent_orang', 'million_american', 'coast_guard', 'nicaraguan_govern', 'depart_energi', 'support_contra', 'deficit_reduct', 'air_forc', 'account_offic', 'american_worker', 'mental_health', 'sea_turtl', 'nurs_home', 'administr_polici', 'health_servic', 'contra_war', 'high_school', 'san_francisco', 'nation_histor', 'nuclear_war', 'foreign_languag', 'assist_contra', 'secretari_interior', 'task_forc', 'first_step', 'educ_program', 'humanitarian_aid', 'militari_aid', 'congression_budget', 'inf_treati', 'debtor_nation', 'reserv_board', 'hate_crime', 'home_care', 'elector_council', 'vietnam_veteran', 'catastroph_ill', 'independ_counsel', 'secretari_gorbachev'

Computing Similarity Scores::  22%|██████████                                   | 4/18 [21:54<1:14:21, 318.70s/it]

Session 100 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_100.csv
Top Democrat words for session 101: ['save_loan', 'bill_right', 'head_start', 'deficit_reduct', 'star_war', 'depart_energi', 'clean_air', 'billion_dollar', 'el_salvador', 'persian_gulf', 'global_warm', 'million_american', 'middl_class', 'budget_deficit', 'militari_aid', 'oil_compani', 'unit_nation', 'public_health', 'veteran_affair', 'khmer_roug', 'puerto_rico', 'eastern_europ', 'presid_budget', 'energi_polici', 'women_infant', 'environment_protect', 'rural_america', 'cold_war', 'tax_break', 'particip_program', 'reduc_deficit', 'rural_hospit', 'task_forc', 'right_act', 'peopl_countri', 'rocki_flat', 'trust_fund', 'account_offic', 'oil_gas', 'central_american', 'budget_offic', 'oil_industri', 'assault_weapon', 'budget_resolut', 'san_francisco', 'state_select', 'congression_budget', 'medic_leav', 'climat_chang', 'coast_guard', 'receiv_request', 'e

Computing Similarity Scores::  28%|████████████▌                                | 5/18 [27:21<1:09:44, 321.88s/it]

Session 101 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_101.csv
Top Democrat words for session 102: ['persian_gulf', 'unemploy_benefit', 'million_american', 'head_start', 'unemploy_rate', 'unemploy_insur', 'american_worker', 'middl_class', 'peopl_countri', 'extend_unemploy', 'natur_gas', 'human_right', 'cold_war', 'billion_dollar', 'go_war', 'medic_leav', 'public_health', 'puerto_rico', 'energi_polici', 'rise_support', 'save_loan', 'world_order', 'depart_energi', 'extend_benefit', 'super_collid', 'energi_effici', 'trade_agreement', 'star_war', 'coast_guard', 'fas_fae', 'fast_track', 'foreign_polici', 'tax_break', 'across_countri', 'higher_educ', 'breast_cancer', 'deficit_reduct', 'clean_air', 'freetrad_agreement', 'job_train', 'arctic_nation', 'particip_program', 'program_expens', 'health_safeti', 'presid_budget', 'renew_energi', 'environment_protect', 'care_system', 'rocki_flat', 'feder_reserv', 'famili_med

Computing Similarity Scores::  33%|███████████████                              | 6/18 [32:22<1:02:59, 314.92s/it]

Session 102 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_102.csv
Top Democrat words for session 103: ['fas_fae', 'deficit_reduct', 'feder_reserv', 'interest_rate', 'univers_coverag', 'care_reform', 'head_start', 'crime_bill', 'peopl_countri', 'human_right', 'american_worker', 'assault_weapon', 'insur_compani', 'presid_aristid', 'year_ago', 'super_collid', 'medic_leav', 'trade_agreement', 'stimulus_packag', 'natur_resourc', 'communiti_polic', 'million_american', 'rise_support', 'vote_nafta', 'famili_medic', 'progress_caucus', 'public_health', 'budget_deficit', 'discretionari_spend', 'earn_incom', 'urg_member', 'econom_develop', 'support_rule', 'vote_right', 'econom_plan', 'minimum_wage', 'want_commend', 'care_coverag', 'support_bill', 'veteran_affair', 'econom_growth', 'friend_side', 'leav_act', 'move_forward', 'across_countri', 'armenian_genocid', 'depart_energi', 'colleagu_support', 'health_reform', 'freetra

Computing Similarity Scores::  39%|██████████████████▎                            | 7/18 [37:27<57:05, 311.44s/it]

Session 103 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_103.csv
Top Democrat words for session 104: ['minimum_wage', 'tax_break', 'nurs_home', 'deficit_reduct', 'head_start', 'student_loan', 'medicar_medicaid', 'increas_minimum', 'welfar_reform', 'budget_resolut', 'million_children', 'break_wealthi', 'peopl_countri', 'million_american', 'secur_trust', 'health_safeti', 'take_away', 'affirm_action', 'american_worker', 'job_train', 'billion_tax', 'nutrit_program', 'pay_tax', 'environment_protect', 'make_sens', 'wealthiest_american', 'poor_children', 'special_interest', 'billion_medicar', 'rural_america', 'tax_loophol', 'rais_minimum', 'campaign_financ', 'senior_citizen', 'polic_offic', 'educ_program', 'higher_educ', 'reconcili_bill', 'partialbirth_abort', 'assault_weapon', 'feder_reserv', 'school_lunch', 'insur_compani', 'go_colleg', 'govern_shutdown', 'trade_deficit', 'educ_train', 'public_servic', 'shut_gover

Computing Similarity Scores::  44%|████████████████████▉                          | 8/18 [42:31<51:29, 308.99s/it]

Session 104 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_104.csv
Top Democrat words for session 105: ['campaign_financ', 'financ_reform', 'public_school', 'minimum_wage', 'bill_right', 'tobacco_compani', 'tax_break', 'insur_compani', 'credit_union', 'budget_resolut', 'credit_card', 'trade_agreement', 'trade_deficit', 'school_construct', 'million_children', 'public_health', 'african_american', 'public_educ', 'partialbirth_abort', 'juvenil_crime', 'manag_care', 'chemic_weapon', 'public_servic', 'head_start', 'fast_track', 'tobacco_industri', 'privat_school', 'lateterm_abort', 'econom_develop', 'rural_communiti', 'emerg_room', 'patient_bill', 'gulf_war', 'across_countri', 'legal_immigr', 'class_size', 'feder_reserv', 'human_right', 'famili_plan', 'american_public', 'school_build', 'deficit_reduct', 'puerto_rico', 'independ_counsel', 'food_safeti', 'mani_year', 'nativ_american', 'famili_farmer', 'nation_guard', '

Computing Similarity Scores::  50%|███████████████████████▌                       | 9/18 [47:35<46:08, 307.65s/it]

Session 105 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_105.csv
Top Democrat words for session 106: ['prescript_drug', 'insur_compani', 'bill_right', 'minimum_wage', 'juvenil_justic', 'credit_card', 'african_american', 'tax_break', 'estat_tax', 'gun_violenc', 'human_right', 'drug_benefit', 'gun_safeti', 'senior_citizen', 'partialbirth_abort', 'secur_medicar', 'campaign_financ', 'across_countri', 'financi_servic', 'school_construct', 'patient_bill', 'budget_resolut', 'head_start', 'mental_health', 'financ_reform', 'come_floor', 'drug_coverag', 'public_school', 'nativ_american', 'emerg_room', 'rural_america', 'drug_compani', 'rise_support', 'domest_violenc', 'oil_compani', 'women_children', 'assault_weapon', 'violenc_women', 'veteran_health', 'older_peopl', 'rural_communiti', 'peopl_countri', 'increas_minimum', 'tobacco_compani', 'econom_develop', 'school_modern', 'high_school', 'million_children', 'trade_defi

Computing Similarity Scores::  56%|█████████████████████████▌                    | 10/18 [52:37<40:47, 305.95s/it]

Session 106 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_106.csv
Top Democrat words for session 107: ['prescript_drug', 'trust_fund', 'homeland_secur', 'secur_medicar', 'insur_compani', 'african_american', 'drug_benefit', 'nation_debt', 'estat_tax', 'american_peopl', 'fast_track', 'credit_card', 'secur_trust', 'tax_break', 'medicar_trust', 'senior_citizen', 'public_health', 'minimum_wage', 'airlin_secur', 'drug_compani', 'fiscal_respons', 'bill_right', 'campaign_financ', 'lost_job', 'come_floor', 'presid_budget', 'missil_defens', 'financ_reform', 'debt_limit', 'nativ_american', 'trade_deficit', 'unemploy_insur', 'drug_industri', 'debt_ceil', 'cost_prescript', 'oil_gas', 'rural_america', 'privat_social', 'airlin_industri', 'unemploy_benefit', 'across_countri', 'rural_communiti', 'peopl_countri', 'project_surplus', 'privat_insur', 'trade_polici', 'american_public', 'energi_effici', 'pay_nation', 'wildlif_refug'

Computing Similarity Scores::  61%|████████████████████████████                  | 11/18 [57:41<35:36, 305.27s/it]

Session 107 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_107.csv
Top Democrat words for session 108: ['head_start', 'homeland_secur', 'first_respond', 'war_iraq', 'american_peopl', 'prescript_drug', 'nation_debt', 'tax_break', 'insur_compani', 'guard_reserv', 'nation_guard', 'million_american', 'minimum_wage', 'senior_citizen', 'men_women', 'drug_compani', 'partialbirth_abort', 'african_american', 'assault_weapon', 'unemploy_benefit', 'militari_famili', 'presid_budget', 'nativ_american', 'support_troop', 'credit_card', 'lost_job', 'budget_deficit', 'overtim_pay', 'tax_credit', 'million_children', 'million_job', 'polic_offic', 'privat_insur', 'left_behind', 'job_lost', 'wealthiest_american', 'year_ago', 'rebuild_iraq', 'iraq_afghanistan', 'manufactur_job', 'fulli_fund', 'privat_medicar', 'children_grandchildren', 'hispan_caucus', 'veteran_health', 'econom_polici', 'middl_class', 'insur_industri', 'start_progra

Computing Similarity Scores::  67%|█████████████████████████████▎              | 12/18 [1:02:45<30:28, 304.81s/it]

Session 108 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_108.csv
Top Democrat words for session 109: ['american_peopl', 'minimum_wage', 'war_iraq', 'tax_break', 'credit_card', 'oil_compani', 'hurrican_katrina', 'african_american', 'nation_debt', 'nation_guard', 'vote_right', 'student_loan', 'budget_resolut', 'right_act', 'million_american', 'estat_tax', 'billion_dollar', 'presid_budget', 'veteran_health', 'first_respond', 'prescript_drug', 'middl_class', 'trade_agreement', 'iraq_afghanistan', 'privat_account', 'member_congress', 'privat_social', 'special_interest', 'natur_disast', 'guard_reserv', 'right_movement', 'tuskege_airmen', 'trade_deficit', 'homeland_secur', 'iraq_war', 'foreign_oil', 'american_worker', 'asian_pacif', 'big_oil', 'increas_minimum', 'civil_war', 'troop_home', 'senior_citizen', 'wealthiest_american', 'budget_deficit', 'oil_gas', 'bring_troop', 'energi_independ', 'abu_ghraib', 'budget_rec

Computing Similarity Scores::  72%|███████████████████████████████▊            | 13/18 [1:07:49<25:22, 304.47s/it]

Session 109 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_109.csv
Top Democrat words for session 110: ['dog_coalit', 'war_iraq', 'african_american', 'american_peopl', 'oil_compani', 'civil_war', 'troop_iraq', 'children_health', 'nobid_contract', 'middl_class', 'mental_health', 'million_american', 'presid_budget', 'colleagu_support', 'iraq_war', 'fiscal_respons', 'global_warm', 'homeland_secur', 'stimulus_packag', 'paygo_rule', 'support_legisl', 'interest_rate', 'renew_energi', 'bring_troop', 'nation_debt', 'troop_home', 'coast_guard', 'big_oil', 'think_import', 'veteran_health', 'iraq_afghanistan', 'al_qaeda', 'million_children', 'trade_polici', 'import_legisl', 'chang_cours', 'attorney_general', 'come_home', 'tax_break', 'public_school', 'support_bill', 'colleagu_join', 'increas_fund', 'oil_industri', 'public_health', 'billion_dollar', 'head_start', 'hiv_aid', 'cop_program', 'end_war', 'blank_check', 'brain_i

Computing Similarity Scores::  78%|██████████████████████████████████▏         | 14/18 [1:12:57<20:22, 305.60s/it]

Session 110 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_110.csv
Top Democrat words for session 111: ['insur_compani', 'african_american', 'care_reform', 'middl_class', 'colleagu_support', 'unemploy_benefit', 'unemploy_insur', 'progress_caucus', 'recoveri_act', 'colleagu_join', 'clean_energi', 'rise_support', 'econom_recoveri', 'afford_health', 'recoveri_reinvest', 'public_option', 'chair_rise', 'american_recoveri', 'tax_break', 'million_american', 'look_forward', 'support_bill', 'reinvest_act', 'join_support', 'dog_coalit', 'extend_unemploy', 'get_sick', 'coast_guard', 'credit_card', 'public_health', 'street_reform', 'support_legisl', 'doughnut_hole', 'prescript_drug', 'import_legisl', 'mental_health', 'scienc_technolog', 'side_aisl', 'status_quo', 'veteran_affair', 'health_reform', 'insur_industri', 'across_countri', 'lost_job', 'support_rule', 'year_ago', 'renew_energi', 'energi_effici', 'move_forward', 'p

Computing Similarity Scores::  83%|████████████████████████████████████▋       | 15/18 [1:18:20<15:32, 310.93s/it]

Session 111 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_111.csv
Top Democrat words for session 112: ['middl_class', 'tax_break', 'care_act', 'afford_care', 'million_american', 'insur_compani', 'student_loan', 'oil_compani', 'clean_air', 'public_health', 'african_american', 'farm_bill', 'reduc_deficit', 'big_oil', 'prescript_drug', 'end_medicar', 'deficit_reduct', 'tea_parti', 'make_america', 'interest_rate', 'tax_credit', 'air_act', 'homeland_secur', 'payrol_tax', 'unemploy_insur', 'transport_bill', 'million_peopl', 'across_countri', 'special_interest', 'clean_energi', 'unemploy_benefit', 'come_togeth', 'doughnut_hole', 'million_job', 'capitol_due', 'take_away', 'job_bill', 'across_america', 'head_start', 'credit_union', 'air_pollut', 'job_act', 'class_famili', 'wealthiest_american', 'tax_loophol', 'women_health', 'vote_right', 'safeti_net', 'air_traffic', 'natur_disast', 'invest_infrastructur', 'public_serv

Computing Similarity Scores::  89%|███████████████████████████████████████     | 16/18 [1:23:28<10:19, 309.89s/it]

Session 112 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_112.csv
Top Democrat words for session 113: ['minimum_wage', 'immigr_reform', 'climat_chang', 'student_loan', 'care_act', 'afford_care', 'middl_class', 'farm_bill', 'interest_rate', 'comprehens_immigr', 'unemploy_insur', 'govern_shutdown', 'vote_right', 'shut_govern', 'african_american', 'head_start', 'tax_break', 'rais_minimum', 'pay_bill', 'million_american', 'tea_parti', 'gun_violenc', 'san_antonio', 'public_health', 'reduc_deficit', 'right_act', 'el_paso', 'econom_develop', 'repeal_afford', 'higher_educ', 'snap_benefit', 'unemploy_benefit', 'deficit_reduct', 'tar_sand', 'homeland_secur', 'nativ_american', 'communiti_across', 'bill_floor', 'open_govern', 'prescript_drug', 'pass_budget', 'equal_pay', 'extend_unemploy', 'across_countri', 'public_servic', 'sea_level', 'immigr_system', 'year_ago', 'energi_effici', 'broken_immigr', 'come_togeth', 'assist_

Computing Similarity Scores::  94%|█████████████████████████████████████████▌  | 17/18 [1:28:33<05:08, 308.46s/it]

Session 113 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_113.csv
Top Democrat words for session 114: ['homeland_secur', 'climat_chang', 'gun_violenc', 'african_american', 'vote_right', 'public_health', 'depart_homeland', 'plan_parenthood', 'afford_care', 'puerto_rico', 'right_act', 'background_check', 'care_act', 'middl_class', 'univers_phoenix', 'student_loan', 'mass_shoot', 'reauthor_exportimport', 'minimum_wage', 'year_ago', 'immigr_reform', 'million_american', 'tax_break', 'tar_sand', 'moment_silenc', 'come_togeth', 'reserv_balanc', 'fossil_fuel', 'fund_depart', 'buy_gun', 'govern_shutdown', 'boko_haram', 'assault_weapon', 'tax_credit', 'energi_effici', 'sea_level', 'coast_guard', 'attorney_general', 'special_interest', 'clean_energi', 'opioid_addict', 'first_respond', 'fund_bill', 'right_movement', 'health_safeti', 'million_peopl', 'san_antonio', 'transport_infrastructur', 'gi_bill', 'public_servic', 'he

Computing Similarity Scores:: 100%|████████████████████████████████████████████| 18/18 [1:33:38<00:00, 312.14s/it]

Session 114 processed and saved to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\similarity_score_distribution_session_114.csv





Saved combined panel data to C:\Users\Maxfield Evers\Desktop\Thesis\Data\Polarization_Data\congressional_panel_all_sessions_n100.csv
Global panel dataset created and saved as 'congressional_panel_data.csv'
