In [1]:
%load_ext autoreload
%autoreload

In [2]:
import os
import torch
import numpy as np
import polars as pl
from model_vae import *
from data_preparation import *
from plotting import create_latent_space_visualizations
from postprocess import process_and_visualize_relative_vectors

## Data Prep

In [3]:
# Read the prepared Colorado parquet with polars; later cells receive it as `final_data`.
prepared_parquet_path = "data/colorado_prepared.parquet"
final_data = pl.read_parquet(prepared_parquet_path)

## Model Setup

In [4]:
# Obtain the tensors and metadata that the VAE cells below consume.
save_dir = "intermediates"
load_data = True  # True: read previously saved outputs from save_dir
save_data = True  # only consulted when the data is rebuilt (load_data is False)

rebuild_from_frame = not load_data
if rebuild_from_frame:
    prepared = load_and_prepare_voter_data(final_data)
else:
    prepared = load_prepared_data(save_dir)

# Both sources are unpacked into the same five names.
input_data, target_indices, participation_mask_tensor, num_candidates_per_contest, metadata = prepared

# Persist freshly built data so that a later run can take the load path.
if rebuild_from_frame and save_data:
    save_prepared_data(
        save_dir,
        (input_data, target_indices, participation_mask_tensor, num_candidates_per_contest, metadata),
    )


# Model and training hyperparameters
seed = 42  # fixed seed so that repeated runs start from the same random state
hidden_dim = 64
latent_dim = 2
num_epochs = 30
batch_size = 256
learning_rate = 1e-3
kl_weight = 0.1

# Seed numpy and torch before the model is constructed and trained, so random draws
# made through either library's global generator repeat across Restart & Run All.
# NOTE(review): whether the project helpers use other RNG sources is not visible here.
np.random.seed(seed)
torch.manual_seed(seed)

# Race / candidate labels handed to the constrained-training, post-processing
# and visualisation calls further down.
pres_race_name = "US PRESIDENT_FEDERAL"
trump_name = "DONALD J TRUMP"
biden_name = "JOSEPH R BIDEN"

# Instantiate VoterChoiceVAE from the settings above, then place it on the
# device returned by get_device().
vae_config = {
    "num_contests": len(num_candidates_per_contest),
    "num_candidates_per_contest": num_candidates_per_contest,
    "hidden_dim": hidden_dim,
    "latent_dim": latent_dim,
}
model = VoterChoiceVAE(**vae_config)
target_device = get_device()
model = model.to(target_device)

  input_data = torch.load(os.path.join(save_dir, 'input_data.pt'))


In [5]:
# Constrained training: the presidential race and the two candidate labels are
# passed along with the metadata to both the trainer and the post-processing step.
constraint_args = (metadata, pres_race_name, trump_name, biden_name)
training_options = dict(
    num_epochs=num_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    kl_weight=kl_weight,
    constraint_weight=5.0,  # tune as needed
)

soft_model = train_voter_vae_constrained(
    model,
    input_data,
    target_indices,
    participation_mask_tensor,
    *constraint_args,
    **training_options,
)

# Post-process so that all latent dimensions end up with a consistent orientation.
soft_model = post_process_latent_space(soft_model, *constraint_args)

Using device: cuda
Applying constraint: DONALD J TRUMP (idx 11) > JOSEPH R BIDEN (idx 20)


Epoch 1/30:   0%|          | 62/12831 [00:46<2:40:59,  1.32it/s, loss=0.8818, recon=0.8817, kl=0.0010]


KeyboardInterrupt: 

In [None]:
# Save only the model parameters (the state_dict), not the module object itself.
model_state_path = 'models/voter_choice_vae_state_dict.pt'

# torch.save does not create missing parent folders, so make sure the target
# directory exists; exist_ok keeps re-runs of this cell from failing.
os.makedirs(os.path.dirname(model_state_path), exist_ok=True)
torch.save(soft_model.state_dict(), model_state_path)

In [None]:
# Generate and analyze embeddings
print("\nGenerating voter embeddings...")
voter_embeddings_df = analyze_voter_embeddings(soft_model, input_data, participation_mask_tensor, metadata)

# Display embeddings sample
print("\nSample of voter embeddings:")
print(voter_embeddings_df.head())

# Extract IRT parameters
print("\nExtracting IRT-equivalent parameters...")
discrimination_params, difficulty_params = soft_model.get_irt_parameters()

# Inspect one contest. Position 5 in race_to_idx is the contest that is shown, so
# the names and the message say "sample" rather than "first".
sample_contest_position = 5
print("\nSample of discrimination parameters for one contest:")
sample_contest = list(metadata['race_to_idx'].keys())[sample_contest_position]
sample_contest_idx = metadata['race_to_idx'][sample_contest]

# .numpy() raises for tensors that live on an accelerator or still track gradients.
# The model is moved to get_device() earlier, so detach and copy to CPU first
# (both are harmless for tensors that are already detached and on the CPU).
sample_contest_discrimination = discrimination_params[sample_contest_idx].detach().cpu().numpy()

print(f"Contest: {sample_contest}")
# NOTE(review): rows are matched to candidates by iteration order of the candidate map;
# presumably that order equals the candidate index - confirm against data_preparation.
for i, candidate in enumerate(metadata['candidate_maps'][sample_contest_idx].keys()):
    print(f"  Candidate: {candidate}")
    print(f"    Discrimination parameters: {sample_contest_discrimination[i]}")

In [None]:
# Look up the reference race and candidate through the labels defined in the model
# setup cell (pres_race_name / biden_name) instead of re-typing the literals, so the
# reference cannot drift from the labels used for the training constraint.
office_idx = metadata["race_to_idx"][pres_race_name]
biden_idx = metadata["candidate_maps"][office_idx][biden_name]

# Now apply reference candidate post-processing for visualization
adjusted_params, reference_info = process_and_visualize_relative_vectors(
    soft_model,
    input_data,
    participation_mask_tensor,
    metadata,
    final_data,
    reference_candidates={office_idx: biden_idx},
    contest_filter=["PROPOSITION"],
)

## Archive

In [None]:
# Archived: latent-space visualisations for the trained model, using the same
# race and candidate labels as the training constraint.
latent_plot_args = (
    soft_model,
    input_data,
    participation_mask_tensor,
    metadata,
    final_data,
    num_candidates_per_contest,
    pres_race_name,
    trump_name,
    biden_name,
)
create_latent_space_visualizations(*latent_plot_args)

In [None]:
# Archived data preparation: rebuilds `final_data` from the raw pass2 CVR parquet files.
# NOTE(review): this rebinds `final_data` to a pandas DataFrame (via .to_pandas()), while
# the "Data Prep" cell binds the same name to a polars DataFrame - confirm which type the
# downstream helpers expect before running this cell.
# Earlier alternatives to the statewide scan (a single-county read of
# .../county_name=ADAMS/ and a read of adams_subset.parquet) are no longer used.

# Shared building blocks for the lazy queries below.
is_cast_vote = pl.col("candidate") != "UNDERVOTE"
race_label = (pl.col('office') + '_' + pl.col('district')).alias('race')
candidate_key = ['race', 'candidate']

base = pl.scan_parquet("../cvrs/data/pass2/state=COLORADO").with_columns(race_label)

# (race, candidate) pairs that have 20 or fewer non-undervote rows.
votes_per_candidate = (
    base
    .filter(is_cast_vote)
    .group_by(candidate_key)
    .agg(pl.len().alias('n'))
)
small_candidates = (
    votes_per_candidate
    .filter(pl.col('n') <= 20)
    .select(candidate_key)
    .unique()
)

# Races that still have more than one distinct candidate once the small
# candidates are anti-joined away (this is what drops uncontested races).
remaining_pairs = (
    base
    .filter(is_cast_vote)
    .join(small_candidates, on=candidate_key, how='anti')
    .select(candidate_key)
    .unique()
)
candidates_per_race = remaining_pairs.group_by('race').agg(pl.len().alias('n'))
contested_races = candidates_per_race.filter(pl.col('n') > 1).select('race')

# Every non-undervote row that belongs to a contested race, collected and converted
# to pandas. A further inner join on a sampled set of cvr_id values is disabled.
rows_in_contested_races = base.join(contested_races, on=['race'], how="inner")
final_data = (
    rows_in_contested_races
    .select(["county_name", "cvr_id", "office", "district", "candidate"])
    .filter(is_cast_vote)
    .collect()
    .to_pandas()
)

final_data