In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from transformers import BertTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sia = SentimentIntensityAnalyzer()

In [13]:
# Load data
df = pd.read_csv('../data/historical_votes.csv')  # Your rollcalls CSV

# Clean
df.fillna('Unknown', inplace=True)

# Derive 'passed' from vote_result (1=Passed, 0=otherwise)
df['passed'] = df['vote_result'].apply(lambda x: 1 if x == 'Passed' else 0)

# NLP for vote_desc (as proxy for bill_text; expand later)
df['text_tokens'] = df['vote_desc'].apply(lambda x: tokenizer.encode(x, max_length=512, truncation=True)) if 'vote_desc' in df else []

# Balance dataset (use features like nominate_mid_1 as ideology proxy for party)
X = df.drop('passed', axis=1).select_dtypes(include=['number'])  # Numeric features only (e.g., yea_count, nominate_mid_1)
y = df['passed']
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

# Save
pd.concat([X_res, y_res], axis=1).to_csv('../data/processed.csv', index=False)

# Optional: If adding public_comments.csv sentiment (from earlier)
# df_comments = pd.read_csv('data/public_comments.csv')
# df_comments['sentiment'] = df_comments['comment'].apply(lambda x: sia.polarity_scores(x)['compound'])
# avg_sentiment = df_comments['sentiment'].mean()  # Add as feature to X if linking to bills
# df_comments.to_csv('data/processed_comments.csv', index=False)

# Ethical: Comments are public; averaged to anonymize; checked for bias in commenter demographics if metadata available

  df.fillna('Unknown', inplace=True)
