# In [5]:
import os

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

# Shared NLP helpers, built once at module level and reused by the steps below.
# VADER analyzer: used to score the public comments.
sia = SentimentIntensityAnalyzer()
# BERT WordPiece tokenizer loaded from the 'bert-base-uncased' checkpoint:
# used to encode the vote descriptions.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load data with UTF-8 (use ../ to reach root data/)
def _read_csv_or_empty(path, columns):
    """Read *path* as a UTF-8 CSV, or return an empty frame if it is missing.

    Args:
        path: Location of the optional CSV file.
        columns: Column names given to the empty placeholder frame.

    Returns:
        The parsed DataFrame when the file exists, otherwise an empty
        DataFrame carrying *columns*.
    """
    if os.path.exists(path):
        return pd.read_csv(path, encoding='utf-8')
    return pd.DataFrame(columns=columns)


# Required inputs: a missing file raises from read_csv.
df_votes = pd.read_csv('../data/historical_votes.csv', encoding='utf-8')
df_bills = pd.read_csv('../data/sample_bill.csv', encoding='utf-8')

# Optional inputs: fall back to empty frames so later steps can test `.empty`.
df_actions = _read_csv_or_empty('../data/bill_actions.csv', ['bill_id', 'date', 'text'])
df_comments = _read_csv_or_empty('../data/public_comments.csv', ['comment'])

# Clean votes
# Fill gaps only in non-numeric columns. A frame-wide fillna('Unknown') would
# write strings into numeric columns, which the later cumsum on 'rollnumber'
# and the numeric feature matrix cannot consume; numeric gaps stay NaN here
# and are zero-filled when the feature matrix is built.
text_cols = df_votes.select_dtypes(exclude='number').columns
if len(text_cols):
    df_votes[text_cols] = df_votes[text_cols].fillna('Unknown')

# Binary target: 1 when vote_result is exactly 'Passed', otherwise 0
# (including rows whose result was missing and is now 'Unknown').
df_votes['passed'] = (df_votes['vote_result'] == 'Passed').astype(int)

# Derive time-series: Cumulative rolls
# Parse the vote dates (unparseable values become NaT) and order the frame
# chronologically before accumulating.
parsed_dates = pd.to_datetime(df_votes['date'], errors='coerce')
df_votes['date'] = parsed_dates
df_votes = df_votes.sort_values(by='date')

# Running total of 'rollnumber' within each congress, in date order.
# NOTE(review): this is a running *sum* of roll numbers, not a running count
# of roll calls — confirm that is the intended feature.
rolls_per_congress = df_votes.groupby('congress')['rollnumber']
df_votes['cumulative_rolls'] = rolls_per_congress.cumsum()

# Merge actions (by bill_id if available; else average)
if not df_actions.empty:
    df_actions['date'] = pd.to_datetime(df_actions['date'], errors='coerce')

    # Per-bill action totals attached to the bill table. Only the new column
    # is zero-filled, so missing values in other bill columns are left alone.
    action_counts = df_actions.groupby('bill_id').size().reset_index(name='action_count')
    df_bills = df_bills.merge(action_counts, on='bill_id', how='left')
    df_bills['action_count'] = df_bills['action_count'].fillna(0)

    # Add to votes (proxy on calendar date, since votes carry no bill_id).
    # A lookup by date is used instead of DataFrame.merge: merging on an
    # array key against a right-hand 'date' column suffixes the overlapping
    # column and leaves df_votes without the 'date' column that is read when
    # the processed file is written. It also avoids a frame-wide fillna(0)
    # that would overwrite NaT dates and text gaps.
    daily_actions = df_actions.groupby(df_actions['date'].dt.date)['bill_id'].count()
    vote_days = df_votes['date'].dt.date
    df_votes['action_count'] = vote_days.map(daily_actions).fillna(0).astype(int)
else:
    df_votes['action_count'] = 0

# NLP for vote_desc
# Encode each description to BERT token ids, truncated to 512 tokens.
if 'vote_desc' in df_votes.columns:
    df_votes['text_tokens'] = df_votes['vote_desc'].apply(
        lambda desc: tokenizer.encode(str(desc), max_length=512, truncation=True)
    )
else:
    # One empty token list per row: assigning a bare [] as a column raises a
    # length-mismatch ValueError whenever the frame has rows.
    df_votes['text_tokens'] = [[] for _ in range(len(df_votes))]

# Sentiment
# Every vote row receives the same value: the mean VADER compound score over
# all public comments, or 0.0 when there are no comments.
if df_comments.empty:
    df_votes['sentiment'] = 0.0
else:
    def _compound_score(text):
        """Return the VADER compound polarity of *text* (coerced to str)."""
        return sia.polarity_scores(str(text))['compound']

    df_comments['sentiment'] = df_comments['comment'].apply(_compound_score)
    avg_sentiment = df_comments['sentiment'].mean()
    df_votes['sentiment'] = avg_sentiment

# Balance dataset
numeric_cols = ['yea_count', 'nay_count', 'nominate_mid_1', 'nominate_mid_2', 'cumulative_rolls', 'action_count', 'sentiment']

# Coerce every feature to a number before zero-filling: placeholder strings
# such as 'Unknown' become NaN and then 0, so SMOTE only ever sees numbers.
X = df_votes[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
y = df_votes['passed']

# SMOTE needs at least k_neighbors + 1 samples in the minority class and two
# classes overall; shrink k for small data and skip resampling when it cannot
# run at all, instead of raising.
class_counts = y.value_counts()
minority_size = int(class_counts.min()) if len(class_counts) > 1 else 0
# Fixed seed so the synthetic rows are reproducible between runs.
smote = SMOTE(k_neighbors=max(1, min(5, minority_size - 1)), random_state=42)
if minority_size >= 2:
    X_res, y_res = smote.fit_resample(X, y)
else:
    # Nothing to oversample: pass the data through with a fresh 0..n-1 index.
    X_res, y_res = X.reset_index(drop=True), y.reset_index(drop=True)

# Save with tiled dates (repeat original dates to match resampled length)
processed_df = pd.DataFrame(X_res, columns=numeric_cols).assign(passed=y_res)

# The resampled frame has no date column of its own, so the original vote
# dates are cycled as many times as needed and cut to the resampled length.
original_dates = df_votes['date'].values
n_rows = len(processed_df)
n_repeats = n_rows // len(original_dates) + 1
tiled_dates = np.tile(original_dates, n_repeats)[:n_rows]
processed_df['date'] = pd.to_datetime(tiled_dates)

# Persist the balanced feature table for downstream use.
processed_df.to_csv('../data/processed.csv', index=False, encoding='utf-8')

# df_votes.fillna('Unknown', inplace=True)  # NOTE(review): stray indented line, apparently notebook output residue — confirm and remove
