# Final Project
#### Big Data Tools - Spring 2024
#### Matt Austen
---
#### Load data and identify number of rows and columns

In [1]:
import pandas as pd
import config as cfg

# Data-source switch: True reads the CSVs straight from Voteview.com,
# False reads previously downloaded copies from LOCAL_DATA_DIR.
LOAD_VOTEVIEW = True

# Each location is defined exactly once so that relocating the data
# (or pointing at a mirror) is a one-line change.
VOTEVIEW_BASE_URL = 'https://voteview.com/static/data/out'
LOCAL_DATA_DIR = '/Users/mattausten/Documents/Rowan_DataAnalytics/04_Spring2024_BigDataTools/FinalProject'

# Resolve the three input paths for the selected source
if LOAD_VOTEVIEW:
    path_members = f'{VOTEVIEW_BASE_URL}/members/HSall_members.csv'
    path_parties = f'{VOTEVIEW_BASE_URL}/parties/HSall_parties.csv'
    path_votes   = f'{VOTEVIEW_BASE_URL}/votes/HSall_votes.csv'
else:
    # Load files locally if website is unavailable
    path_members = f'{LOCAL_DATA_DIR}/HSall_members.csv'
    path_parties = f'{LOCAL_DATA_DIR}/HSall_parties.csv'
    path_votes   = f'{LOCAL_DATA_DIR}/HSall_votes.csv'

# Load the data into pandas dataframes
members = pd.read_csv(path_members)                  # Member Ideology
parties = pd.read_csv(path_parties)                  # Congressional Parties
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns on this large file.
votes   = pd.read_csv(path_votes, low_memory=False)  # Members' Votes

# Print numbers of rows and cols for each table.
# (Per the recorded output, cfg.getNumRowsCols yields a (rows, cols) pair.)
for table_label, table in (
    ('Member_Ideology      ', members),
    ('Congressional_Parties', parties),
    ('Members_Votes        ', votes),
):
    print(table_label, cfg.getNumRowsCols(table))

Member_Ideology       (50488, 22)
Congressional_Parties (840, 9)
Members_Votes         (25788365, 6)


#### Aggregate dataframes into one dataframe and add some new columns for analysis

In [None]:
import numpy as np

# Join keys shared by the merges below
party_keys  = ['congress', 'party_code', 'chamber']
member_keys = ['congress', 'chamber', 'icpsr']

# Per-member vote summary for each (congress, chamber, icpsr):
#   prob       -> mean of 'prob' over that member's vote rows
#   rollnumber -> number of vote rows in the group ('size')
# NOTE(review): 'size' counts every row, including rows where 'prob' is missing;
# if n_prob_nom is meant to count only non-missing probabilities, confirm.
votes_tmp = (
    votes
    .groupby(member_keys)
    .agg({'prob': 'mean', 'rollnumber': 'size'})
    .reset_index()
)

# Build the combined dataframe in one pipeline:
#   1. attach the party-level columns from `parties` to every member row
#   2. remove rows where chamber=='President'
#   3. attach the per-member vote summary
#   4. give the summary columns their analysis names
df = (
    members
    .merge(parties, on=party_keys, how='left')
    .loc[lambda frame: frame['chamber'] != 'President']
    .merge(votes_tmp, on=member_keys, how='left')
    .rename(columns={'prob': 'prob_nom', 'rollnumber': 'n_prob_nom'})
    .fillna(value=np.nan)
)

# Output full version to file (df1 is another name for df, not a copy)
df1 = df
df1.to_csv('HSall_custom.csv', index=False)

# Output smaller version to file: congresses 117 and 118 only
recent_congresses = df['congress'].isin([117, 118])
df2 = df[recent_congresses]
df2.to_csv('HSall_custom_small.csv', index=False)

print('Final Dataframe',cfg.getNumRowsCols(df1))
df1.head(10)

Final Dataframe (50361, 30)
