# Final Project
#### Big Data Tools - Spring 2024
#### Matt Austen
---
#### This Python Notebook does two things:
1. Prepares and aggregates data
2. Loads data into Neo4j database
---
### 1. Prepare and aggregate data
#### Load data and identify number of rows and columns

In [18]:
import pandas as pd

# Load Voteview.com data in pandas dataframes
members = pd.read_csv('/Users/mattausten/Documents/Rowan_DataAnalytics/04_Spring2024_BigDataTools/FinalProject/HSall_members.csv')  # Member Ideology
parties = pd.read_csv('/Users/mattausten/Documents/Rowan_DataAnalytics/04_Spring2024_BigDataTools/FinalProject/HSall_parties.csv')  # Congressional Parties
votes   = pd.read_csv('/Users/mattausten/Documents/Rowan_DataAnalytics/04_Spring2024_BigDataTools/FinalProject/HSall_votes.csv')      # Members' Votes

# Quick function to print number of rows and cols of pandas dataframe
def getNumRowsCols(df):
    numRows = len(df.index)
    numCols = len(df.columns)
    return numRows,numCols

# Print numbers of rows and cols
print('Member_Ideology      ',getNumRowsCols(members))
print('Congressional_Parties',getNumRowsCols(parties))
print('Members_Votes        ',getNumRowsCols(votes))

Member_Ideology       (50488, 22)
Congressional_Parties (840, 9)
Members_Votes         (25788365, 6)


#### Aggregate dataframes into one dataframe and add some new columns for analysis

In [19]:
import numpy as np

# Add 'party_name', 'party_dim1_median', 'party_dim2_median', 'party_dim1_mean', 'party_dim2_mean' columns
df = pd.merge(members, parties, on=['congress', 'party_code', 'chamber'], how='left')

# Remove rows where chamber=='President' and =='Senate'
df = df[df['chamber'] != 'President']
df = df[df['chamber'] != 'Senate']

# Aggregate 'votes' on congress, chamber, icpsr... then add to df
votes_tmp = votes.groupby(['congress', 'chamber', 'icpsr']).agg({'prob': 'mean', 'rollnumber': 'size'}).reset_index()
df = pd.merge(df, votes_tmp, on=['congress', 'chamber', 'icpsr'], how='left')
df = df.rename(columns={'prob': 'prob_nom', 'rollnumber': 'n_prob_nom'})
df = df.fillna(value=np.nan)

# Output full version to file
df1 = df
df1.to_csv('Hall_custom_all.csv', index=False)

# Output normal version to file
which_congress = [x for x in range(10, 120, 10)]
which_congress.append(117)
which_congress.append(118)
which_congress.append(1)
which_congress.sort()
df2 = df[df['congress'].isin(which_congress)]
df2.to_csv('Hall_custom.csv', index=False)

# Output smaller version to file
df3 = df[df['congress'].isin([117, 118])]
df3.to_csv('Hall_custom_small.csv', index=False)

print(' Final Dataframe (all)  ',getNumRowsCols(df1))
print('*Final Dataframe        ',getNumRowsCols(df2))
print(' Final Dataframe (small)',getNumRowsCols(df3))
df2.head(10)

 Final Dataframe (all)   (40469, 30)
*Final Dataframe         (4759, 30)
 Final Dataframe (small) (898, 30)


Unnamed: 0,congress,chamber,icpsr,state_icpsr,district_code,state_abbrev,party_code,occupancy,last_means,bioname,...,nokken_poole_dim1,nokken_poole_dim2,party_name,n_members,nominate_dim1_median,nominate_dim2_median,nominate_dim1_mean,nominate_dim2_mean,prob_nom,n_prob_nom
0,1,House,379,44,2.0,GA,4000,0.0,1.0,"BALDWIN, Abraham",...,-0.429,-0.817,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,81.644037,109.0
1,1,House,4854,44,1.0,GA,4000,0.0,1.0,"JACKSON, James",...,-0.559,-0.052,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,85.021296,109.0
2,1,House,6071,44,3.0,GA,4000,0.0,1.0,"MATHEWS, George",...,-0.413,-0.232,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,89.116514,109.0
3,1,House,1538,52,6.0,MD,5000,0.0,1.0,"CARROLL, Daniel",...,0.114,-0.779,Pro-Administration,31,0.576,0.004,0.513161,-0.009484,80.277982,109.0
4,1,House,2010,52,3.0,MD,4000,0.0,1.0,"CONTEE, Benjamin",...,-0.093,-0.411,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,81.094495,109.0
5,1,House,3430,52,5.0,MD,5000,0.0,1.0,"GALE, George",...,0.273,-0.962,Pro-Administration,31,0.576,0.004,0.513161,-0.009484,81.474312,109.0
6,1,House,8363,52,2.0,MD,4000,0.0,1.0,"SENEY, Joshua",...,0.279,0.309,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,62.282569,109.0
7,1,House,8693,52,4.0,MD,4000,0.0,1.0,"SMITH, William",...,0.391,0.285,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,62.631193,109.0
8,1,House,8983,52,1.0,MD,4000,0.0,1.0,"STONE, Michael Jenifer",...,0.039,0.12,Anti-Administration,29,0.018,0.092,-0.024379,0.141931,74.60367,109.0
9,1,House,154,3,1.0,MA,5000,0.0,1.0,"AMES, Fisher",...,0.892,0.172,Pro-Administration,31,0.576,0.004,0.513161,-0.009484,81.841667,109.0


---
### 2. Load data into Neo4j database

In [20]:
import config as cfg

# Raw data files that we've put on GitHub
url_custom_all   = 'https://raw.githubusercontent.com/mgausten8/BigDataTools_Spring2024/main/FinalProject/Hall_custom_all.csv'
url_custom       = 'https://raw.githubusercontent.com/mgausten8/BigDataTools_Spring2024/main/FinalProject/Hall_custom.csv'
url_custom_small = 'https://raw.githubusercontent.com/mgausten8/BigDataTools_Spring2024/main/FinalProject/Hall_custom_small.csv'

# File to load into database
data_path = url_custom_small

# Create MEMBER node
query1 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
CREATE (m:Member {id:toInteger(row.icpsr), name:row.bioname})
SET
m.congress = toInteger(row.congress),
m.chamber = row.chamber,
m.state = row.state_abbrev,
m.district_id = toInteger(row.district_code),
m.party = row.party_name,
m.nom_dim1 = toFloat(row.nominate_dim1),
m.nom_dim2 = toFloat(row.nominate_dim2),
m.nom_num_votes = toInteger(row.nominate_number_of_votes),
m.prob_nom = toFloat(row.prob_nom),
m.n_prob_nom = toFloat(row.n_prob_nom);''' % data_path

# Create PARTY node
query2 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
CREATE (p:Party {id:toInteger(row.party_code), name:row.party_name})
SET
p.num_members = toInteger(row.n_members),
p.nom_dim1_med = toFloat(row.nominate_dim1_median),
p.nom_dim2_med = toFloat(row.nominate_dim2_median),
p.nom_dim1_avg = toFloat(row.nominate_dim1_mean),
p.nom_dim2_avg = toFloat(row.nominate_dim2_mean);''' % data_path

# Create STATE node
query3 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
CREATE (s:State {id:toInteger(row.state_icpsr), abbrev:row.state_abbrev});''' % data_path

# Create CHAMBER node
query4 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
CREATE (c:Chamber {name:row.chamber});''' % data_path
    
# Create relationships
query5 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
MATCH (m:Member)
MATCH (p:Party)
MERGE (m)-[:MEMBER_OF]->(p);''' % data_path

query6 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
MATCH (m:Member)
MATCH (s:State)
MERGE (m)-[:REPRESENTED]->(s);''' % data_path

query7 = '''
LOAD CSV WITH HEADERS FROM '%s' AS row
WITH row
MATCH (m:Member)
MATCH (c:Chamber)
MERGE (m)-[:SERVED_IN]->(c);''' % data_path

'''
# Establish database connection and reset it
driver = cfg.getNeo4jConnection()
cfg.clearDatabase(driver)


# Execute queries
cfg.quickCypher(driver, query1, verbose=True)
cfg.quickCypher(driver, query2, verbose=True)
cfg.quickCypher(driver, query3, verbose=True)
cfg.quickCypher(driver, query4, verbose=True)
cfg.quickCypher(driver, query5, verbose=True)
print('Success!')
'''

print('MATCH (n) DETACH DELETE n')
print(query1)
print(query2)
print(query3)
print(query4)
print(query5)
print(query6)
print(query7)

MATCH (n) DETACH DELETE n

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/mgausten8/BigDataTools_Spring2024/main/FinalProject/Hall_custom_small.csv' AS row
WITH row
CREATE (m:Member {id:toInteger(row.icpsr), name:row.bioname})
SET
m.congress = toInteger(row.congress),
m.chamber = row.chamber,
m.state = row.state_abbrev,
m.district_id = toInteger(row.district_code),
m.party = row.party_name,
m.nom_dim1 = toFloat(row.nominate_dim1),
m.nom_dim2 = toFloat(row.nominate_dim2),
m.nom_num_votes = toInteger(row.nominate_number_of_votes),
m.prob_nom = toFloat(row.prob_nom),
m.n_prob_nom = toFloat(row.n_prob_nom);

LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/mgausten8/BigDataTools_Spring2024/main/FinalProject/Hall_custom_small.csv' AS row
WITH row
CREATE (p:Party {id:toInteger(row.party_code), name:row.party_name})
SET
p.num_members = toInteger(row.n_members),
p.nom_dim1_med = toFloat(row.nominate_dim1_median),
p.nom_dim2_med = toFloat(row.nominate_dim2_median),
p