In [None]:
import pandas as pd
import numpy as np

In [None]:
import warnings
# Notebook-wide display settings: hide every warning and never truncate the
# column list when a DataFrame is rendered.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

#### indivs22

In [None]:
# Dataset description:
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Individual%20Contribution%20Data.htm
# Positional index -> column name (the file is read with header=None).
columns_campfin22_indivs22 = dict(enumerate([
    'cycle', 'fectransid', 'contribid', 'contrib_last', 'contrib_first', 'recipid', 'orgname',
    'ultorg', 'realcode', 'date', 'amount', 'street', 'city', 'state',
    'zip', 'recipcode', 'type', 'cmteid', 'otherid', 'gender', 'microfilm',
    'occupation', 'employer', 'source',
]))

# The full file is too large to load comfortably, so only the first 1,000,000 rows
# are read until a better way is found to trim it to the last cycle or two.
df_indivs22 = pd.read_csv('../../../data/open_secrets/CampaignFin22/indivs22.csv', nrows=1000000, on_bad_lines='skip', sep=',', header=None, na_values=['N/A', 'NA'], encoding='ISO-8859-1')
df_indivs22 = df_indivs22.rename(columns=columns_campfin22_indivs22)

# Remove every '|' character from string cells (the values arrive wrapped in pipes).
df_indivs22 = df_indivs22.applymap(lambda x: x.replace('|', '') if isinstance(x, str) else x)

# Other cleanup.
df_indivs22['cycle'] = df_indivs22['cycle'].apply(int)
df_indivs22['fectransid'] = df_indivs22['fectransid'].apply(int)

# Trim each date *string* to its MM/DD/YYYY part before parsing. The earlier
# `df_indivs22['date'][:9]` sliced the Series to its first nine rows, so index
# alignment on assignment turned every other row's date into NaN.
# errors='coerce' mirrors the pac_other22 cell: blank or malformed dates become NaT.
df_indivs22['date'] = pd.to_datetime(df_indivs22['date'].astype(str).str[:10], format='%m/%d/%Y', errors='coerce')

df_indivs22.head()

#### cmtes22

In [None]:
# Dataset description:
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20for%20Cmtes.htm
# Positional index -> column name (the file is read with header=None).
columns_campfin22_cmtes22 = dict(enumerate([
    'cycle', 'cmteid', 'pacshort', 'affiliate', 'ultorg', 'recipid',
    'recipcode', 'feccandid', 'party', 'primcode', 'source', 'sensitive',
    'foreign', 'active',
]))

# Load, name the columns, then remove every '|' character from string cells
# (the values arrive wrapped in pipes).
df_cmtes22 = (
    pd.read_csv('../../../data/open_secrets/CampaignFin22/cmtes22.csv', on_bad_lines='skip', sep=',', header=None, na_values=['N/A', 'NA'])
    .rename(columns=columns_campfin22_cmtes22)
    .applymap(lambda value: value.replace('|', '') if isinstance(value, str) else value)
)

# Other cleanup: these columns are converted element-wise to Python ints.
for int_col in ('cycle', 'foreign', 'active'):
    df_cmtes22[int_col] = df_cmtes22[int_col].apply(int)

df_cmtes22.head()

#### cmtes527

In [None]:
# Dataset description:
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20527%20Cmtes.htm
# Positional index -> column name (the file is read with header=None).
columns_cmtes527 = dict(enumerate([
    'cycle', 'rpt', 'ein', 'crp527name', 'affiliate', 'ultorg',
    'recipcode', 'cmteid', 'cid', 'eccmteid', 'party',
    'primcode', 'source', 'ffreq', 'ctype', 'csource', 'viewpt',
    'comments', 'state',
]))

# Load, name the columns, then remove every '|' character from string cells
# (the values arrive wrapped in pipes).
df_cmtes527 = (
    pd.read_csv('../../../data/open_secrets/527/cmtes527.csv', on_bad_lines='skip', sep=',', header=None, na_values=['N/A', 'NA'])
    .rename(columns=columns_cmtes527)
    .applymap(lambda value: value.replace('|', '') if isinstance(value, str) else value)
)

# Other cleanup: these columns are converted element-wise to Python ints.
for int_col in ('cycle', 'ein'):
    df_cmtes527[int_col] = df_cmtes527[int_col].apply(int)

df_cmtes527.head()

#### cands22

In [None]:
# Dataset description:
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20Candidates%20Data.htm
# Positional index -> column name (the file is read with header=None).
columns_campfin22_cands22 = dict(enumerate([
    'cycle', 'feccandid', 'cid', 'firstlastp', 'party', 'distidrunfor',
    'distidcurr', 'currcand', 'cyclecand', 'crpico', 'recipcode',
    'nopacs',
]))

# Load, name the columns, then remove every '|' character from string cells
# (the values arrive wrapped in pipes).
df_cands22 = (
    pd.read_csv('../../../data/open_secrets/CampaignFin22/cands22.csv', on_bad_lines='skip', sep=',', header=None, na_values=['N/A', 'NA'])
    .rename(columns=columns_campfin22_cands22)
    .applymap(lambda value: value.replace('|', '') if isinstance(value, str) else value)
)

# Other cleanup: convert the cycle element-wise to a Python int.
df_cands22['cycle'] = df_cands22['cycle'].apply(int)

df_cands22.head()

#### pac_other22

In [None]:
len(df_pac_other22)

In [None]:
# Dataset description:
# https://www.opensecrets.org/resources/datadictionary/Data%20Dictionary%20PAC%20to%20PAC%20Data.htm
columns_campfin22_pac_other22 = ['cycle', 'fecrecno', 'filerid', 'donorcmte', 'contriblendtrans', 'city', 'state', 
                            'zip', 'fecoccemp', 'primcode', 'date', 'amount', 'recipid', 'party', 'otherid',
                            'recipcode', 'recipprimcode', 'amend', 'report', 'pg', 'microfilm', 'type',
                            'realcode', 'source']

columns_campfin22_pac_other22 = dict(enumerate(columns_campfin22_pac_other22))

df_pac_other22 = pd.read_csv('../../../data/open_secrets/CampaignFin22/pac_other22.csv', on_bad_lines='skip', sep=',', header=None, na_values=['N/A', 'NA'])
df_pac_other22 = df_pac_other22.rename(columns=columns_campfin22_pac_other22)

# Get rid of the pipes on both sides of the data.
df_pac_other22 = df_pac_other22.applymap(lambda x: x.replace('|', '') if type(x) == str else x)

# Other cleanup.
df_pac_other22['cycle'] = df_pac_other22['cycle'].apply(lambda x: int(x))
df_pac_other22['fecrecno'] = df_pac_other22['fecrecno'].apply(lambda x: int(x))

df_pac_other22['date'] = pd.to_datetime(df_pac_other22['date'], format='%m/%d/%Y', errors='coerce')
df_pac_other22 = df_pac_other22.dropna(subset=['date'])

df_pac_other22['microfilm'] = df_pac_other22['microfilm'].fillna(0)
df_pac_other22['microfilm'] = df_pac_other22['microfilm'].apply(lambda x: 0 if x == '' else int(x))
                                                                                                
df_pac_other22.head()                                                                                                

### PACs to Candidates

In [None]:
# Committees that appear in both the CampaignFin22 committee table and the 527
# committee table, matched on 'cmteid'. Column names shared by both inputs get
# pandas' default '_x' / '_y' suffixes.
df_cmtes_merged = pd.merge(df_cmtes22, df_cmtes527, on='cmteid', how='inner')

# Attach candidate records through 'cid' (a column of the 527 table); the inner
# join keeps only committees whose 'cid' matches a row of df_cands22.
df_cmtes_cands = pd.merge(df_cmtes_merged, df_cands22, on='cid', how='inner')
df_cmtes_cands.head()

In [None]:
# Unique (committee, short name, candidate) triples, re-indexed from 0.
df_network = (
    df_cmtes_cands.loc[:, ['cmteid', 'pacshort', 'cid']]
    .drop_duplicates()
    .reset_index(drop=True)
)

df_network.head(100)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Undirected graph with one edge per (cid, cmteid) row of df_network.
G1 = nx.from_pandas_edgelist(df_network, 'cid', 'cmteid')

# Force-directed layout, drawn with node labels.
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G1)
g1_draw_options = dict(with_labels=True, node_size=700, node_color='lightblue', font_size=7)
nx.draw(G1, pos, **g1_draw_options)

plt.show()

### PACs to PACs

In [None]:
# Columns needed for the PAC-to-PAC network.
df_pac_other22_temp = df_pac_other22[['filerid', 'recipid', 'party']]

# Normalise party BEFORE de-duplicating: missing, empty and whitespace-only values
# all become 'X'. De-duplicating first (as before) left rows that differed only in
# how a blank party was spelled, which turned into exact duplicates afterwards.
# .assign() builds a new frame, so df_pac_other22_temp is left untouched.
df_pac_other22_network = (
    df_pac_other22_temp
    .assign(party=lambda frame: frame['party'].fillna('X').str.strip().replace('', 'X').fillna('X'))
    .drop_duplicates()
)

In [None]:
# Number of rows that repeat an earlier (filerid, recipid, party) row.
# `len(df.duplicated())` only measured the boolean mask, i.e. the total row count.
int(df_pac_other22_temp.duplicated().sum())

In [None]:
# Row count of the PAC-to-PAC network frame.
df_pac_other22_network.shape[0]

In [None]:
# df_pac_other22_network[:50]

In [None]:
# Display the PAC-to-PAC network frame (columns: filerid, recipid, party).
df_pac_other22_network

In [None]:
# df_pac_other22_network[:250]

# Keep only rows linking two distinct, identified committees:
#   * filer and recipient ids differ (no self-loops), and
#   * neither id is missing or an empty string.
# The two "id present" masks used to be evaluated and thrown away, so rows with
# blank or NaN ids were never actually removed.
is_not_self_loop = df_pac_other22_network['filerid'] != df_pac_other22_network['recipid']
has_filer_id = df_pac_other22_network['filerid'].notna() & (df_pac_other22_network['filerid'] != '')
has_recip_id = df_pac_other22_network['recipid'].notna() & (df_pac_other22_network['recipid'] != '')
df_pac_other22_network_filtered = df_pac_other22_network[is_not_self_loop & has_filer_id & has_recip_id]

df_pac_other22_network_filtered.head()

In [None]:
import matplotlib.cm as cm
import textwrap

# Wrap long labels.
def split_label(label, max_length=25):
    """Wrap ``label`` onto lines of at most ``max_length`` characters, joined by newlines."""
    wrapped_lines = textwrap.wrap(label, width=max_length)
    return "\n".join(wrapped_lines)

# Edge list built from the first 7,500 filtered rows.
# NOTE: the input frame was de-duplicated on (filerid, recipid, party), so 'weight'
# is a row count per pair, not a transfer count. TODO: weight by summed dollar amount.
df_edges = (
    df_pac_other22_network_filtered[:7500]
    .groupby(['filerid', 'recipid'])
    .size()
    .reset_index(name='weight')
)

# Directed graph, filer -> recipient, carrying the edge weight.
G2 = nx.from_pandas_edgelist(df_edges, source='filerid', target='recipid', edge_attr='weight', create_using=nx.DiGraph())
in_degrees = dict(G2.in_degree())
nodes_sorted = sorted(G2.nodes, key=in_degrees.get, reverse=True)

# Node size scales with in-degree.
node_sizes_sorted = [in_degrees[node] * 10 for node in nodes_sorted]

# Label only the 20 highest in-degree nodes, using the committee short name.
label_dict = pd.Series(df_cmtes22.pacshort.values, index=df_cmtes22.cmteid).to_dict()
top_n_nodes = nodes_sorted[:20]
top_n_labels = {
    node: split_label(label_dict[node].upper())
    for node in top_n_nodes
    # Skip ids with no committee row or a missing (non-string) short name,
    # which would otherwise fail on .upper().
    if isinstance(label_dict.get(node), str)
}

# Node colour from the party stored on each filer's rows (last row per filer wins).
# Built with vectorised lookups instead of iterrows(), and without rewriting
# df_pac_other22['filerid'] in place.
color_map = {'D': 'blue', 'R': 'red', 'G': 'green', 'I': 'brown', 'X': 'gray'}
filer_ids = df_pac_other22['filerid'].fillna('X').astype(str)
party_colors = df_pac_other22['party'].map(color_map).fillna('gray')
node_colors = dict(zip(filer_ids, party_colors))
node_color_list_sorted = [node_colors.get(str(node), 'gray') for node in nodes_sorted]

# Each edge takes the colour of its source node; width scales with weight.
edge_colors = [node_colors.get(str(source), 'gray') for source, _ in G2.edges]
edge_widths = [G2[u][v]['weight'] * 0.1 for u, v in G2.edges]

# Single seeded layout so the figure is reproducible. The earlier 200-iteration
# layout with pinned top nodes was overwritten before drawing and has been dropped.
pos = nx.spring_layout(G2, seed=42)

# Draw.
plt.figure(figsize=(10, 10))
nx.draw_networkx_edges(G2, pos, width=edge_widths, edge_color=edge_colors)
nx.draw_networkx_nodes(G2, pos, nodelist=nodes_sorted, node_color=node_color_list_sorted, node_size=node_sizes_sorted, edgecolors='white', linewidths=1)
nx.draw_networkx_labels(G2, pos, labels=top_n_labels, font_size=5, font_color="black", bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.4'))
plt.title("PAC-to-PAC network (all parties)")
plt.show()

In [None]:
# Number of nodes in the all-parties graph.
G2.number_of_nodes()

In [None]:
# nx.average_clustering is the mean of the per-node (local) clustering coefficients;
# the "global" clustering coefficient is a different quantity (nx.transitivity).
# Only the printed label is corrected; the variable name is kept so any other
# cell that reads it keeps working.
global_clustering_coefficient = nx.average_clustering(G2)
print("Average Clustering Coefficient:", global_clustering_coefficient)

In [None]:
# Average shortest-path length within the largest strongly connected component
# of G2 (the measure needs a strongly connected digraph).
largest_scc = max(
    nx.strongly_connected_components(G2),
    key=len,
)
G2_largest_scc = G2.subgraph(largest_scc)
avg_path_length = nx.average_shortest_path_length(G2_largest_scc)
avg_path_length

In [None]:
# Rows whose party value is 'D'.
df_pac_other22_network_filtered_dems = df_pac_other22_network_filtered[df_pac_other22_network_filtered['party'] == 'D']

# split_label() is defined once, in the all-parties (G2) graph cell above, and is
# reused here rather than being re-declared.

# Edge list built from the first 7,500 filtered rows.
# NOTE: the input frame was de-duplicated on (filerid, recipid, party), so 'weight'
# is a row count per pair, not a transfer count. TODO: weight by summed dollar amount.
df_edges = (
    df_pac_other22_network_filtered_dems[:7500]
    .groupby(['filerid', 'recipid'])
    .size()
    .reset_index(name='weight')
)

# Directed graph, filer -> recipient, carrying the edge weight.
G3 = nx.from_pandas_edgelist(df_edges, source='filerid', target='recipid', edge_attr='weight', create_using=nx.DiGraph())
in_degrees = dict(G3.in_degree())
nodes_sorted = sorted(G3.nodes, key=in_degrees.get, reverse=True)

# Node size scales with in-degree.
node_sizes_sorted = [in_degrees[node] * 10 for node in nodes_sorted]

# Label only the 20 highest in-degree nodes, using the committee short name.
label_dict = pd.Series(df_cmtes22.pacshort.values, index=df_cmtes22.cmteid).to_dict()
top_n_nodes = nodes_sorted[:20]
top_n_labels = {
    node: split_label(label_dict[node].upper())
    for node in top_n_nodes
    # Skip ids with no committee row or a missing (non-string) short name,
    # which would otherwise fail on .upper().
    if isinstance(label_dict.get(node), str)
}

# Node colour from the party stored on each filer's rows (last row per filer wins).
# Built with vectorised lookups instead of iterrows(), and without rewriting
# df_pac_other22['filerid'] in place.
color_map = {'D': 'blue', 'R': 'red', 'G': 'green', 'I': 'brown', 'X': 'gray'}
filer_ids = df_pac_other22['filerid'].fillna('X').astype(str)
party_colors = df_pac_other22['party'].map(color_map).fillna('gray')
node_colors = dict(zip(filer_ids, party_colors))
node_color_list_sorted = [node_colors.get(str(node), 'gray') for node in nodes_sorted]

# Each edge takes the colour of its source node; width scales with weight.
edge_colors = [node_colors.get(str(source), 'gray') for source, _ in G3.edges]
edge_widths = [G3[u][v]['weight'] * 0.1 for u, v in G3.edges]

# Single seeded layout so the figure is reproducible. The earlier 200-iteration
# layout with pinned top nodes was overwritten before drawing and has been dropped.
pos = nx.spring_layout(G3, seed=42)

# Draw.
plt.figure(figsize=(10, 10))
nx.draw_networkx_edges(G3, pos, width=edge_widths, edge_color=edge_colors)
nx.draw_networkx_nodes(G3, pos, nodelist=nodes_sorted, node_color=node_color_list_sorted, node_size=node_sizes_sorted, edgecolors='white', linewidths=1)
nx.draw_networkx_labels(G3, pos, labels=top_n_labels, font_size=5, font_color="black", bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.4'))
plt.title("PAC-to-PAC network (rows with party 'D')")
plt.show()

In [None]:
# Number of nodes in the party == 'D' graph.
G3.number_of_nodes()

In [None]:
# nx.average_clustering is the mean of the per-node (local) clustering coefficients;
# the "global" clustering coefficient is a different quantity (nx.transitivity).
# Only the printed label is corrected; the variable name is kept so any other
# cell that reads it keeps working.
global_clustering_coefficient = nx.average_clustering(G3)
print("Average Clustering Coefficient:", global_clustering_coefficient)

In [None]:
# Count the weakly connected components of G3 (edge direction ignored).
num_components = sum(1 for _ in nx.weakly_connected_components(G3))
print("Number of Weakly Connected Components:", num_components)

In [None]:
# Count the strongly connected components of G3 (edge direction respected).
num_components = sum(1 for _ in nx.strongly_connected_components(G3))
print("Number of Strongly Connected Components:", num_components)

In [None]:
# Average shortest-path length within the largest strongly connected component
# of G3 (the measure needs a strongly connected digraph).
largest_scc = max(
    nx.strongly_connected_components(G3),
    key=len,
)
G3_largest_scc = G3.subgraph(largest_scc)
avg_path_length = nx.average_shortest_path_length(G3_largest_scc)
avg_path_length

In [None]:
# Rows whose party value is 'R'.
df_pac_other22_network_filtered_repb = df_pac_other22_network_filtered[df_pac_other22_network_filtered['party'] == 'R']

# split_label() is defined once, in the all-parties (G2) graph cell above, and is
# reused here rather than being re-declared.

# Edge list built from the first 7,500 filtered rows.
# NOTE: the input frame was de-duplicated on (filerid, recipid, party), so 'weight'
# is a row count per pair, not a transfer count. TODO: weight by summed dollar amount.
df_edges = (
    df_pac_other22_network_filtered_repb[:7500]
    .groupby(['filerid', 'recipid'])
    .size()
    .reset_index(name='weight')
)

# Directed graph, filer -> recipient, carrying the edge weight.
G4 = nx.from_pandas_edgelist(df_edges, source='filerid', target='recipid', edge_attr='weight', create_using=nx.DiGraph())
in_degrees = dict(G4.in_degree())
nodes_sorted = sorted(G4.nodes, key=in_degrees.get, reverse=True)

# Node size scales with in-degree.
node_sizes_sorted = [in_degrees[node] * 10 for node in nodes_sorted]

# Label only the 20 highest in-degree nodes, using the committee short name.
label_dict = pd.Series(df_cmtes22.pacshort.values, index=df_cmtes22.cmteid).to_dict()
top_n_nodes = nodes_sorted[:20]
top_n_labels = {
    node: split_label(label_dict[node].upper())
    for node in top_n_nodes
    # Skip ids with no committee row or a missing (non-string) short name,
    # which would otherwise fail on .upper().
    if isinstance(label_dict.get(node), str)
}

# Node colour from the party stored on each filer's rows (last row per filer wins).
# Built with vectorised lookups instead of iterrows(), and without rewriting
# df_pac_other22['filerid'] in place.
color_map = {'D': 'blue', 'R': 'red', 'G': 'green', 'I': 'brown', 'X': 'gray'}
filer_ids = df_pac_other22['filerid'].fillna('X').astype(str)
party_colors = df_pac_other22['party'].map(color_map).fillna('gray')
node_colors = dict(zip(filer_ids, party_colors))
node_color_list_sorted = [node_colors.get(str(node), 'gray') for node in nodes_sorted]

# Each edge takes the colour of its source node; width scales with weight.
edge_colors = [node_colors.get(str(source), 'gray') for source, _ in G4.edges]
edge_widths = [G4[u][v]['weight'] * 0.1 for u, v in G4.edges]

# Single seeded layout so the figure is reproducible. The earlier 200-iteration
# layout with pinned top nodes was overwritten before drawing and has been dropped.
pos = nx.spring_layout(G4, seed=42)

# Draw.
plt.figure(figsize=(10, 10))
nx.draw_networkx_edges(G4, pos, width=edge_widths, edge_color=edge_colors)
nx.draw_networkx_nodes(G4, pos, nodelist=nodes_sorted, node_color=node_color_list_sorted, node_size=node_sizes_sorted, edgecolors='white', linewidths=1)
nx.draw_networkx_labels(G4, pos, labels=top_n_labels, font_size=5, font_color="black", bbox=dict(facecolor='white', edgecolor='none', boxstyle='round,pad=0.4'))
plt.title("PAC-to-PAC network (rows with party 'R')")
plt.show()

In [None]:
# Number of nodes in the party == 'R' graph.
G4.number_of_nodes()

In [None]:
# nx.average_clustering is the mean of the per-node (local) clustering coefficients;
# the "global" clustering coefficient is a different quantity (nx.transitivity).
# Only the printed label is corrected; the variable name is kept so any other
# cell that reads it keeps working.
global_clustering_coefficient = nx.average_clustering(G4)
print("Average Clustering Coefficient:", global_clustering_coefficient)

In [None]:
# Count the weakly connected components of G4 (edge direction ignored).
num_components = sum(1 for _ in nx.weakly_connected_components(G4))
print("Number of Weakly Connected Components:", num_components)

In [None]:
# Count the strongly connected components of G4 (edge direction respected).
num_components = sum(1 for _ in nx.strongly_connected_components(G4))
print("Number of Strongly Connected Components:", num_components)

In [None]:
# Average shortest-path length within the largest strongly connected component
# of G4 (the measure needs a strongly connected digraph).
largest_scc = max(
    nx.strongly_connected_components(G4),
    key=len,
)
G4_largest_scc = G4.subgraph(largest_scc)
avg_path_length = nx.average_shortest_path_length(G4_largest_scc)
avg_path_length

In [None]:
# Rows of the filtered network frame that exactly repeat an earlier row.
duplicates = df_pac_other22_network_filtered.loc[
    df_pac_other22_network_filtered.duplicated()
]
print(duplicates)

In [None]:
import itertools
import matplotlib.cm as cm

# Communities of the all-parties graph found by greedy modularity maximisation.
C = nx.community.greedy_modularity_communities(G2)

color_map = {'D': 'blue', 'R': 'red', 'G': 'green', 'I': 'brown', 'X': 'gray'}

# G2 is built from an edge list that only carries a 'weight' edge attribute, so its
# nodes have no 'party' attribute and every node used to fall back to 'X' (gray).
# Fall back to the party stored on the filer's rows in df_pac_other22 instead
# (last row per filer wins); a node attribute, if ever set, still takes precedence.
party_by_filer = dict(zip(df_pac_other22['filerid'].fillna('X').astype(str), df_pac_other22['party']))

node_colors = {}
for community in C:
    for node in community:
        party = G2.nodes[node].get('party', party_by_filer.get(str(node), 'X'))
        node_colors[node] = color_map.get(party, 'gray')

node_color_list = [node_colors.get(node, 'gray') for node in G2.nodes]
plt.figure(figsize=(10, 10))
# Seeded so the figure is reproducible between runs.
pos = nx.spring_layout(G2, k=1, scale=2, iterations=200, seed=42)
nx.draw(G2, pos, node_color=node_color_list, with_labels=False, node_size=2, edge_color='gray', width=0.15, edgecolors='white')
plt.show()

In [None]:
# Count the weakly connected components of G2 (edge direction ignored).
num_components = sum(1 for _ in nx.weakly_connected_components(G2))
print("Number of Weakly Connected Components:", num_components)

In [None]:
# Count the strongly connected components of G2 (edge direction respected).
num_components = sum(1 for _ in nx.strongly_connected_components(G2))
print("Number of Strongly Connected Components:", num_components)

In [None]:
# Average shortest-path length within the largest strongly connected component
# of G2. This repeats the computation done right after the G2 figure.
largest_scc = max(
    nx.strongly_connected_components(G2),
    key=len,
)
G2_largest_scc = G2.subgraph(largest_scc)
avg_path_length = nx.average_shortest_path_length(G2_largest_scc)
avg_path_length