In [1]:
import pandas as pd
import networkx as nx
import glob

In [2]:
file_list = glob.glob("../examples/outputs/CA_cd_2020_map/*.csv")
processed_files_prefix = "../examples/reports/CA_cd_2020_map_to_52_"

In [3]:
len(file_list)

5

In [4]:
me_df = pd.DataFrame()
file_name_list = []

# The value -1 is meant to indicate that the mega 
# parent does not exist. This is overwritten if 
# the parent is found to exist.
counts = [-1 for i in range(len(file_list))]

for file_name in file_list:
    file_name_list.append(file_name.split("/")[-1].split(".")[0])
    
    
me_df["File Name"] = file_name_list
me_df["Number Original Ancestors"] = counts
me_df["Mega Parent 0.01"] = counts
me_df["Mega Parent 0.10"] = counts
me_df["Mega Parent 0.25"] = counts
me_df["Mega Parent 0.50"] = counts
me_df["Mega Parent 0.75"] = counts
me_df["Mega Parent 1.00"] = counts

In [5]:
def add_orig_count(file_name, new_df):
    df = pd.read_csv(file_name)

    n_simulations = df["draw"].max() 
    n_districts = df["district"].max()

    graph = nx.grid_2d_graph(n_simulations, n_districts)
    graph.remove_edges_from(graph.edges())
    
    good_node_set = set()
    
    for node, attrs in graph.nodes(data=True):
        attrs["n_descendents"] = 0 
    
    for i in range(0,1):
        progenitor_vector = df[df["district"] == n_districts - i]["parent"] - 1
        for old, new in enumerate(progenitor_vector):
            graph.add_edge((old, i), (new, i + 1))
            graph.nodes[(new, i+1)]["n_descendents"] += 1
            good_node_set.add((new, i+1))
        
    for i in range(1,n_districts-1):
        progenitor_vector = df[df["district"] == n_districts - i]["parent"] - 1
        for old, new in enumerate(progenitor_vector):
            if graph.degree((old,i)) > 0:
                graph.add_edge((old, i), (new, i + 1))
                graph.nodes[(new, i+1)]["n_descendents"] += graph.nodes[(old, i)]["n_descendents"]
                good_node_set.add((new, i+1))

    good_node_list = list(good_node_set) 
   
    # So that we start from the top and work our way down 
    good_node_list.sort(key=lambda item: (-item[1], item[0]))


    runfile_name = file_name.split("/")[-1].split(".")[0]
    count = 0
    for i in range(n_simulations):
        count += graph.degree((i, n_districts - 1)) > 0
    
    new_df.loc[new_df["File Name"] == runfile_name, "Number Original Ancestors"] = count
    
    thresholds = [0.01, 0.1, 0.25, 0.5, 0.75, 1.0]

    for node in good_node_list:
        generation = n_districts - node[1]
        share = graph.nodes[node]["n_descendents"]

        for threshold in thresholds:
            if share >= threshold * n_simulations:
                column_name = f"Mega Parent {threshold:.2f}"
                new_df.loc[new_df["File Name"] == runfile_name, column_name] = generation
            else:
                break  # Break the loop if a threshold is not met

    print(f"Processed {file_name}")
    

In [6]:
for file in file_list:
    add_orig_count(file, me_df)

Processed ../examples/outputs/CA_cd_2020_map/14328277_CA_cd_2020_map_with_20_sims.csv
Processed ../examples/outputs/CA_cd_2020_map/3654992_CA_cd_2020_map_with_20_sims.csv
Processed ../examples/outputs/CA_cd_2020_map/80191041_CA_cd_2020_map_with_20_sims.csv
Processed ../examples/outputs/CA_cd_2020_map/45079254_CA_cd_2020_map_with_20_sims.csv
Processed ../examples/outputs/CA_cd_2020_map/93906742_CA_cd_2020_map_with_20_sims.csv


In [7]:
import os

file_name = f"{processed_files_prefix}{len(file_list)}_results_ancestors_and_phi.csv"

dir_name = os.path.dirname(file_name)

os.makedirs(dir_name, exist_ok=True)

me_df.to_csv(file_name, index=False)

In [8]:
me_df

Unnamed: 0,File Name,Number Original Ancestors,Mega Parent 0.01,Mega Parent 0.10,Mega Parent 0.25,Mega Parent 0.50,Mega Parent 0.75,Mega Parent 1.00
0,14328277_CA_cd_2020_map_with_20_sims,1,51,51,51,51,45,34
1,3654992_CA_cd_2020_map_with_20_sims,1,51,51,51,50,44,44
2,80191041_CA_cd_2020_map_with_20_sims,1,51,51,50,50,49,44
3,45079254_CA_cd_2020_map_with_20_sims,1,51,51,51,50,49,48
4,93906742_CA_cd_2020_map_with_20_sims,1,51,51,51,50,50,46
