In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import numpy as np
import h5py
import nibabel as nib
from datetime import datetime

from mfs_tools.library.utility_stuff import (
    correlate_bold, compare_mats
)
from mfs_tools.library.cifti_stuff import (
    get_brain_model_axes, get_label_axes, get_subcortical_indices
)
from mfs_tools.library.file_stuff import (
    find_infomap_path, find_wb_command_path,
    read_pajek_file, write_pajek_file, compare_pajek_data,
    load_infomap_clu_file
)


save_to = Path("/mnt/cache/pfm_python")
ml_base_path = Path("/mnt/cache/pfm_matlab")
python_cifti_path = (
    save_to /
    "sub-ME01_task-rest_concatenated_demeaned_regressed_and_smoothed-2.55_32k_fsLR.dtseries.nii"
)
matlab_cifti_path = (
    ml_base_path /
    "sub-ME01_task-rest_concatenated_demeaned_regressed_and_smoothed-2.55_32k_fsLR.dtseries.nii"
)
surface_files = {
    'lh': Path(
        "/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/fs_LR/fsaverage_LR32k"
        "/ME01.L.midthickness.32k_fs_LR.surf.gii"
    ),
    'rh': Path(
        "/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/fs_LR/fsaverage_LR32k"
        "/ME01.R.midthickness.32k_fs_LR.surf.gii"
    ),
}
python_distance_matrix_path = save_to / "dist_complete.npy"
matlab_distance_matrix_path = ml_base_path / "DistanceMatrix.mat"

wb_command_path = find_wb_command_path()

# The infomap executable was installed via pip in our virtual environment.
path_to_infomap = find_infomap_path("/home/mike/.virtualenvs/fmri/bin/infomap")
work_dir = Path("/mnt/cache/pfm_python/infomap_work")
work_dir.mkdir(exist_ok=True, parents=True)

for p in (
    save_to, ml_base_path, python_cifti_path, matlab_cifti_path,
    surface_files['lh'], surface_files['rh'],
    python_distance_matrix_path, matlab_distance_matrix_path,
    wb_command_path, path_to_infomap, work_dir,
):
    if not p.exists():
        print(f"File '{str(p)}' does not exist.")

#  5.5 to 5.7GB RAM

In [2]:
# Load input data

# For initial debugging, use the matlab data so any minor differences
# are due to this code, not prior variability
use_matlab = False

if use_matlab:
    smoothed_cifti_img = nib.Cifti2Image.from_filename(matlab_cifti_path)
    ml_distance_dict = h5py.File(matlab_distance_matrix_path, 'r')
    distance_matrix = np.array(ml_distance_dict.get('D'), dtype=np.uint8)
else:
    smoothed_cifti_img = nib.Cifti2Image.from_filename(python_cifti_path)
    distance_matrix = np.load(python_distance_matrix_path)

# 5.7GB to 14.2GB in 3:35, 7.2GB to 15.8GB in 2:56

In [3]:
distance_threshold = 10
print(f"The distance matrix is {distance_matrix.shape}-shaped.")
print(f"It has {np.sum(distance_matrix > 0.0):,} non-zero edges.")
print(f"It has {np.sum(distance_matrix == 0.0):,} zero edges.")
print(f"It has {np.sum(distance_matrix <= distance_threshold):,} local edges.")

# These numbers match matlab exactly, as they should.

The distance matrix is (85059, 85059)-shaped.
It has 7,234,948,400 non-zero edges.
It has 85,081 zero edges.
It has 17,574,597 local edges.


In [4]:
# Set up parameters for infomap
graph_densities = sorted(
    [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.005, 0.01, 0.02, 0.05, ],
    reverse=True,
)
num_reps = 50
bad_vertices = list()

verbose = True

# I doubt there's any overlap between the randomization between matlab
# and python/numpy/scipy, but Lynch set his seed at 44 and I'll do the
# same here.
np.random.seed(44)


In [5]:
# Ensure we have more than zero valid structures to work with
# I don't know why, but Lynch's code only includes the 10 regions
# with distinct LEFT or RIGHT, but this excludes BRAIN_STEM.
# I'll exclude it here, too, so code matches, but maybe we can
# revisit if we have brainstem-specific hypotheses.
# Lynch also put ACCUMBENS in his code twice, one of them
# probably replacing 'DIENCEPHALON_VENTRAL', so only
# 18 regions remain out of the original 21. For debugging, we'll try to
# match exactly, but in practice, using all regions seems better.
anat_axis = get_brain_model_axes(smoothed_cifti_img)
# The 'anat_axis' already has 85,059 loci,
# just like matlab's BrainStructure and BrainStructureLabel,
# so we don't need to do anything else with it here.
structures = np.unique([
    str(name) for name in anat_axis.name
    if ("BRAIN_STEM" not in name) & ('DIENCEPHALON' not in name)
])
# There are 20 structures in 'structures'

# We need to get indices into each region, so gather all potentials
# and cross reference with what we have in our data
potential_structures = np.unique([str(name) for name in anat_axis.name])
print(f"Found {len(potential_structures)} potential structures, "
      f"But we only want {len(structures)}.")
structure_mask = [s in structures for s in potential_structures]
structure_indices = np.where(structure_mask)[0]
good_structures = potential_structures[structure_indices]
print(f"Indexing leaves us with {len(structure_indices)} structures")

# Now go through each locus to determine if it's in our list of structures.
locus_mask = [str(s) in good_structures for s in anat_axis.name]
locus_indices = np.where(locus_mask)[0]
print(f"Filtering structures leaves {len(locus_indices)} loci")

# Further, remove any pre-designated bad vertices
locus_indices = [li for li in locus_indices if li not in bad_vertices]
print(f"Removing bad verts leaves {len(locus_indices)} loci")

if verbose:
    print(f"Using {len(good_structures)} structures: ")
    for i, s in enumerate(good_structures):
        print(f"{i + 1:>2}. {str(s)}")


Found 21 potential structures, But we only want 18.
Indexing leaves us with 18 structures
Filtering structures leaves 81407 loci
Removing bad verts leaves 81407 loci
Using 18 structures: 
 1. CIFTI_STRUCTURE_ACCUMBENS_LEFT
 2. CIFTI_STRUCTURE_ACCUMBENS_RIGHT
 3. CIFTI_STRUCTURE_AMYGDALA_LEFT
 4. CIFTI_STRUCTURE_AMYGDALA_RIGHT
 5. CIFTI_STRUCTURE_CAUDATE_LEFT
 6. CIFTI_STRUCTURE_CAUDATE_RIGHT
 7. CIFTI_STRUCTURE_CEREBELLUM_LEFT
 8. CIFTI_STRUCTURE_CEREBELLUM_RIGHT
 9. CIFTI_STRUCTURE_CORTEX_LEFT
10. CIFTI_STRUCTURE_CORTEX_RIGHT
11. CIFTI_STRUCTURE_HIPPOCAMPUS_LEFT
12. CIFTI_STRUCTURE_HIPPOCAMPUS_RIGHT
13. CIFTI_STRUCTURE_PALLIDUM_LEFT
14. CIFTI_STRUCTURE_PALLIDUM_RIGHT
15. CIFTI_STRUCTURE_PUTAMEN_LEFT
16. CIFTI_STRUCTURE_PUTAMEN_RIGHT
17. CIFTI_STRUCTURE_THALAMUS_LEFT
18. CIFTI_STRUCTURE_THALAMUS_RIGHT


In [10]:
from mfs_tools import red_on, green_on, color_off
excluded_regions = ["BRAIN_STEM", ]
all_structures, all_counts = np.unique(
    [str(name) for name in anat_axis.name], return_counts=True
)
excluded_structures = list()
structures = list()
for i, structure in enumerate(all_structures):
    locus_str = "vertices" if "CORTEX" in structure else "voxels"
    if verbose:
        print(f"  {structure: <42}  ", end="")
    for region in excluded_regions:
        if (    (region in structure) or
                (f"{region}_LEFT" in structure) or
                (f"{region}_RIGHT" in structure)
        ):
            excluded_structures.append(structure)
            if verbose:
                print(f"{red_on}-{color_off}  ({all_counts[i]:,} {locus_str})")
        else:
            structures.append(structure)
            if verbose:
                print(f"{green_on}+{color_off}  ({all_counts[i]:,} {locus_str})")
if verbose:
    print(f"  found {len(all_structures)} structures, "
          f"excluded {len(excluded_structures)} of them,"
          f"and kept {len(structures)} of them.")


  CIFTI_STRUCTURE_ACCUMBENS_LEFT              [1;32m+[0m  (47 voxels)
  CIFTI_STRUCTURE_ACCUMBENS_RIGHT             [1;32m+[0m  (43 voxels)
  CIFTI_STRUCTURE_AMYGDALA_LEFT               [1;32m+[0m  (183 voxels)
  CIFTI_STRUCTURE_AMYGDALA_RIGHT              [1;32m+[0m  (232 voxels)
  CIFTI_STRUCTURE_BRAIN_STEM                  [0;31m-[0m  (2,634 voxels)
  CIFTI_STRUCTURE_CAUDATE_LEFT                [1;32m+[0m  (428 voxels)
  CIFTI_STRUCTURE_CAUDATE_RIGHT               [1;32m+[0m  (426 voxels)
  CIFTI_STRUCTURE_CEREBELLUM_LEFT             [1;32m+[0m  (7,846 voxels)
  CIFTI_STRUCTURE_CEREBELLUM_RIGHT            [1;32m+[0m  (8,113 voxels)
  CIFTI_STRUCTURE_CORTEX_LEFT                 [1;32m+[0m  (29,696 vertices)
  CIFTI_STRUCTURE_CORTEX_RIGHT                [1;32m+[0m  (29,716 vertices)
  CIFTI_STRUCTURE_DIENCEPHALON_VENTRAL_LEFT   [1;32m+[0m  (489 voxels)
  CIFTI_STRUCTURE_DIENCEPHALON_VENTRAL_RIGHT  [1;32m+[0m  (529 voxels)
  CIFTI_STRUCTURE_HIPPOCAMPUS_LEFT   

In [6]:
# Trim the distance matrix to match our filtered vertices.
# While we're at it, set subcort-subcort distances to zero.
print(f"Original distance matrix is shaped {distance_matrix.shape}")
subcort_indices = get_subcortical_indices(smoothed_cifti_img)
distance_matrix[np.ix_(subcort_indices, subcort_indices)] = 0.0
print(f"D1 is shaped {distance_matrix.shape}")
print(f"D1 has {np.sum(distance_matrix > 0.0):,} non-zero edges.")
print(f"D1 has {np.sum(distance_matrix == 0.0):,} zero edges.")
print(f"D1 has {np.sum(distance_matrix <= distance_threshold):,} local edges.")

# These numbers, too, match matlab exactly (but only after using the np.ix_ function)

Original distance matrix is shaped (85059, 85059)
D1 is shaped (85059, 85059)
D1 has 6,577,205,438 non-zero edges.
D1 has 657,828,043 zero edges.
D1 has 665,815,249 local edges.


In [7]:
distance_matrix = distance_matrix[locus_indices, :][:, locus_indices]
print(f"D2 is shaped {distance_matrix.shape}")
print(f"D2 has {np.sum(distance_matrix > 0.0):,} non-zero edges.")
print(f"D2 has {np.sum(distance_matrix == 0.0):,} zero edges.")
print(f"D2 has {np.sum(distance_matrix <= distance_threshold):,} local edges.")

# 14.2GB to 13.7GB in 1:41
# These counts match matlab exactly

D2 is shaped (81407, 81407)
D2 has 6,143,260,190 non-zero edges.
D2 has 483,839,459 zero edges.
D2 has 491,759,083 local edges.


In [7]:
"""  I don't think we need this; from externally correlating connectivity

# This is a pain in the ass. We could theoretically just create a correlation
# matrix with numpy, but it runs python (not the VM) out of memory, even on a
# VM with 200GB RAM. So we need to be smarter with memory consumption. A quick
# workaround is to just let Workbench do the correlation for us.
# To do that, we need to prep and save our BOLD data to disk,
# then do the correlation, then read it back in.
final_bold_anat_axis = get_brain_model_axes(smoothed_cifti_img, verbose=True)[locus_indices]
final_bold_time_axis = get_series_axes(smoothed_cifti_img)
final_header = nib.cifti2.Cifti2Header.from_axes((
    final_bold_time_axis, final_bold_anat_axis
))
final_dtseries_img = nib.Cifti2Image(
    smoothed_cifti_img.get_fdata()[:, locus_indices],
    final_header
)
final_bold_file = (
    save_to /
    "sub-ME01_task-rest_concatenated_demeaned_regressed_smoothed-2.55_and_trimmed_32k_fsLR.dtseries.nii"
)
dconn_file = (
    save_to /
    "sub-ME01_task-rest_concatenated_demeaned_regressed_smoothed-2.55_and_trimmed_32k_fsLR.dconn.nii"
)
final_dtseries_img.to_filename(final_bold_file)
"""

  discovered axis 0: <class 'nibabel.cifti2.cifti2_axes.SeriesAxis'>
  discovered axis 1: <class 'nibabel.cifti2.cifti2_axes.BrainModelAxis'>


In [8]:
# Trim the BOLD data (loaded from matlab's directory earlier) to match
# locations in distance matrix.
bold_data = smoothed_cifti_img.get_fdata()[:, locus_indices]
if (save_to / "full_connectivity.npy").exists():
    print(f"Loading full connectivity from {save_to / 'full_connectivity.npy'}")
    full_connectivity = np.load(save_to / "full_connectivity.npy")
else:
    full_connectivity = correlate_bold(bold_data, strip_size=4096, verbose=True)
    np.save(save_to / "full_connectivity.npy", full_connectivity)

# built 13.7GB to 43.8GB in 4:17, loaded 15.2GB to 45.2GB in 1:17

Loading full connectivity from /mnt/cache/pfm_python/full_connectivity.npy


In [9]:
## On with the actual task at hand, prepping the connectivity for infomap
if (save_to / "final_connectivity.npy").exists():
    print(f"Loading final connectivity from {save_to / 'final_connectivity.npy'}")
    connectivity = np.load(save_to / "final_connectivity.npy")
    print(f"{np.sum(connectivity > 0.0):,} usable edges in the loaded final connectivity.")
else:
    connectivity = full_connectivity.copy()
    print(f"Starting with {np.sum(connectivity > 0.0):,} edges.")

    # Remove the diagonal (set to zero)
    connectivity[np.diag_indices_from(connectivity)] = 0.0
    print(f"{np.sum(connectivity > 0.0):,} edges remain after removing diagonal")

    # Remove local edges
    # (because we set all subcortical-subcortical edges to zero,
    #  this also removes them all.)
    connectivity[distance_matrix <= distance_threshold] = 0.0
    print(f"{np.sum(connectivity > 0.0):,} edges remain after removing local edges")

    # Remove any NaN values
    connectivity[np.isnan(connectivity)] = 0.0
    print(f"{np.sum(connectivity > 0.0):,} edges remain after removing NaNs")

    # Save connectivity for comparison with matlab's filtered connectivity.
    np.save(save_to / "final_connectivity.npy", connectivity)

# ? 43.8GB to 43.8GB in 2:22, loaded 45.3GB to 71.9GB in 1:16
# These counts are identical to those from matlab
# Expect 3,005,438,468 non-zero edges

3,005,438,468 usable edges in the loaded final connectivity.


In [10]:

# Load matlab's final connectivity
ml_conn_file = ml_base_path / "FinalConnectivity.mat"
ml_conn_dict = h5py.File(ml_conn_file, 'r')
ml_final_connectivity = np.array(ml_conn_dict.get('m'), dtype=np.float32)

# loaded 71.9GB to 99.4GB in 7:35


In [11]:
# Comparing the entire matrices could exhaust memory, so we'll just
# spot check a few areas.
# Some top rows
compare_mats(
    connectivity[:16384,:],
    ml_final_connectivity[:16384,:],
    a_name="python", b_name="matlab",
    verbose=True
)
# Some columns in the middle
compare_mats(
    connectivity[:, 16384:32768],
    ml_final_connectivity[:, 16384:32768],
    a_name="python", b_name="matlab",
    verbose=True
)
# A block with some subcortical regions
compare_mats(
    connectivity[49152:73728, 49152:73728],
    ml_final_connectivity[49152:73728, 49152:73728],
    a_name="python", b_name="matlab",
    verbose=True
)


[1;32m  The matrices 'python' and 'matlab' are equal, with tolerance of 1e-05.[0m
  Mem before 87,618.1MB; Mem after 87,618.6MB; delta 0.5
[1;32m  The matrices 'python' and 'matlab' are equal, with tolerance of 1e-05.[0m
  Mem before 87,618.6MB; Mem after 87,618.6MB; delta 0.0
[1;32m  The matrices 'python' and 'matlab' are equal, with tolerance of 1e-05.[0m
  Mem before 87,618.6MB; Mem after 87,618.6MB; delta 0.0


True

In [47]:
# Next, go through the connectivity matrix, with local edges, diagonals,
# and NaNs removed, and sort each column to find
# the highest connectivity edges for each locus.
# For each column, set the top edges (thresholded by graph density) to True,
# leaving everything else False.
# This ensures that even weakly connected loci maintain some
# connectivity to their hub-like partners.

total_edges_kept = 0  # not accurate, some overlap, but useful for comparisons

for i_d, d in enumerate(graph_densities):
    print(f"Starting density {d} at {datetime.now()}...")
    log_notes = list()

    # Create a mask with all False until we decide what to keep.
    hi_conn_mask = np.zeros(connectivity.shape, dtype=bool)
    # for each column in the connectivity matrix, find the highest
    # correlations and add those edges to the mask for that location's
    # column AND row.
    for i_n in range(connectivity.shape[1]):
        if np.any(connectivity[:, i_n]):
            ordered_connectivity = np.flip(np.argsort(connectivity[:, i_n]))
            num_to_keep = int(np.ceil(d * len(ordered_connectivity)))
            total_edges_kept += num_to_keep
            log_notes.append(
                f"Keeping {num_to_keep:,} edges for density {d}, col {i_n}"
            )
            # for v in ordered_connectivity[:num_to_keep]:
            #     print(f"Index {v} == {m[v, i_n]:0.3f}")
            hi_conn_mask[ordered_connectivity[:num_to_keep], i_n] = True
            hi_conn_mask[i_n, ordered_connectivity[:num_to_keep]] = True

    # We built the matrix up symmetrically, so now that it's complete,
    # we only need half of it. Delete the lower triangle, then
    # find the indices of the masked edges.
    hi_conn_mask[np.tril_indices_from(hi_conn_mask)] = False
    hi_conn_idx = np.argwhere(hi_conn_mask)
    write_pajek_file(
        hi_conn_idx, connectivity, save_to / f"hi_conn_d-{d:0.4f}.net",
        verbose=True
    )
    with open(save_to / f"hi_conn_d-{d:0.4f}.log", 'w') as f:
        f.write("\n".join(log_notes))

    print(f"Finished density {d} at {datetime.now()}...")

# 43.9GB
#   for d=0.05: slow to 50, fast to 110, drop to 104, drop to 54, drop to 48 in ~15 min
#   for d=0.02: slow to 50, fast to 113, drop to 108, drop fast to 55, drop to 52, drop to 46.5 in ~11 min
#   for d=0.01: to 104.8GB, drop fast to 52.6GB

Starting density 0.05 at 2025-01-16 19:29:46.035625...
Wrote 81407 vertices, and 253022347 edges to '/mnt/cache/pfm_python/hi_conn_d-0.0500.net'
Finished density 0.05 at 2025-01-16 19:46:03.743309...
Starting density 0.02 at 2025-01-16 19:46:03.743576...
Wrote 81407 vertices, and 106869594 edges to '/mnt/cache/pfm_python/hi_conn_d-0.0200.net'
Finished density 0.02 at 2025-01-16 19:57:39.861257...
Starting density 0.01 at 2025-01-16 19:57:39.861451...
Wrote 81407 vertices, and 55118427 edges to '/mnt/cache/pfm_python/hi_conn_d-0.0100.net'
Finished density 0.01 at 2025-01-16 20:07:38.294529...
Starting density 0.005 at 2025-01-16 20:07:38.294647...
Wrote 81407 vertices, and 28321685 edges to '/mnt/cache/pfm_python/hi_conn_d-0.0050.net'
Finished density 0.005 at 2025-01-16 20:16:15.772485...
Starting density 0.002 at 2025-01-16 20:16:15.772603...
Wrote 81407 vertices, and 11655995 edges to '/mnt/cache/pfm_python/hi_conn_d-0.0020.net'
Finished density 0.002 at 2025-01-16 20:24:21.153913...

In [12]:
# The above code block wrote Pajek network definition files for each threshold.
# Here, read them.

py_net2 = read_pajek_file(save_to / f"hi_conn_d-0.0002.net")
ml_net2 = read_pajek_file(
    ml_base_path / f"Bipartite_Density0.0002.net"
)
py_net1 = read_pajek_file(save_to / f"hi_conn_d-0.0001.net")
ml_net1 = read_pajek_file(
    ml_base_path / f"Bipartite_Density0.0001.net"
)


In [13]:
diff_dict_1 = compare_pajek_data(py_net1, ml_net1)
# Python and matlab agreed 100% with identical networks.

In [14]:
diff_dict_2 = compare_pajek_data(py_net2, ml_net2)
# Python and matlab disagreed on only 1 of 1,285,725 edges, and the
# mismatched edges had identical connectivity.

Edges in A but not B: 1
Edges in B but not A: 1


In [52]:
# Memory on henry running 6 infomaps climbed from 12GB after deleting m
# gradually higher over 10ish minutes. It eventually exhausted memory and rebooted.
# Be careful with memory consumption. I'm tracking it in several places
# with valgrind to spec VMs for running this.

# Running infomap creates a .clu and a .log file to accompany the .net Pajek file.


In [55]:
connectivity, full_connectivity = None, None
ordered_connectivity = None
distance_matrix, bold_data = None, None

In [56]:
hi_conn_idx, hi_conn_mask = None, None

In [57]:
smoothed_cifti_img = None


In [58]:
# Pre-allocate data for a new Cifti file of Infomap data
infomap_data = np.zeros(
    (len(anat_axis), len(graph_densities)),
    dtype=np.int8
)
# We need 81407 good locus indices ranging from 0 to 85058;
# They're already stored in locus_indices

# We previously ran infomap on the matlab data for fair comparisons across
# results, so load it from there rather than from our own python data.
ml_network_path = Path("/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/pfm_matlab")

# Extract community membership, matched by node_id, and
# save it into a single matrix, created just above.
for i, density in enumerate(graph_densities):
    # For a given density,
    # For python-local, clu_filename = f"hi_conn_d-{density:0.4f}.clu"
    clu_filename = f"Bipartite_Density{density}.clu"
    infomap_output = load_infomap_clu_file(
        ml_network_path / clu_filename, verbose=True
    )
    infomap_data[locus_indices, i] = infomap_output.sort_values(
        ['node_id', ]
    )['module'].values


Reading from '/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/pfm_matlab/Bipartite_Density0.05.clu'...
  found column names as ('node_id', 'module', 'flow')
  after 10 header lines, assuming the rest are data
  file contained 81,407 nodes.
Reading from '/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/pfm_matlab/Bipartite_Density0.02.clu'...
  found column names as ('node_id', 'module', 'flow')
  after 10 header lines, assuming the rest are data
  file contained 81,407 nodes.
Reading from '/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/pfm_matlab/Bipartite_Density0.01.clu'...
  found column names as ('node_id', 'module', 'flow')
  after 10 header lines, assuming the rest are data
  file contained 81,407 nodes.
Reading from '/mnt/brunodata/open_data/ds005118/derivatives/sub-ME01/pfm_matlab/Bipartite_Density0.005.clu'...
  found column names as ('node_id', 'module', 'flow')
  after 10 header lines, assuming the rest are data
  file contained 81,407 nodes.
Reading fro

In [59]:
print(infomap_data[:6, :])

[[ 4 10  1  1  2  2  7  1  3]
 [ 3  6  2  2  1  1  1  2  5]
 [ 3 13  2  2  1  1  1  3 10]
 [ 4  3  1  1  2  2  6  1  3]
 [ 5  2  2  3  4  4  4  4  1]
 [ 1  3  1  1  3  2  3  1  4]]


In [60]:
infomap_backup_data = infomap_data.copy()

In [64]:
total_edges_removed = 0
most_communities = 0
density_with_most_communities = 0.0
for dens_idx, density in enumerate(graph_densities):
    unique_communities = np.unique(infomap_data[:, dens_idx])
    print(density, unique_communities)
    if len(unique_communities) > most_communities:
        most_communities = len(unique_communities)
        density_with_most_communities = density
    for comm_idx, community_id in enumerate(unique_communities):
        if community_id != 0:
            community_idx = np.where(infomap_data[:, dens_idx]==community_id)[0]
            if len(community_idx) < 10:
                print(f"  Removing density {density}'s community {community_id} with only {len(community_idx)} members.")
                infomap_data[community_idx, dens_idx] = 0
                total_edges_removed += len(community_idx)

print(f"Removed {total_edges_removed:,} total edges due to small communities.")
print(f"The largest set of communities was {most_communities}, from d={density_with_most_communities:0.4f}.")


0.05 [0 1 2 3 4 5 6 7 8 9]
0.02 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
0.01 [0 1 2]
0.005 [0 1 2 3]
0.002 [0 1 2 3 4 5]
0.001 [0 1 2 3 4 5 6]
0.0005 [0 1 2 3 4 5 6 7 8]
0.0002 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13]
0.0001 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
Removed 0 total edges due to small communities.
The largest set of communities was 22, from d=0.0001.


In [62]:
compare_mats(infomap_data, infomap_backup_data, verbose=True)


  There are mismatches between 'a' (int8)  and 'b' (int8).
  Top left corners, for a small preview:
|    4.0000,  10.0000,   1.0000,   1.0000,   2.0000 |    |    4.0000,  10.0000,   1.0000,   1.0000,   2.0000 |
|    3.0000,   6.0000,   2.0000,   2.0000,   1.0000 |    |    3.0000,   6.0000,   2.0000,   2.0000,   1.0000 |
|    3.0000,  13.0000,   2.0000,   2.0000,   1.0000 | vs |    3.0000,  13.0000,   2.0000,   2.0000,   1.0000 |
|    4.0000,   3.0000,   1.0000,   1.0000,   2.0000 |    |    4.0000,   3.0000,   1.0000,   1.0000,   2.0000 |
|    5.0000,   2.0000,   2.0000,   3.0000,   4.0000 |    |    5.0000,   2.0000,   2.0000,   3.0000,   4.0000 |
[1;32m  Only 1 in 23921 values differ (32 of 765,495). [0m
[0;31m  The largest difference is 28.0 == 28.000000000 [0m
  Mem before 88,283.0MB; Mem after 88,283.0MB; delta 0.0


False

In [73]:
from mfs_tools.library.utility_stuff import generate_colormap

my_cm = generate_colormap(24)


In [105]:
first_label = nib.cifti2.Cifti2Label(0, "0", 0.0, 0.0, 0.0, 1.0)
rest_of_labels = [
    nib.cifti2.Cifti2Label(n + 1, f"{n + 1}", *my_cm.colors[n])
    for n in range(most_communities)
]
all_labels = [first_label, ] + rest_of_labels
packageable_labels = dict([
    (lbl.key, (lbl.label, (lbl.red, lbl.green, lbl.blue, lbl.alpha, )))
     for lbl in all_labels
])


In [106]:
a_label_axis = nib.cifti2.LabelAxis(
    [f"density {d:0.04f}" for d in graph_densities],
    packageable_labels,
)


In [109]:
# wb_view wants the brain_models along the columns, and the
# labels along the rows. So here we transpose our data and
# create axes to match.
network_label_img = nib.cifti2.Cifti2Image(
    infomap_data.T, (a_label_axis, anat_axis, )
)


In [110]:
# Save these labels as a Cifti file

network_label_img.update_headers()
network_label_img.to_filename(
    Path(f"/mnt/cache/pfm_python/") /
         f"infomap_calculated_network_atlases_over_9_densities.dlabel.nii"
)


In [113]:
_label_axis = get_label_axes(network_label_img)
_atlas_data = network_label_img.get_fdata()


In [115]:
_labels = _atlas_data[2, :]
np.unique(_labels, return_counts=True)

(array([0., 1., 2.]), array([ 3652, 42832, 38575]))

In [117]:
_mask = (_labels == 1).astype(np.uint8)
np.unique(_mask, return_counts=True)

(array([0, 1], dtype=uint8), array([42227, 42832]))

In [118]:
_label_axis.name[0]

np.str_('density 0.0500')

In [None]:
dict([
    (lbl.key, (lbl.label, (lbl.red, lbl.green, lbl.blue, lbl.alpha, )))
     for lbl in all_labels
])

In [119]:
_label_axis.label[2]

{0: ('0', (0.0, 0.0, 0.0, 1.0)),
 1: ('1', (0.2, 0.0, 0.0, 1.0)),
 2: ('2', (0.2571428571428572, 0.21441197911786147, 0.0, 1.0)),
 3: ('3', (0.09717383835030897, 0.31428571428571433, 0.0, 1.0)),
 4: ('4', (0.0, 0.3714285714285715, 0.19486330170979507, 1.0)),
 5: ('5', (0.0, 0.2650220903897375, 0.42857142857142866, 1.0)),
 6: ('6', (0.10464203589203577, 0.0, 0.4857142857142858, 1.0)),
 7: ('7', (0.5428571428571429, 0.0, 0.503540368613898, 1.0)),
 8: ('8', (0.6000000000000001, 0.12507365448541918, 0.0, 1.0)),
 9: ('9', (0.6293585587703238, 0.6571428571428573, 0.0, 1.0)),
 10: ('10', (0.07195242489360164, 0.7142857142857144, 0.0, 1.0)),
 11: ('11', (0.0, 0.7714285714285716, 0.5655240594136617, 1.0)),
 12: ('12', (0.0, 0.33965528046410415, 0.8285714285714287, 1.0)),
 13: ('13', (0.3959650941268589, 0.0, 0.8857142857142859, 1.0)),
 14: ('14', (0.9428571428571431, 0.0, 0.6780257997169767, 1.0)),
 15: ('15', (1.0, 0.4169121816180639, 0.0, 1.0)),
 16: ('16', (0.7492634551458082, 1.0, 0.0, 1.0)

In [120]:
one_comm_label_axis = nib.cifti2.LabelAxis(
    _label_axis.name[2:3],
    [_label_axis.label[2], ],
)

In [139]:
_mask.reshape(1, -1).shape
np.unique(_mask, return_counts=True)

(array([0, 1], dtype=uint8), array([42227, 42832]))

In [158]:
import subprocess

wb_command = find_wb_command_path()
d_i, d = 2, _label_axis.name[2]
c = 2
min_area = 50
filename = f"tmp_{d.replace(' ', '-')}_c-{c:02d}_mask.dscalar.nii"

_mask_data = (_labels == c).astype(np.uint8).reshape(1, -1)

community_scalar_axis = nib.cifti2.ScalarAxis(
    [f"community {c}", ]
)
# community_label_axis = nib.cifti2.LabelAxis(
#     [d, ], [_label_axis.label[d_i], ]
# )
community_img = nib.cifti2.Cifti2Image(
    _mask.reshape(1, -1), (community_scalar_axis, anat_axis)
)
community_img.to_filename(Path(work_dir) / filename)


In [159]:

proc = subprocess.run([
    wb_command,
    "-cifti-find-clusters",
    str(Path(work_dir) / filename),
    "0", str(min_area), "0", str(min_area), "COLUMN",
    str(Path(work_dir) / filename.replace("_mask", "_cmask")),
    "-left-surface", surface_files['lh'],
    "-right-surface", surface_files['rh'],
    "-merged-volume"
])


In [160]:
new_mask = nib.cifti2.Cifti2Image.from_filename(
    Path(work_dir) / filename.replace("_mask", "_cmask")
).get_fdata()
print(np.unique(_mask_data, return_counts=True))
print(np.unique(new_mask, return_counts=True))
# So there were 42,227 zeros in the binary mask, and now
# there are 42,342, indicating 115 voxels/vertices were in too-small islands.
# So let's investigate those.
_tiny_islets = ((new_mask == 0) & (_mask_data > 0)).astype(np.uint8)
islet_scalar_axis = nib.cifti2.ScalarAxis(
    [f"itty bitties", ]
)
islet_img = nib.cifti2.Cifti2Image(
    _tiny_islets, (islet_scalar_axis, anat_axis)
)
islet_img.to_filename(Path(work_dir) / "islets.2.dscalar.nii")


(array([0, 1], dtype=uint8), array([46484, 38575]))
(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21.]), array([42342,  8759,  7094,    20,    27,    37,    29,  7615,  6440,
          23,  9172,     9,   386,  2741,    32,    44,   128,    29,
           7,    57,    53,    15]))


In [155]:
filtered_atlas_data = np.zeros(_atlas_data.shape)
print(filtered_atlas_data.shape)


(9, 85059)


In [161]:
print(np.unique(_atlas_data[d_i, :], return_counts=True))
filtered_atlas_data[d_i, new_mask.astype(np.bool).ravel()] = c
# filtered_atlas_data = _atlas_data[d_i, :].reshape(1, -1) * new_mask.astype(np.bool).astype(np.uint8)
print(np.unique(filtered_atlas_data, return_counts=True))


(array([0., 1., 2.]), array([ 3652, 42832, 38575]))
(array([0., 2.]), array([722814,  42717]))


In [150]:
_atlas_data[d_i, :].shape

(85059,)

In [135]:
compare_mats(_mask.reshape(1, -1), new_mask, verbose=True)


  There are mismatches between 'a' (uint8)  and 'b' (float64).
  Top left corners, for a small preview:
|    1.0000,   0.0000,   0.0000,   1.0000,   0.0000 |    |    1.0000,   0.0000,   0.0000,   2.0000,   0.0000 |
[0;31m  0 of 1 values differ. The mean difference, where there are differences,  is nan.[0m
[1;32m  The largest difference is nan == nan [0m
  Mem before 88,314.3MB; Mem after 88,314.3MB; delta 0.0


False

In [131]:
np.sum(np.isnan(_mask.reshape(1, -1)))

np.int64(0)

In [132]:
np.sum(np.isnan(new_mask))

np.int64(0)

In [133]:
_mask.reshape(1, -1).shape, new_mask.shape

((1, 85059), (1, 85059))

In [136]:
np.allclose(_mask.reshape(1, -1), new_mask)

False

In [137]:
np.unique(_mask.reshape(1, -1), return_counts=True)


(array([0, 1], dtype=uint8), array([42227, 42832]))

In [138]:
np.unique(new_mask, return_counts=True)


(array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19., 20., 21.]),
 array([42342,  8759,  7094,    20,    27,    37,    29,  7615,  6440,
           23,  9172,     9,   386,  2741,    32,    44,   128,    29,
            7,    57,    53,    15]))

In [167]:
from mfs_tools.library.clustering_stuff import spatial_filter

new_img = spatial_filter(
    network_label_img, surface_files['lh'], surface_files['rh'],
    work_path=work_dir, verbose=True
)


Removing small islets from 'density 0.0500' communities.
    saving 19,306 of 19,544 members of '1'
    saving 16,646 of 16,885 members of '2'
    saving 12,701 of 12,819 members of '3'
    saving 14,932 of 15,116 members of '4'
    saving 10,261 of 10,371 members of '5'
    saving 3,764 of 4,091 members of '6'
    saving 2,361 of 2,425 members of '7'
    saving 0 of 130 members of '8'
    saving 0 of 12 members of '9'
Before:
(array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]), array([ 3666, 19544, 16885, 12819, 15116, 10371,  4091,  2425,   130,
          12]))
After:
(array([0., 1., 2., 3., 4., 5., 6., 7.]), array([ 5088, 19306, 16646, 12701, 14932, 10261,  3764,  2361]))
Removing small islets from 'density 0.0200' communities.
    saving 11,943 of 12,157 members of '1'
    saving 10,115 of 10,219 members of '2'
    saving 9,244 of 9,425 members of '3'
    saving 8,244 of 8,393 members of '4'
    saving 7,151 of 7,321 members of '5'
    saving 6,551 of 6,850 members of '6'
    saving 5