## Create a network of the latent factor analysis

In [None]:
# shell escape: print the current date and time
!date

#### import libraries

In [None]:
from pandas import read_csv
from pickle import load as pkl_load
from igraph import Graph, Plot
import matplotlib.pyplot as plt
from IPython.display import Image
from re import match
import cairocffi
import leidenalg
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from json import dump as json_dump

#### set notebook variables

In [None]:
# parameters
# project prefix used in every input and output file name
project = 'aging_phase2'
# only age associations whose model_type column equals this value are kept
model_type = 'nmf'

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
# age association results per latent factor (read as a CSV table)
assoc_file = f'{results_dir}/{project}.latent.age_glm.csv'
# NOTE(review): loadings_file and metrics_file are defined here but not read
# by any cell visible in this notebook
loadings_file = f'{results_dir}/{project}.latent.loadings.csv'
# pickled mapping from factor key to that factor's feature loadings
loadings_pickle = f'{results_dir}/{project}.latent.loadings.pkl'
metrics_file = f'{results_dir}/{project}.latent.metrics.csv'

# out files
# the graph is written in both GraphML and GML formats
graphml_file = f'{figures_dir}/{project}.latents.{model_type}.graphml'
gml_file = f'{figures_dir}/{project}.latents.{model_type}.gml'
# rendered network figure
image_file = f'{figures_dir}/{project}.latents.{model_type}.png'
# JSON mapping of community id to the latent factors assigned to it
communities_file = f'{figures_dir}/{project}.latents.{model_type}.partitioned_factors.json'

# constants and variables
# when True, cells display sample rows and extra counts
DEBUG = True
# upper bound on the fdr_bh column for an association to be kept
ALPHA = 0.05

### load input data

#### load the latent factor age associations

In [None]:
# read the age association table; the first CSV column becomes the index
age_glm_df = read_csv(assoc_file, index_col=0)
print(f'shape of age_glm_df is {age_glm_df.shape}')
# composite '<cell_type>:<feature>' key; later cells use it to look up each
# factor's loadings and to name the factor nodes of the graph
age_glm_df['key_name'] = age_glm_df.cell_type + ':' + age_glm_df.feature
if DEBUG:
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the preview at the number of rows available
    display(age_glm_df.sample(min(4, len(age_glm_df))))
    print(f'age_glm_df has {age_glm_df.key_name.nunique()} keys')

#### load the latent factors' feature loadings

In [None]:
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files produced by this project's own pipeline
with open(loadings_pickle, mode='rb') as loadings_handle:
    # later cells call .get(<factor key>) on this object to fetch a factor's loadings
    feature_loadings = pkl_load(loadings_handle)

print(f'loadings_pickle has {len(feature_loadings)} entries')

### subset the latent factors to only those with a statistically significant age association

In [None]:
# keep only associations that pass the FDR threshold for the selected model type
age_glm_df = age_glm_df.loc[(age_glm_df.fdr_bh <= ALPHA) & (age_glm_df.model_type == model_type)]
print(f'shape of age_glm_df is {age_glm_df.shape}')
if DEBUG:
    # the filter can leave fewer than 4 rows (or none); DataFrame.sample raises
    # ValueError when asked for more rows than exist, so cap the preview size
    display(age_glm_df.sample(min(4, len(age_glm_df))))

### convert the latent factor age associations into a weighted graph

In [None]:
age_graph = Graph()

# add the nodes: one 'factor' vertex per retained association row, with the
# absolute z value stored on the vertex as its 'effect'
# (an earlier variant used a separate 'Age' root vertex joined to every factor
# by an edge weighted with abs(z); the effect now lives on the factor vertex)
for factor_key, z_value in zip(age_glm_df.key_name, age_glm_df.z):
    age_graph.add_vertex(name=factor_key, type='factor', effect=abs(z_value))

# report vertex and edge counts
print(age_graph.vcount())
print(age_graph.ecount())

### add the latent factor feature loadings as weighted edges of the graph

In [None]:
# regex pattern matching the ATAC peak naming format
pattern = r'^chr.*:.*-.*$'

# Vertices and edges are collected first and added in bulk afterwards: adding
# them one call at a time is the slow path in igraph.
# Names already in the graph are tracked so that a feature shared by several
# factors becomes a single vertex instead of one duplicate-named vertex per factor.
if 'name' in age_graph.vs.attributes():
    known_names = set(age_graph.vs['name'])
else:
    known_names = set()
new_features = []
edge_pairs = []
edge_weights = []
for factor in age_glm_df.key_name.unique():
    loading = feature_loadings.get(factor)
    if loading is None:
        # no loadings stored under this key; the factor keeps no edges
        print(f'no loadings found for {factor}, skipping')
        continue
    for feature, weight in loading.items():
        # only add genes not ATAC peaks, very large and slow otherwise
        if match(pattern, feature):
            continue
        if feature not in known_names:
            known_names.add(feature)
            new_features.append(feature)
        edge_pairs.append((factor, feature))
        # absolute loading, so every edge weight is non-negative
        edge_weights.append(abs(weight))

# bulk-add the feature vertices, then set their attributes on the new slice
if new_features:
    first_new_vertex = age_graph.vcount()
    age_graph.add_vertices(new_features)
    age_graph.vs[first_new_vertex:]['type'] = ['feature'] * len(new_features)
    age_graph.vs[first_new_vertex:]['effect'] = [1] * len(new_features)

# bulk-add the factor-feature edges (endpoints given by vertex name)
if edge_pairs:
    first_new_edge = age_graph.ecount()
    age_graph.add_edges(edge_pairs)
    age_graph.es[first_new_edge:]['weight'] = edge_weights

print(age_graph.vcount())
print(age_graph.ecount())

#### drop nodes that don't have any edges
If ATAC peak features were excluded, some of the latent factors will have no remaining features and therefore no edges.

In [None]:
# collect the indices of vertices that have no edges at all
isolated_vertices = [
    vertex_id
    for vertex_id, vertex_degree in enumerate(age_graph.degree())
    if vertex_degree == 0
]
# remove them from the graph in a single call
age_graph.delete_vertices(isolated_vertices)
# report the remaining vertex and edge counts
print(age_graph.vcount())
print(age_graph.ecount())

### partition the graph

In [None]:
%%time
graph_cluster = leidenalg.find_partition(age_graph, leidenalg.ModularityVertexPartition)

In [None]:
# size of the partition returned by find_partition
print(len(graph_cluster))

In [None]:
# length of the membership list (it is assigned to the graph's vertices when drawing)
len(graph_cluster.membership)

### draw the graph

In [None]:
import igraph as ig

# canvas geometry, shared by the plot surface and the graph drawn onto it
bbox = (1600, 1200)
visual_style = {}
visual_style['bbox'] = bbox
visual_style['margin'] = 50

p = Plot(image_file, bbox=bbox, background='white')
# layout = age_graph.layout_mds()
layout = age_graph.layout_kamada_kawai()
# one palette colour per community; store membership and colour on the vertices
pal = ig.drawing.colors.ClusterColoringPalette(len(graph_cluster))
age_graph.vs['membership'] = graph_cluster.membership
age_graph.vs['color'] = pal.get_many(graph_cluster.membership)
age_graph.es['color'] = 'rgba(192, 192, 192, 0.3)'
# scale effect sizes to use as node size
# MinMaxScaler needs a 2-D column as input and returns an (n, 1) array; flatten
# it to a plain list so every vertex receives a single scalar size
effect_column = np.array(age_graph.vs['effect']).reshape(-1, 1)
scaled_values = MinMaxScaler(feature_range=(6, 18)).fit_transform(effect_column).ravel().tolist()
p.add(age_graph, layout=layout, vertex_size=scaled_values, vertex_label_size=10, **visual_style)
p.redraw()

# p.show()
# write the figure to image_file and show the saved image inline
p.save()
display(Image(image_file))

### save the graph

In [None]:
# write the graph to disk in both formats, GML first and then GraphML
graph_writers = {
    gml_file: age_graph.write_gml,
    graphml_file: age_graph.write_graphml,
}
for out_path, write_graph in graph_writers.items():
    write_graph(out_path)

### inspect the partitioned latent factors

In [None]:
# keep only the vertices whose 'type' attribute equals 'factor'
factor_nodes = age_graph.vs.select(type_eq='factor')
# number of latent factor vertices still in the graph
print(len(factor_nodes))

In [None]:
# group the factor names by the community each factor vertex belongs to
community_factors = {}
for vertex in factor_nodes:
    # read the attribute dictionary once per vertex
    node_attrs = vertex.attributes()
    community_id = node_attrs.get('membership')
    # start a new list the first time a community is seen, then append
    community_factors.setdefault(community_id, []).append(node_attrs.get('name'))

In [None]:
# show the community -> factor names mapping built in the previous cell
display(community_factors)

### save the partitioned latent factor communities

In [None]:
# write the community -> factor names mapping as indented JSON
# NOTE: JSON object keys are always strings, so non-string keys of
# community_factors (the community ids) are written out as strings
with open(communities_file, 'w') as o_file:
    json_dump(community_factors, o_file, indent=4)

In [None]:
# shell escape: print the current date and time
!date