In [16]:
import pandas as pd
import numpy as np
import umap
from scipy.sparse import coo_matrix
import pickle
from scipy.stats import zscore
# import scipy.sparse
# import igraph as ig

In [17]:
def normalizer(array):
    """
    Min-max normalizes the values of an array to range from zero to one.

    Parameters
    ----------
    array : array-like
        Values to rescale. The minimum and maximum are taken with
        ``np.min`` / ``np.max`` over the input.

    Returns
    -------
    The rescaled values, ``(array - min) / (max - min)``. When every value
    is identical (``max == min``) the result is all zeros instead of the
    NaNs a 0/0 division would produce.
    """
    lowest = np.min(array)
    value_range = np.max(array) - lowest

    # Shift the input itself (not a converted copy) so the result keeps
    # whatever container type arithmetic on ``array`` yields.
    shifted = array - lowest

    # Constant input has no spread to rescale; dividing by the zero range
    # would fill the output with NaN. The ndim check keeps the guard from
    # firing on non-scalar ranges.
    if np.ndim(value_range) == 0 and value_range == 0:
        return shifted * 0.0

    return shifted / value_range

def zscore_normalizer(array):
    """
    Z-score normalizes the values of an array (mean = 0; stdev = 1).

    Thin wrapper around ``scipy.stats.zscore`` called with its default
    arguments.

    Parameters
    ----------
    array : array-like
        Values to standardize.

    Returns
    -------
    Whatever ``scipy.stats.zscore`` returns for ``array``.
    """
    # The input is passed to scipy as-is; no intermediate copy is needed.
    return zscore(array)

def normalize_expression_per_gene(expression_df):
    """
    Normalizes the expression values of every row (gene) across its columns.

    Two input layouts are handled:

    * With a 'TTHERM_ID' column: every other column is treated as
      expression data, each row is z-score normalized with
      ``zscore_normalizer``, and the result is returned with 'TTHERM_ID'
      as the first column.
    * Without a 'TTHERM_ID' column: every column is treated as expression
      data and each row is min-max normalized with ``normalizer``.

    FIXME: the two layouts use different normalizations (z-score vs.
    min-max); confirm which one is intended and make the branches agree.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        Expression table, one row per gene. Not modified.

    Returns
    -------
    pandas.DataFrame
        The row-wise normalized table.
    """
    if 'TTHERM_ID' in expression_df.columns:
        ttids = expression_df['TTHERM_ID'].values

        # Pick the expression columns by name rather than by position, so
        # the ID column is excluded wherever it sits in the frame.
        data = expression_df.drop(columns='TTHERM_ID')

        norm_expression_df = data.apply(zscore_normalizer, axis=1)

        # Put the gene IDs back as the leading column.
        norm_expression_df.insert(0, 'TTHERM_ID', ttids)

    else:
        norm_expression_df = expression_df.apply(normalizer, axis=1)

    return norm_expression_df

In [18]:
# Load the filtered expression table and preview its first rows.
expression_csv_path = '../../active_files/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv'
full_filtered_df = pd.read_csv(expression_csv_path)

full_filtered_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000042,6.928782,7.264201,6.934214,6.732989,6.970612,7.150978,6.126826,6.868968,6.641119,...,6.450318,8.04975,7.788162,7.052154,6.517742,6.918501,6.048861,7.041619,6.757932,5.817246
1,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021
2,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
3,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
4,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334


In [19]:
# Normalize each row of the expression table. The frame has a TTHERM_ID
# column, so normalize_expression_per_gene takes its z-score branch.
full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)
full_filtered_norm_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000042,0.225122,0.742666,0.233503,-0.076984,0.289665,0.567965,-1.012281,0.13283,-0.218737,...,-0.513139,1.954753,1.551127,0.415481,-0.409104,0.209258,-1.132578,0.399226,-0.038497,-1.489956
1,TTHERM_000000045,-2.448319,-1.697435,-1.587294,-2.257829,-2.508907,-1.154612,-1.036472,-0.642368,-0.132424,...,0.822757,0.991606,0.557714,-0.302687,0.622758,0.557825,-1.174702,0.754409,0.914116,-0.42155
2,TTHERM_00000010,-1.282668,-1.471803,-1.317679,-0.604303,-1.565334,-1.481606,-0.806982,-1.108012,-1.018705,...,-0.493388,0.208015,0.261182,0.203914,0.033621,0.413136,-0.00063,0.365551,0.263487,-0.129336
3,TTHERM_00000020,-0.959173,-1.003962,-0.760488,0.760758,-0.649471,-0.933573,1.921178,-1.089734,-0.79725,...,-0.535369,-0.746171,-0.15452,-0.533887,-0.565776,0.868466,-0.533845,-0.698211,-0.34769,-0.716306
4,TTHERM_00000030,-0.664691,-0.749924,-0.465044,-0.36666,-0.71569,-0.477307,0.267152,-0.762752,-0.589343,...,-0.666575,-0.470984,-0.677744,-0.497797,-0.573679,-0.599182,-0.767357,-0.530738,-0.441231,-0.665373


In [22]:
# Sanity check on one gene: print the extremes, mean and standard deviation
# of its normalized row.
check_row = full_filtered_norm_df.loc[full_filtered_norm_df['TTHERM_ID'] == 'TTHERM_00000010']
# First matching row, with the leading TTHERM_ID entry dropped.
check_values = check_row.values[0][1:]

print('MAX:', max(check_values), 'MIN:', min(check_values))
print('MEAN:', np.mean(check_values), 'STD:', np.std(check_values))


MAX: 2.362368225948976 MIN: -1.565334413250945
MEAN: 1.8661195520295185e-16 STD: 1.0


In [13]:
# Every column after the first one is kept; pull those values out as a
# plain array for the graph construction.
expression_columns = full_filtered_norm_df.columns.tolist()[1:]
full_filtered_norm_df_target_data = full_filtered_norm_df[expression_columns].values

In [14]:
# Run UMAP's fuzzy_simplicial_set on the normalized expression matrix.
# The call is unpacked into four values; `result` is the one the later
# cells convert to a sparse COO matrix and export.
result, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(
    full_filtered_norm_df_target_data,
    n_neighbors=3,  # NOTE(review): hard-coded neighbourhood size — confirm this is the intended value
    random_state=42,  # fixed seed
    metric='manhattan',
    return_dists=True  # ask for the fourth return value (`dists`) unpacked above
)

In [15]:
# Convert to COO so the (row, col, data) triplets can be walked directly.
result = coo_matrix(result)

# NOTE(review): toarray() densifies the full matrix before pickling, which
# may be very large for many genes — confirm the readers of this file need
# a dense array.
with open('gene_network.pkl', 'wb') as f:
    pickle.dump(result.toarray(), f)

# Print the first stored entry whose weight is neither 0 nor 1, then stop.
for row_index, col_index, element_value in zip(result.row, result.col, result.data):
    if not (element_value == 0 or element_value == 1):
        print(f"Element at ({row_index}, {col_index}): {element_value}")
        break


Element at (0, 2813): 0.5849709510803223


In [16]:
# Gene IDs in row order: position i is the label of graph node i.
labels = list(full_filtered_norm_df["TTHERM_ID"])
index_to_label = dict(enumerate(labels))

with open('gene_labels.pkl', 'wb') as f:
    pickle.dump(labels, f)

output_file = "./mcl_rcl/abc_format_graph.txt"

# Take rows, columns and weights from the SAME COO triplets and filter all
# three with one mask. Mixing result.nonzero() (which drops stored zeros)
# with the unfiltered result.data would shift every weight after an
# explicit zero onto the wrong edge.
graph_coo = result.tocoo()
stored_nonzero = graph_coo.data != 0
rows = graph_coo.row[stored_nonzero]
cols = graph_coo.col[stored_nonzero]
values = graph_coo.data[stored_nonzero]

# One "source target weight" line per edge; an index with no label falls
# back to the index itself.
with open(output_file, "w") as f:
    for source, target, weight in zip(rows, cols, values):
        source_label = index_to_label.get(source, str(source))
        target_label = index_to_label.get(target, str(target))

        f.write(f"{source_label} {target_label} {weight}\n")

print(f"Graph data saved to {output_file}")

Graph data saved to ./mcl_rcl/abc_format_graph.txt
