In [1]:
import os

import pandas as pd
import numpy as np
import umap
from scipy.sparse import coo_matrix
# import scipy.sparse
# import igraph as ig

In [2]:
def normalizer(array):
    """
    Normalizes the values of an array to range from zero to one

    Parameters
    ----------
    array : array-like
        Numeric values. numpy arrays and pandas objects are used as they are
        (a pandas Series keeps its index); any other sequence, such as a list
        or tuple, is converted to a numpy array first.

    Returns
    -------
    The min-max scaled values, in the same container type as the (possibly
    converted) input. If every value is identical the result is all zeros
    instead of the NaN that a 0/0 division would produce.
    """
    # Plain Python sequences do not support `sequence - scalar`, so convert
    # them. pandas/numpy inputs are left untouched so that callers relying on
    # the returned type (e.g. DataFrame.apply expanding a Series) still work.
    if not isinstance(array, (np.ndarray, pd.Series, pd.DataFrame)):
        array = np.asarray(array)

    lowest = np.min(array)
    value_range = np.max(array) - lowest

    # Zero range means a constant input: dividing by one leaves the all-zero
    # numerator as the result. The ndim guard skips this for inputs where the
    # range is not a scalar, for which `== 0` has no single truth value.
    if np.ndim(value_range) == 0 and value_range == 0:
        value_range = 1

    normalized = (array - lowest) / value_range

    return normalized

def normalize_expression_per_gene(expression_df):
    """
    Function to normalize all gene expression to range from zero to one.

    Each row is rescaled independently by `normalizer`.

    Parameters
    ----------
    expression_df : pandas.DataFrame
        One row per gene. May contain a 'TTHERM_ID' column holding the gene
        identifiers; every other column must be numeric.

    Returns
    -------
    pandas.DataFrame
        The row-wise normalized values. When the input has a 'TTHERM_ID'
        column it is carried over unchanged and placed first, followed by the
        remaining columns in their original order.
    """
    id_column = 'TTHERM_ID'

    if id_column not in expression_df.columns:
        return expression_df.apply(normalizer, axis=1)

    # Select the identifier column by name rather than assuming it is the
    # first column, so the numeric block is correct wherever the IDs sit.
    data = expression_df.drop(columns=id_column)

    norm_expression_df = data.apply(normalizer, axis=1)

    # Re-attach the identifiers positionally (.values ignores the index) as
    # the leading column.
    norm_expression_df.insert(0, id_column, expression_df[id_column].values)

    return norm_expression_df

In [3]:
# Read the expression table and give the column that was loaded as
# 'Unnamed: 0' its proper name, then preview the first rows.
full_filtered_df = (
    pd.read_csv('../microarray_probe_alignment_and_filtering/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv')
    .rename(columns={'Unnamed: 0': 'TTHERM_ID'})
)
full_filtered_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021
1,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
2,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
3,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334
4,TTHERM_00000070,4.758227,6.032492,6.866545,5.606154,6.099037,6.082681,7.238753,7.706041,7.578804,...,6.604102,5.286292,5.868084,5.358154,5.671038,6.591834,6.684315,7.569773,7.417419,6.488644


In [4]:
# Rescale each row of the loaded table with normalize_expression_per_gene;
# the TTHERM_ID column is carried through as the first column.
full_filtered_norm_df = normalize_expression_per_gene(full_filtered_df)

In [5]:
# Plain numeric matrix for the graph construction: everything except the
# leading identifier column.
full_filtered_norm_df_target_data = full_filtered_norm_df.iloc[:, 1:].to_numpy()

In [6]:
# Call UMAP's fuzzy_simplicial_set directly on the normalized expression
# matrix. `result` is used below as a sparse weighted graph over the rows.
# Settings: 3 neighbours per row, Manhattan distance, seed 42.
# return_dists=True is why a fourth value (`dists`) is unpacked here.
# NOTE(review): the umap docs describe random_state as a numpy RandomState;
# an int is passed here -- confirm the installed version accepts it.
result, sigmas, rhos, dists = umap.umap_.fuzzy_simplicial_set(
    full_filtered_norm_df_target_data,
    n_neighbors=3,
    random_state=42,
    metric='manhattan',
    return_dists=True 
)

In [7]:
# Work in COO form so that row, column and value of each stored entry are
# available as three parallel arrays.
result = coo_matrix(result)

# Report the first stored entry whose weight is neither 0 nor 1, then stop.
for row_index, col_index, element_value in zip(result.row, result.col, result.data):
    if element_value == 0 or element_value == 1:
        continue
    print(f"Element at ({row_index}, {col_index}): {element_value}")
    break


Element at (0, 4373): 0.5849625468254089


In [8]:
# Map matrix positions back to gene identifiers (row i of the matrix is row i
# of the normalized table).
labels = list(full_filtered_norm_df["TTHERM_ID"])
index_to_label = dict(enumerate(labels))

output_file = "./rcl_mcl/abc_format_graph.txt"

# Create the destination folder if needed so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Take endpoints and weights from the same COO arrays and filter them with a
# single mask. nonzero() drops explicitly stored zeros while .data keeps them,
# so pairing nonzero() indices with the raw .data array can shift every weight
# after the first stored zero onto the wrong edge.
graph = result.tocoo()
nonzero_mask = graph.data != 0
rows = graph.row[nonzero_mask]
cols = graph.col[nonzero_mask]
values = graph.data[nonzero_mask]

# One edge per line: "<source label> <target label> <weight>".
with open(output_file, "w") as f:
    for source, target, weight in zip(rows, cols, values):
        # Fall back to the numeric index if a position has no label.
        source_label = index_to_label.get(source, str(source))
        target_label = index_to_label.get(target, str(target))

        f.write(f"{source_label} {target_label} {weight}\n")

print(f"Graph data saved to {output_file}")

Graph data saved to ./rcl_mcl/abc_format_graph.txt
