In [63]:
#Import packages
import pandas as pd
import numpy as np
import altair as alt
from pyhpo import Ontology
import itertools
Ontology()

import fastobo 
import matplotlib.pyplot as plt
import networkx as nx

from stellargraph import StellarGraph

from concurrent.futures import ProcessPoolExecutor #For multi threading
from gensim.models import Word2Vec #For Node2Vec
from sklearn.metrics.pairwise import cosine_distances #measure cosine distance
import patient_matching #to get patient matching functions

from stellargraph.data import BiasedRandomWalk

from sklearn.manifold import TSNE


#use the block below to install new packages

# Import Files and Diagnosis File Processing

Import the files needed to add the STRING DB associated genes to the patient diagnosis data.

In [64]:
#Load the three input tables used for gene matching
#STRING protein names and IDs
gene_data = pd.read_table('./data_files/9606.protein.info.v12.0.txt')
#STRING gene associations (space-delimited file)
gene_associations = pd.read_table(
    './data_files/9606.protein.links.detailed.v12.0.txt',
    delimiter=' ',
)
#Patient data
patient_data = pd.read_csv('./data_files/753_UDNDx_Patients_8.26.24.csv')

In [65]:
#This block builds per-gene association lists from String db and attaches them to the single-gene patients
#Isolate protein id and preferred names
select_data = gene_data[['#string_protein_id','preferred_name']]
#Lookup from String protein id to preferred gene name, used to translate the association table
gene_dict = dict(zip(gene_data['#string_protein_id'], gene_data['preferred_name'])) # gene_dict is the gene dictionary of interest

#Keep only high confidence associations (combined_score >= 700) and the three columns we need
#.copy() makes the name mapping below write to an independent frame instead of a slice of gene_associations
high_con_associations = gene_associations.loc[
    gene_associations['combined_score'] >= 700,
    ['protein1','protein2','combined_score'],
].copy()
high_con_associations['protein1'] = high_con_associations['protein1'].map(gene_dict)
high_con_associations['protein2'] = high_con_associations['protein2'].map(gene_dict)
#Ids that are missing from gene_dict map to NaN; drop those rows so no NaN gene reaches the grouping
#(the previous run-based while loop never advanced on a NaN name because NaN != NaN)
high_con_associations = high_con_associations.dropna(subset=['protein1','protein2'])

#Later cells expect high_con_associations to be a list of (protein 1, protein 2, score) tuples
high_con_associations = list(high_con_associations.itertuples(index=False, name=None))

#Collect every (associated gene, score) pair under its gene
#A dict keeps first-appearance order and merges a gene's rows even if they are not consecutive in the file
partners_by_gene = {}
for protein, associated_gene, score in high_con_associations:
    partners_by_gene.setdefault(protein, []).append((associated_gene, score))

association_data = pd.DataFrame(
    [(gene, len(partners), partners) for gene, partners in partners_by_gene.items()],
    columns=['Gene', '# Of Associated Genes', 'Associated Genes'],
)

#For now we will stick to only one gene diagnosis: drop patients whose 'Genes' entry lists several genes
#astype(str) keeps a missing gene from raising; such patients are removed by the inner merge below
has_multiple_genes = patient_data['Genes'].astype(str).str.contains('[,;]', regex=True)
patient_data = patient_data.loc[~has_multiple_genes]

patient_data = patient_data.rename(columns={'Genes' : "Gene"})
#Inner merge: only patients whose gene has at least one high confidence association are kept
filtered_patient_data = pd.merge(patient_data, association_data, on='Gene')


In [66]:
#The rest of the script uses diagnosis file
#Work on a copy so filtered_patient_data keeps the raw 'Terms' strings and this cell can be re-run
#(without the copy a second run calls .split on the already-split lists and fails)
diagnosis_file = filtered_patient_data.copy()
#HPO terms are saved as a ';'-separated string, convert to a list
diagnosis_file['Terms'] = diagnosis_file['Terms'].apply(lambda x: x.split(';'))

In [67]:
#Import Similarity table and index it by the patient ID column (read in by pandas as 'Unnamed: 0')
similarity_table = (
    pd.read_csv('./data_files/753_Dx_Individuals_jaccardIC_omim_5.3.24_similarity_table.csv')
    .rename(columns={'Unnamed: 0' : 'Patient ID/Patient ID'})
    .set_index('Patient ID/Patient ID')
)

In [68]:
#Define a list of patients to loop through
patient_list = diagnosis_file['ID'].tolist()

#Keep only the similarity table rows and columns of patients that are in the diagnosis file
#One label selection replaces dropping one row and one column per excluded patient
kept_ids = similarity_table.index[similarity_table.index.isin(patient_list)]
similarity_table = similarity_table.loc[kept_ids, kept_ids]

#phenotypic_similarity is the same object as similarity_table, so the index of both is named 'ID'
phenotypic_similarity = similarity_table
phenotypic_similarity.index.name = 'ID'

#We need a matrix wherein we have patient ID in one column and the genes that are associated with the patient
#Remove confidence scores in associated genes column
#The isinstance guard keeps a re-run from reducing already-stripped gene names to their first character
diagnosis_file['Associated Genes'] = diagnosis_file['Associated Genes'].apply(
    lambda lst: [tup[0] if isinstance(tup, tuple) else tup for tup in lst]
)
genetic_data = diagnosis_file[['ID','Associated Genes']]

In [None]:
#View final diagnosis file ('Terms' and 'Associated Genes' now hold lists)
diagnosis_file

In [70]:
#Import HPO-Gene associations (later cells read its 'gene_symbol' and 'hpo_id' columns)
HPO_gene_data = pd.read_table('./data_files/genes_to_phenotype.txt')

# Build Network X Graph
We will build a NetworkX graph first, add all nodes and edges to it, and then convert it to a StellarGraph.
StellarGraph does not let us add nodes and edges one by one, and adding them one by one in NetworkX is simpler.

In [None]:
#We will set up a new graph and seed it with the HPO term hierarchy

#The hierarchy is read from the HPO .obo file (parsing code from Isabelle)
pato = fastobo.load('./data_files/hp.obo')

#MultiGraph is undirected and allows several edges between the same pair of nodes
#NOTE(review): parallel edges make a neighbour more likely to be picked in the later random walks — confirm this is intended
knowledge_graph = nx.MultiGraph()

terms = []
term_frames = (frame for frame in pato if isinstance(frame, fastobo.term.TermFrame))
for frame in term_frames:
    term_id = str(frame.id)
    knowledge_graph.add_node(term_id, node_type='HPO Term')
    terms.append(term_id)
    #Every is_a clause links the term to one of its parents
    parent_ids = [str(clause.term) for clause in frame if isinstance(clause, fastobo.term.IsAClause)]
    for parent_id in parent_ids:
        knowledge_graph.add_edge(term_id, parent_id)

#Get info
print(nx.info(knowledge_graph))



In [72]:
#Convert high con associations (a list of (protein 1, protein 2, score) tuples) into a dataframe
high_con_associations = pd.DataFrame(high_con_associations, columns=['Protein 1','Protein 2', 'Score'])

In [None]:
#Add all String db genes and their high confidence interactions to the graph

#Every preferred gene name becomes a 'Genes' node
#Names are cast to str and stripped so that they are all correctly recognized as nodes
genes = gene_data['preferred_name'].astype(str).str.strip()
knowledge_graph.add_nodes_from(genes, node_type = 'Genes')

#One edge per high confidence (Protein 1, Protein 2) pair
#NOTE(review): these names are not stripped like the node names above — confirm they always match
interactions_list = list(zip(high_con_associations['Protein 1'], high_con_associations['Protein 2']))
knowledge_graph.add_edges_from(interactions_list)

#Get info
print(nx.info(knowledge_graph))


In [None]:
#Genes that HPO terms are connected to but that are not present in String db still need a node
hpo_gene_symbols = set(HPO_gene_data['gene_symbol'])
string_gene_names = set(genes)
additional_genes = list(hpo_gene_symbols - string_gene_names)
print(len(additional_genes), 'genes not in string db')

knowledge_graph.add_nodes_from(additional_genes, node_type = 'Genes' )

#Get info
print(nx.info(knowledge_graph))

In [None]:
#Link genes to HPO terms: one edge per (gene_symbol, hpo_id) row of the HPO-Gene table
HPO_interactions = list(zip(HPO_gene_data['gene_symbol'], HPO_gene_data['hpo_id']))
knowledge_graph.add_edges_from(HPO_interactions)

#Get info
print(nx.info(knowledge_graph))

In [None]:
#Add one 'Patient' node per patient, then connect each patient to its HPO terms
patient_ids = diagnosis_file['ID'].to_list()
knowledge_graph.add_nodes_from(patient_ids, node_type='Patient')

for patient in patient_ids:
    #'Terms' holds a list per row; gather the lists of every row that carries this patient ID
    term_lists = diagnosis_file.loc[diagnosis_file['ID'] == patient, 'Terms']
    #Terms are stripped so that they match the HPO ids used as node names
    knowledge_graph.add_edges_from(
        (patient, term.strip()) for terms in term_lists for term in terms
    )

#Get info
print(nx.info(knowledge_graph))



# Convert networkX graph to a stellar graph and perform random walks

In [None]:
#Convert the NetworkX graph into a StellarGraph, taking each node's type from its 'node_type' attribute
#NOTE(review): nodes created only as edge endpoints (never passed to add_node/add_nodes_from) have no 'node_type' — confirm how they get typed

stellar_knowledge_graph = StellarGraph.from_networkx(knowledge_graph, node_type_attr='node_type')

print(stellar_knowledge_graph.info())



In [78]:
#We will run the random walks in parallel worker processes, they take about 90 minutes in a single process

#Base seed for the walks so that they can be reproduced; each chunk gets its own seed derived from it
#NOTE(review): full reproducibility also needs a stable node order in the graph — confirm
WALK_SEED = 42

rw = BiasedRandomWalk(stellar_knowledge_graph)

# Define a function that will run walks on a chunk of nodes
def run_walks_on_chunk(nodes_chunk, seed=None):
    """Run the random walks that start from the nodes in nodes_chunk.

    nodes_chunk: list of node ids used as walk roots.
    seed: optional seed handed to the walker; None leaves the walks unseeded.

    Returns whatever rw.run returns for n=10, length=5, p=1, q=1
    (iterated below as a collection of walks, each a sequence of node ids).
    """
    return rw.run(nodes=nodes_chunk, n=10, length=5, p=1, q=1, seed=seed)

# Get the list of all nodes in the graph
all_nodes = list(stellar_knowledge_graph.nodes())

# Set the number of worker processes you want
num_workers = 10

# Split the list of nodes into interleaved chunks, one for each worker
node_chunks = [all_nodes[i::num_workers] for i in range(num_workers)]
# A distinct seed per chunk, so chunks do not share one random sequence
chunk_seeds = [WALK_SEED + worker for worker in range(num_workers)]

# Run the walks in parallel using ProcessPoolExecutor
# NOTE(review): the workers use the module-level rw; confirm this works with the platform's process start method
with ProcessPoolExecutor(max_workers=num_workers) as executor:
    # Map the node chunks (and their seeds) to the walk function
    walks_results = list(executor.map(run_walks_on_chunk, node_chunks, chunk_seeds))

# Combine the results from all workers
walks = [walk for result in walks_results for walk in result]


In [81]:

#Node ids are cast to str so that every walk is a list of string tokens for Word2Vec
str_walks = [[str(n) for n in walk] for walk in walks]
#NOTE(review): 'size' and 'iter' look like the older gensim keyword names — confirm against the installed gensim version
model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=10, iter=1) #sg=0 might be more appropriate

In [82]:
# Retrieve node embeddings and the node IDs they belong to
node_ids = model.wv.index2word  # list of node IDs
node_embeddings = model.wv.vectors

# Apply t-SNE transformation on node embeddings
# random_state is fixed so that the 2D layout is the same on every run
tsne = TSNE(n_components=2, random_state=42)
node_embeddings_2d = tsne.fit_transform(node_embeddings)


In [83]:
#Look up the graph node type of every embedded node
node_types = list(map(stellar_knowledge_graph.node_type, node_ids))

#One row per node: its type and its two t-SNE coordinates, indexed by node ID
nodes_data = pd.DataFrame(
    {
        'Node': node_ids,
        'node_type': node_types,
        'Position 1': node_embeddings_2d[:, 0],
        'Position 2': node_embeddings_2d[:, 1],
    }
).set_index('Node')

In [None]:
#View the t-SNE rows of the gene nodes
nodes_data[nodes_data['node_type']== 'Genes']

In [85]:
#Save embeddings
#np.save('node_embeddings.npy', node_embeddings)
#Save node data TSNE points as a dataframe
#nodes_data.to_csv('nodes_data_tsne.csv')

# Import embeddings and TSNE points


In [86]:
#node_embeddings = np.load('node_embeddings.npy')
#nodes_data = pd.read_csv('nodes_data_tsne.csv')

#node_embeddings_2d = nodes_data[['Position 1', 'Position 2']].to_numpy()
#node_ids = nodes_data['Node']
#node_types = [stellar_knowledge_graph.node_type(node) for node in node_ids]

In [None]:
#View the embedding vectors (one row per node, taken from the Word2Vec model)
node_embeddings

# Make TSNE Graphs

In [None]:
# Draw a TSNE Graph
alpha = 0.3 #Determines transparency of points

plt.figure(figsize=(10, 8))
#cmap is not passed: without a 'c' argument matplotlib ignores it and emits a warning
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)
plt.show()


In [None]:
# Plot RFC1 Patient HPO Terms
alpha = 0.3

plt.figure(figsize=(10, 8))
#All nodes as background (cmap is not passed: without 'c' matplotlib ignores it and warns)
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)

#Grab each RFC1 patient's HPO terms and visualize them on the scatter plot
RFC1_patients = diagnosis_file[diagnosis_file['Gene']=='RFC1']
RFC1_patients = RFC1_patients['Terms']
RFC1_patients = RFC1_patients.apply(lambda terms: [term.strip() for term in terms])

#Iterate over the patients themselves instead of looking up the row labels 1, 2 and 3,
#which only exist if the RFC1 patients happen to sit in those rows of diagnosis_file
patient_colors = itertools.cycle(['red', 'green', 'purple'])
for patient_terms, patient_color in zip(RFC1_patients, patient_colors):
    rows_to_plot = nodes_data.loc[patient_terms]
    plt.scatter(rows_to_plot['Position 1'], rows_to_plot['Position 2'], color=patient_color)

plt.show()

In [None]:
# PLOT RFC1 Patients
alpha = 0.3

plt.figure(figsize=(10, 8))
#All nodes as background (cmap is not passed: without 'c' matplotlib ignores it and warns)
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)

RFC1_patients = diagnosis_file[diagnosis_file['Gene']=='RFC1']
RFC1_patients = RFC1_patients['ID']

#Iterate over the patient IDs themselves instead of looking up the row labels 1, 2 and 3,
#which only exist if the RFC1 patients happen to sit in those rows of diagnosis_file
patient_colors = itertools.cycle(['red', 'orange', 'purple'])
for patient_id, patient_color in zip(RFC1_patients, patient_colors):
    patient_node = nodes_data.loc[patient_id]
    plt.scatter(patient_node['Position 1'], patient_node['Position 2'], color=patient_color)

plt.show()

In [None]:
#Plot All Patient Nodes
alpha = 0.3

plt.figure(figsize=(10, 8))
#All nodes as background (cmap is not passed: without 'c' matplotlib ignores it and warns)
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)

#Overlay only the patient nodes in red
patient_nodes = nodes_data[nodes_data['node_type']=='Patient']
plt.scatter(patient_nodes['Position 1'], patient_nodes['Position 2'], color='red', alpha = 0.3)
plt.show()



In [None]:
#Plot patient nodes for all genes that have more than 3 patients with the same gene diagnosis

gene_diag = diagnosis_file[['ID','Gene']]
gene_diag = gene_diag.set_index('ID')
#Start from nodes_data instead of an existing patient_nodes so that re-running this cell
#does not merge the 'Gene' column in a second time (which would rename it and break the groupby)
patient_nodes = pd.merge(
    nodes_data[nodes_data['node_type']=='Patient'],
    gene_diag,
    left_index=True,
    right_index=True,
    how='inner',
)

patient_nodes = patient_nodes.groupby('Gene').filter(lambda x: len(x) > 3)

# Get unique diagnoses
unique_diagnoses = patient_nodes['Gene'].unique()

# Create a color map with a number of colors equal to the number of unique diagnoses
colors = plt.cm.inferno_r(np.linspace(0, 1, len(unique_diagnoses)))

# Create a mapping from diagnosis to color
diagnosis_color_map = dict(zip(unique_diagnoses, colors))

# draw the points
alpha = 0.3

plt.figure(figsize=(10, 8))
#All nodes as background (cmap is not passed: without 'c' matplotlib ignores it and warns)
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)

# Plot each diagnosis with its corresponding color
for diagnosis in unique_diagnoses:
    subset = patient_nodes[patient_nodes['Gene'] == diagnosis]
    plt.scatter(
        subset['Position 1'],
        subset['Position 2'],
        label=diagnosis,
        alpha=1,
        #the mapping was built above but never applied; a one-element list gives every point this colour
        color=[diagnosis_color_map[diagnosis]],
    )
plt.legend()
plt.show()

In [None]:
# Plot Patient Nodes in black and the HPO terms for each patient in different colors for a given gene diagnosis
alpha = 0.1
gene_of_interest = 'CDKL5' #This line sets the Gene name

plt.figure(figsize=(10, 8))
#All nodes as background (cmap is not passed: without 'c' matplotlib ignores it and warns)
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    alpha=alpha,
)

#Grab the patients' HPO terms and visualize them on the scatter plot
gene_patients = diagnosis_file[diagnosis_file['Gene']==gene_of_interest]
patients_terms = gene_patients['Terms']
patients_nodes = gene_patients['ID']
patients_terms = patients_terms.apply(lambda terms: [term.strip() for term in terms])

#patients_terms and patients_nodes share the row labels of gene_patients, so i addresses the same patient in both
for i in patients_terms.index:
    rows_to_plot = nodes_data.loc[patients_terms[i]]
    nodes_to_plot = nodes_data.loc[patients_nodes[i]]
    plt.scatter(rows_to_plot['Position 1'], rows_to_plot['Position 2'], alpha = 0.7)
    plt.scatter(nodes_to_plot['Position 1'], nodes_to_plot['Position 2'], color='black')

plt.show()

# Measure Cosine distance between patients and make a matrix between each pair of patients


In [None]:
#Prepare data for calculating Cosine distances: one row per node with its full embedding vector as a list
embeddings_data = pd.DataFrame(
    {
        'Node': node_ids,
        'node_type': node_types,
        'Position': [list(vector) for vector in node_embeddings],
    }
)
#Only patient nodes are compared with each other
is_patient = embeddings_data['node_type'] == 'Patient'
embeddings_data = embeddings_data[is_patient]
embeddings_data

In [None]:

#Pairwise cosine distances between all patient embeddings, labelled by patient node on both axes
patient_vectors = embeddings_data['Position'].tolist()
embeddings_array = np.array(patient_vectors)

distance_matrix = cosine_distances(embeddings_array)

patient_labels = embeddings_data['Node']
embeddings_matrix = pd.DataFrame(
    distance_matrix,
    index=patient_labels,
    columns=patient_labels,
)

embeddings_matrix

In [96]:
#The embeddings matrix has the same layout as the similarity matrix so we can build the same powercurves from it
#NOTE(review): the existing matching function is not reused because of its sort order — confirm in patient_matching
#Here lower numbers (cosine distances) represent greater similarity, so candidates are sorted ascending

#Function for primary level matching only
def patient_matching_embed (diagnosis_file:pd.DataFrame, similarity_table:pd.DataFrame) -> tuple:
    """Rank every other patient for each patient by distance and classify each pair by gene relationship.

    Parameters
    ----------
    diagnosis_file : pd.DataFrame
        Needs the columns 'ID', 'Gene' and 'Associated Proteins' (the genes associated with the patient's gene).
    similarity_table : pd.DataFrame
        Patient-by-patient table of distances, labelled by patient ID on rows and columns.
        Lower values rank first. Rows of patients that are not in diagnosis_file are ignored.
        The caller's table is not modified.

    Returns
    -------
    tuple
        (matching_data, sorted_match_data).
        matching_data has one row per ordered patient pair with the columns 'Patient 1', 'Patient 2', 'Score',
        'Match Class' ('same_gene', 'interacting_gene' or 'no_match'), 'Rank' (1 = closest) and the helper
        column 'sorted' holding the pair as a sorted tuple.
        sorted_match_data keeps the first row of every unordered pair and has no 'sorted' column.

    Raises
    ------
    TypeError
        If similarity_table is not a pandas DataFrame.
    """
    if not isinstance (similarity_table, pd.DataFrame) :
        raise TypeError("Expected a pandas DataFrame")

    #Define a list of patients to loop through
    patient_list = diagnosis_file['ID'].tolist()

    #Since some patients have the same diagnosis we look everything up by patient ID:
    #one dictionary with the gene diagnosis and one with the associated genes
    diagnosis_dict = dict(zip(patient_list, diagnosis_file['Gene']))
    #Lists are turned into frozensets for fast membership tests; any other value is kept as it is
    associated_genes_dict = {
        patient_id: frozenset(genes) if isinstance(genes, (list, tuple, set)) else genes
        for patient_id, genes in zip(patient_list, diagnosis_file['Associated Proteins'])
    }

    #Keep only the rows of patients that are in the patient list (one selection instead of one drop per patient)
    similarity_table = similarity_table.loc[similarity_table.index.isin(patient_list)]

    #Collect the rows as a list of tuples, populating a dataframe cell by cell would take ages
    matching_data = []
    for patient_id in patient_list:
        patient_diagnosis = diagnosis_dict[patient_id] #get gene diagnosis
        patient_interacting_genes = associated_genes_dict[patient_id] #get associated genes
        #Distances from this patient to all others, without the patient themselves, closest first
        candidates = similarity_table[patient_id].drop([patient_id]).sort_values(ascending=True)

        for rank, (other_id, score) in enumerate(candidates.items(), start=1):
            other_diagnosis = diagnosis_dict[other_id]
            #We will only consider same gene and primary interactor matches for this work
            if other_diagnosis == patient_diagnosis:
                gene_class = 'same_gene'
            elif other_diagnosis in patient_interacting_genes:
                gene_class = 'interacting_gene'
            else:
                gene_class = 'no_match'
            matching_data.append((patient_id, other_id, score, gene_class, rank))

    #Convert matching data to a dataframe
    matching_data = pd.DataFrame(matching_data, columns=['Patient 1', 'Patient 2', 'Score', 'Match Class', 'Rank'])

    #Drop the permutations of patient matches. For example UDN234 - UDN344 is the same as UDN344 - UDN234
    matching_data['sorted'] = [
        tuple(sorted(pair)) for pair in zip(matching_data['Patient 1'], matching_data['Patient 2'])
    ]
    # Drop duplicates based on the sorted column, then drop the temporary 'sorted' column
    sorted_match_data = matching_data.drop_duplicates(subset='sorted').drop(columns='sorted')
    return matching_data, sorted_match_data

#patient_matching_embed reads the column 'Associated Proteins', so rename 'Associated Genes' first
diagnosis_file = diagnosis_file.rename(columns={'Associated Genes':'Associated Proteins'})
#Rank and classify all patient pairs by cosine distance in the embedding space
matching_data, sorted_match_data = patient_matching_embed(diagnosis_file, embeddings_matrix)

In [None]:
#Plot Cosine distance relative to CDF
def pheno_score_powercurve(matching_data):
    """Build an Altair CDF chart of the best-match cosine distance per patient and match class.

    matching_data: dataframe with the columns 'Patient 1', 'Match Class', 'Rank' and 'Score'.

    For every patient and match class the row with the lowest 'Rank' is kept. The 'same_gene' and
    'interacting_gene' rows are then plotted as one line each, with 'Score' on the x axis and the
    within-class percentile rank of 'Score' on the y axis. A class without rows simply draws no line.

    Returns the Altair chart.
    """
    # Group by patient and match class, get the row of the best (lowest) rank using idxmin
    idx = matching_data.groupby(['Patient 1', 'Match Class'])['Rank'].idxmin()

    # Filter the DataFrame to get only the best rank for each patient and match class
    highest_rank_df = matching_data.loc[idx]

    # Boolean selection gives an empty frame instead of a KeyError when a class has no rows
    same_gene_ranks = highest_rank_df[highest_rank_df['Match Class'] == 'same_gene']
    interacting_gene_ranks = highest_rank_df[highest_rank_df['Match Class'] == 'interacting_gene']

    # NOTE(review): an earlier version also derived the patients that only have an interacting gene match,
    # labelled them 'interating_gene_only' and never used the result — confirm whether that line should be plotted

    # Concatenate both classes and order them by class and score
    combined_ranks = pd.concat([same_gene_ranks, interacting_gene_ranks])
    combined_ranks = combined_ranks.sort_values(['Match Class', 'Score'],ascending=False)

    # Compute the cumulative proportion (CDF) for each class
    combined_ranks['CDF'] =combined_ranks.groupby('Match Class')['Score'].rank(method='max', pct=True)

    # Create the CDF plot using Altair
    cdf_plot = alt.Chart(combined_ranks).mark_line().encode(
        x=alt.X('Score:Q', scale=alt.Scale(reverse=False),axis=alt.Axis(title='Cosine Distance')),
        y=alt.Y('CDF:Q'),
        color='Match Class:N'  # One line per match class
    ).properties(
        title='Cumulative Distribution Function (CDF) by Group',
        width=500,
        height=300
    )
    return cdf_plot

#CDF of the best-match cosine distances from the embedding based matching
cdf_plot = pheno_score_powercurve(matching_data)
cdf_plot

In [None]:
def rank_powercurve (matching_data, graph_color):
    """Build an Altair CDF chart of the best same-gene match rank per patient.

    matching_data: dataframe with the columns 'Patient 1', 'Match Class', 'Rank' and 'Score'.
    graph_color: colour of the line, passed to alt.ColorValue.

    For every patient the 'same_gene' row with the lowest 'Rank' is kept and plotted with 'Rank' on the
    x axis and the percentile rank of 'Rank' on the y axis. Other match classes are not plotted and do
    not have to be present in matching_data.

    Returns the Altair chart.
    """
    # Group by patient and match class, get the row of the best (lowest) rank using idxmin
    idx = matching_data.groupby(['Patient 1', 'Match Class'])['Rank'].idxmin()

    # Filter the DataFrame to get only the best rank for each patient and match class
    highest_rank_df = matching_data.loc[idx]

    # Only same gene matches are plotted
    # Boolean selection gives an empty frame instead of a KeyError when there are none
    same_gene_ranks = highest_rank_df[highest_rank_df['Match Class'] == 'same_gene'].copy()
    combined_ranks = same_gene_ranks.sort_values(['Match Class', 'Score'],ascending=False)

    # Compute the cumulative proportion (CDF) of the ranks
    combined_ranks['CDF'] =combined_ranks.groupby('Match Class')['Rank'].rank(method='max', pct=True)

    # Create the CDF plot using Altair
    cdf_plot = alt.Chart(combined_ranks).mark_line().encode(
        x=alt.X('Rank:Q', scale=alt.Scale(reverse=False),axis=alt.Axis(title='Rank')),
        y=alt.Y('CDF:Q'),
        color = alt.ColorValue(graph_color),
    ).properties(
        title='Cumulative Distribution Function (CDF) by Group',
        width=500,
        height=300
    )
    return cdf_plot

#Rank based CDF of the embedding (cosine distance) matching, drawn in purple
cdf_plot = rank_powercurve(matching_data, 'purple')
cdf_plot

In [99]:
#Finally, we need to compare cosine matching to phenotype based matching

#Perform patient matching on the phenotype similarity table
#Only the first of the three returned values is used here
matching_data_by_phenotype, _, _ = patient_matching.patient_matching(diagnosis_file, similarity_table)

In [None]:
#Rank based CDF of the phenotype based matching, drawn in blue
cdf_plot_2 = rank_powercurve(matching_data_by_phenotype, 'blue')
cdf_plot_2

In [None]:
#Overlay both rank curves in one chart
#The charts are passed to alt.layer separately; 'cdf_plot + cdf_plot_2' is already a layered chart
cdf_plot_3 = alt.layer(cdf_plot, cdf_plot_2)
cdf_plot_3

# Identify Weird cluster

In [None]:
#View the node table (node type and t-SNE position per node)
nodes_data

# Try matching with the full dataset

In [None]:
#Rebuild the matching inputs for the full dataset (patients with several diagnosed genes are kept this time)
#This block used to bug out for two reasons:
# - the fillna result was assigned to the whole frame, which replaced it with the single 'Associated Genes' column
# - the similarity table had already been trimmed and was filtered again with the old patient_list
patient_data = pd.read_csv('./data_files/753_UDNDx_Patients_8.26.24.csv') #reimport patient data

patient_data = patient_data.rename(columns={'Genes' : "Gene"})
#Left merge keeps the patients whose gene has no entry in association_data
filtered_patient_data = pd.merge(patient_data, association_data, on='Gene', how='left')

diagnosis_file = filtered_patient_data.copy() #The rest of the script uses diagnosis file
#HPO terms are saved as a string, convert to a list (values that are not strings are left untouched)
diagnosis_file['Terms'] = diagnosis_file['Terms'].apply(lambda x: x.split(';') if isinstance(x, str) else x)

#We need a matrix wherein we have patient ID in one column and the genes that are associated with the patient
#Remove confidence scores in associated genes column; patients without associations get an empty list
diagnosis_file['Associated Genes'] = diagnosis_file['Associated Genes'].apply(
    lambda lst: [tup[0] if isinstance(tup, tuple) else tup for tup in lst] if isinstance(lst, list) else []
)

#Reload the untrimmed similarity table and keep the patients of the new diagnosis file
patient_list = diagnosis_file['ID'].tolist()
similarity_table = pd.read_csv('./data_files/753_Dx_Individuals_jaccardIC_omim_5.3.24_similarity_table.csv')
similarity_table = similarity_table.rename(columns={'Unnamed: 0' : 'Patient ID/Patient ID'})
similarity_table = similarity_table.set_index('Patient ID/Patient ID')
kept_ids = similarity_table.index[similarity_table.index.isin(patient_list)]
similarity_table = similarity_table.loc[kept_ids, kept_ids]

#phenotypic_similarity is the same object as similarity_table, so the index of both is named 'ID'
phenotypic_similarity = similarity_table
phenotypic_similarity.index.name = 'ID'

diagnosis_file = diagnosis_file.rename(columns={'Associated Genes':'Associated Proteins'})
#NOTE(review): embeddings_matrix only holds patients that were nodes of the graph — confirm it covers this diagnosis_file before matching on it


In [None]:
#View the reimported patient data
patient_data

In [49]:
#Perform phenotype based matching; only the first of the three returned values is used
matching_data_by_phenotype, _, _ = patient_matching.patient_matching(diagnosis_file, similarity_table)

In [51]:
#Perform embedding (cosine distance) based matching
#NOTE(review): patient_matching_embed looks up every ID of diagnosis_file as a column of embeddings_matrix — confirm all of them are present
matching_data, sorted_match_data = patient_matching_embed(diagnosis_file, embeddings_matrix)

In [None]:
#Rank based CDFs of the embedding (purple) and phenotype (blue) matching, overlaid in one chart
cdf_plot = rank_powercurve(matching_data, 'purple')
cdf_plot_2 = rank_powercurve(matching_data_by_phenotype, 'blue')
#The charts are passed to alt.layer separately; 'cdf_plot + cdf_plot_2' is already a layered chart
cdf_plot_3 = alt.layer(cdf_plot, cdf_plot_2)
cdf_plot_3

In [66]:
#Reimport patient data, read from ./data_files/ like every other input of this notebook
patient_data = pd.read_csv('./data_files/753_UDNDx_Patients_8.26.24.csv') #reimport patient data

patient_data = patient_data.rename(columns={'Genes' : "Gene"})
#Left merge keeps the patients whose gene has no entry in association_data
filtered_patient_data = pd.merge(patient_data, association_data, on='Gene', how='left')


In [None]:
#View the patient data merged with the String db associations
filtered_patient_data