In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the tab-separated annotation table into a DataFrame.
df_automaxo_annots = pd.read_csv("automaxo_annots.tsv", sep="\t")

In [3]:
# Preview the first five annotation rows.
df_automaxo_annots.head(n=5)

Unnamed: 0,disease_id,disease_name,source_id,maxo_id,maxo_name,hpo_id,relation,evidence,extension_id,extension_name,comment,other,author,last_updated,created
0,MONDO:0010200,Wilson disease,PMID:10378366,MAXO:0001175,liver transplantation,HP:0004448,treats,PCS,,,,,0000-0002-0736-9199,,2024-07-22
1,MONDO:0010200,Wilson disease,PMID:10604583,MAXO:0000602,hemodialysis,HP:0032254,treats,PCS,,,,,0000-0002-0736-9199,,2024-07-22
2,MONDO:0010200,Wilson disease,PMID:12760731,MAXO:0001224,copper chelator agent therapy,HP:0006554,treats,PCS,,,,,0000-0002-0736-9199,,2024-07-22
3,MONDO:0010200,Wilson disease,PMID:15701295,MAXO:0009095,zinc therapy,HP:0000118,treats,PCS,,,,,0000-0002-0736-9199,,2024-07-22
4,MONDO:0010200,Wilson disease,PMID:15915361,MAXO:0001175,liver transplantation,HP:0001399,treats,PCS,,,,,0000-0002-0736-9199,,2024-07-22


In [4]:
# List the column labels of the annotation table.
df_automaxo_annots.columns

Index(['disease_id', 'disease_name', 'source_id', 'maxo_id', 'maxo_name',
       'hpo_id', 'relation', 'evidence', 'extension_id', 'extension_name',
       'comment', 'other', 'author', 'last_updated', 'created'],
      dtype='object')

In [5]:
# Count how many annotation rows each disease name has.
df_automaxo_annots["disease_name"].value_counts()

disease_name
sickle cell anemia                     117
celiac disease                          67
Wilson disease                          48
Huntington disease                      46
Marfan syndrome                         44
Noonan syndrome                         31
Apert syndrome                          24
propionic acidemia                      22
Lesch-Nyhan syndrome                    21
Loeys-Dietz syndrome                    20
Chediak-Higashi syndrome                17
citrullinemia, type II, adult-onset     14
alkaptonuria                            13
Stickler syndrome                       11
Camurati-Engelmann disease              10
Brugada syndrome                        10
Canavan disease                          8
Donnai-Barrow syndrome                   8
hypochondroplasia                        4
Achondroplasia                           2
lymphatic malformation 1                 1
Name: count, dtype: int64

In [6]:
# Number of distinct disease identifiers (a missing value, if present, counts as one).
df_automaxo_annots["disease_id"].nunique(dropna=False)

21

In [7]:
# Number of distinct source identifiers (a missing value, if present, counts as one).
df_automaxo_annots["source_id"].nunique(dropna=False)

492

In [8]:
# Show the source_id column; each value carries a prefix such as "PMID:".
df_automaxo_annots["source_id"]

0      PMID:10378366
1      PMID:10604583
2      PMID:12760731
3      PMID:15701295
4      PMID:15915361
           ...      
533    PMID:29889773
534    PMID:30513314
535    PMID:31842932
536    PMID:32164578
537    PMID:35095068
Name: source_id, Length: 538, dtype: object

In [9]:
## Step 1: Create a dictionary whose keys are the selected PMIDs

# Unique source identifiers (e.g. "PMID:10378366").
# Missing values are dropped first: a NaN is a float and has no .split(),
# so a single empty source_id cell would otherwise abort the whole cell.
unique_source_ids = df_automaxo_annots["source_id"].dropna().unique()

# Key each entry by the part after the prefix; the values are None placeholders.
source_id_dict = dict.fromkeys(
    source_id.split(":")[1] for source_id in unique_source_ids
)



In [10]:
# Number of distinct PMID keys collected in Step 1.
len(source_id_dict)

492

In [11]:
# Display the dictionary: keys are the PMID number strings, values are None placeholders.
source_id_dict

{'10378366': None,
 '10604583': None,
 '12760731': None,
 '15701295': None,
 '15915361': None,
 '16996405': None,
 '17285615': None,
 '17382611': None,
 '17461475': None,
 '19066958': None,
 '19595231': None,
 '20946468': None,
 '20955957': None,
 '20958917': None,
 '22261259': None,
 '22720274': None,
 '23109455': None,
 '23289267': None,
 '24118554': None,
 '24166573': None,
 '26720766': None,
 '27350316': None,
 '27915967': None,
 '28260463': None,
 '28433103': None,
 '28433104': None,
 '28433106': None,
 '28987261': None,
 '29449431': None,
 '30357869': None,
 '30359967': None,
 '30384011': None,
 '31113589': None,
 '31878905': None,
 '32008504': None,
 '32291276': None,
 '32398357': None,
 '32520831': None,
 '32745371': None,
 '33369596': None,
 '33541020': None,
 '33541028': None,
 '35042319': None,
 '36343759': None,
 '38695602': None,
 '7878329': None,
 '8723326': None,
 '9544415': None,
 '21756462': None,
 '32657950': None,
 '10685567': None,
 '11452173': None,
 '15660289': No

In [12]:
## Step 2: Walk every directory that holds extracted articles and keep the articles listed in the dictionary

# Path to the main data directory
data_directory = "../data"

# PMID strings as a set for fast membership tests
source_ids_set = set(source_id_dict)

# One filtered frame per matching file. They are concatenated once after the
# loop: calling pd.concat inside the loop re-copies every previously collected
# row on each iteration.
filtered_frames = []

for disease_folder in os.listdir(data_directory):
    disease_folder_path = os.path.join(data_directory, disease_folder)

    # Ignore plain files that sit next to the disease folders
    if not os.path.isdir(disease_folder_path):
        continue

    for file_name in os.listdir(disease_folder_path):
        # Only the files ending in '_no_replaced.tsv' are read
        if not file_name.endswith('_no_replaced.tsv'):
            continue

        file_path = os.path.join(disease_folder_path, file_name)
        df_disease_non_replaced = pd.read_csv(file_path, delimiter='\t')

        # PMIDs are compared as strings because the dictionary keys are strings
        pmid_is_selected = df_disease_non_replaced['PMID'].astype(str).isin(source_ids_set)
        filtered_df = df_disease_non_replaced[pmid_is_selected]
        filtered_frames.append(filtered_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# no matching file was found.
# NOTE(review): rows are not de-duplicated; a PMID present in more than one
# file is kept once per file - confirm whether that is intended.
if filtered_frames:
    combined_filtered_df = pd.concat(filtered_frames, ignore_index=True)
else:
    combined_filtered_df = pd.DataFrame()



In [13]:
# Preview the first rows of the combined frame. Ending the cell with the bare
# expression (instead of print) lets the notebook render it as a rich table.
combined_filtered_df.head()


       PMID                                              Title  \
0  21992158  Treatment of chronic chest wall pain in a pati...   
1  24045763  A patient with Loeys-Dietz syndrome treated wi...   
2  25203753  Hemodynamic instability during anesthesia in a...   
3  29889773  Spondylolisthesis is Common, Early, and Severe...   
4  23350955  Imaging and percutaneous occlusion of a large ...   

                                            Abstract  
0  OBJECTIVE: Spinal cord stimulation (SCS) has b...  
1  We present the first published case of a patie...  
2  We present a case of a 12-year-old male with L...  
3  BACKGROUND: We studied the prevalence and trea...  
4  Loeys-Dietz is a multisystem congenital syndro...  


In [14]:
# Number of article rows that were collected.
combined_filtered_df.shape[0]

493

In [15]:
# Display the combined frame of selected articles.
combined_filtered_df

Unnamed: 0,PMID,Title,Abstract
0,21992158,Treatment of chronic chest wall pain in a pati...,OBJECTIVE: Spinal cord stimulation (SCS) has b...
1,24045763,A patient with Loeys-Dietz syndrome treated wi...,We present the first published case of a patie...
2,25203753,Hemodynamic instability during anesthesia in a...,We present a case of a 12-year-old male with L...
3,29889773,"Spondylolisthesis is Common, Early, and Severe...",BACKGROUND: We studied the prevalence and trea...
4,23350955,Imaging and percutaneous occlusion of a large ...,Loeys-Dietz is a multisystem congenital syndro...
...,...,...,...
488,31856005,Stickler Syndrome: Airway Complications in a C...,BACKGROUND: Patients with Stickler syndrome of...
489,20545219,[Stickler syndrome with rhegmatogenous retinal...,BACKGROUND: Stickler syndrome is an autosomal ...
490,32536504,Surgical management in a severe OSA patient di...,Stickler syndrome is a genetic disorder of con...
491,26540157,Use of External Distractors and the Role of Im...,IMPORTANCE: Computed tomographic (CT) scans ar...


In [16]:

# Optional: persist the combined frame as a tab-separated file, without the row index
combined_filtered_df.to_csv(
    path_or_buf="combined_filtered_df.tsv",
    sep="\t",
    index=False,
)
