In [1]:
import json
from oaklib import get_adapter
import pandas as pd

# oaklib adapter opened with the "sqlite:obo:apo" selector; the next cell calls
# adapter.label() on it to resolve term labels.
adapter = get_adapter("sqlite:obo:apo")

# Input locations, relative to the notebook's working directory.
sgd_raw_data_path = '../data/PHENOTYPE_SGD.json'
sgd_dosdp_path = '../data/sgd_dosdp.tsv'
sgd_pato_mapping_path = '../data/sgd_pato.sssom.tsv'


# NOTE(review): a later cell of this notebook writes to '../data/sgd_dosdp.tsv',
# so this read fails on a checkout where that file has not been generated yet.
df_sgd_dosdp = pd.read_csv(sgd_dosdp_path, sep='\t')
df_sgd_pato_mapping = pd.read_csv(sgd_pato_mapping_path, sep='\t')

# Read and parse the JSON file.
# The errors are reported and then re-raised: swallowing them would leave
# `json_data` undefined and make the next cell fail with an unrelated NameError.
try:
    with open(sgd_raw_data_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
except FileNotFoundError:
    print(f"File not found: {sgd_raw_data_path}")
    raise
except json.JSONDecodeError:
    print("Error decoding JSON from the file")
    raise

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
 # Extract phenotypeTermIdentifiers
def get_mapped_pato_id(phenotype_term_id_order1, df_sgd_pato_mapping):
    """Look up the mapped term for a phenotype term identifier.

    Args:
        phenotype_term_id_order1: Identifier compared against the
            ``subject_id`` column of the mapping table.
        df_sgd_pato_mapping: DataFrame with ``subject_id``, ``object_id`` and
            ``object_label`` columns.

    Returns:
        A ``(object_id, object_label)`` tuple taken from the first row whose
        ``subject_id`` equals ``phenotype_term_id_order1``.

    Raises:
        IndexError: If no row of the mapping table matches the identifier.
    """
    # Filter once and reuse the result for both columns.
    matches = df_sgd_pato_mapping[df_sgd_pato_mapping['subject_id'] == phenotype_term_id_order1]
    if matches.empty:
        # Same exception type that `.iloc[0]` on an empty selection raises,
        # but with a message that names the offending term.
        raise IndexError(f"No mapping found for phenotype term: {phenotype_term_id_order1!r}")
    return matches['object_id'].iloc[0], matches['object_label'].iloc[0]

# One row per record in json_data["data"]:
# [pato_id, pato_label, order-1 term id, order-1 label, order-2 term id, order-2 label]
data = []

# Labels already resolved through the ontology adapter, keyed by term id, so
# that each distinct term is looked up only once.
term_label_cache = {}

for item in json_data["data"]:
    phenotype_term_id_order1 = None
    phenotype_term_id_order2 = None
    phenotype_term_id_order1_label = None
    phenotype_term_id_order2_label = None

    if "phenotypeTermIdentifiers" in item:
        for pheno_id in item["phenotypeTermIdentifiers"]:

            if pheno_id['termOrder'] == 1:
                # A second order-1 entry on the same record is treated as a data error.
                if phenotype_term_id_order1:
                    raise ValueError(f"Phenotype description has unexpected term item: {pheno_id['termOrder']} ({pheno_id})")
                phenotype_term_id_order1 = pheno_id['termId']
                if not phenotype_term_id_order1:
                    raise ValueError(f"Phenotype description has unexpected term order 1 item: {pheno_id}")
                if phenotype_term_id_order1 not in term_label_cache:
                    term_label_cache[phenotype_term_id_order1] = adapter.label(phenotype_term_id_order1)
                phenotype_term_id_order1_label = term_label_cache[phenotype_term_id_order1]
            elif pheno_id['termOrder'] == 2:
                # A second order-2 entry on the same record is treated as a data error.
                if phenotype_term_id_order2:
                    raise ValueError(f"Phenotype description has unexpected term item: {pheno_id['termOrder']} ({pheno_id})")
                phenotype_term_id_order2 = pheno_id['termId']
                if not phenotype_term_id_order2:
                    raise ValueError(f"Phenotype description has unexpected term order 2 item: {pheno_id}")
                if phenotype_term_id_order2 not in term_label_cache:
                    term_label_cache[phenotype_term_id_order2] = adapter.label(phenotype_term_id_order2)
                phenotype_term_id_order2_label = term_label_cache[phenotype_term_id_order2]
            else:
                raise ValueError(f"Phenotype description has unexpected order: {pheno_id['termOrder']} ({pheno_id})")

    # The mapping lookup below needs an order-1 term. Without this check a
    # record lacking one reaches the lookup with None and fails there with an
    # error that does not identify the record.
    if phenotype_term_id_order1 is None:
        raise ValueError(f"Phenotype description has no term order 1 item: {item}")

    pato_id, pato_label = get_mapped_pato_id(phenotype_term_id_order1, df_sgd_pato_mapping)
    data.append([pato_id, pato_label, phenotype_term_id_order1, phenotype_term_id_order1_label, phenotype_term_id_order2, phenotype_term_id_order2_label])

# Assemble the collected rows into a table; the column order matches the
# order of the values appended to `data`.
df = pd.DataFrame.from_records(data, columns=["pato_id", "pato_id_name", "original_id", "original_label", "affected_entity_1_super", "affected_entity_1_super_name"])

df.to_csv('sgd_phenotype.csv', index=False)

# Preview the table. This is kept as the last expression of the cell so the
# notebook renders it; an expression in the middle of a cell is evaluated but
# not displayed.
df.head()

# TODO (next time): add the generated IDs to the DOSDP table (take the code from Kevin and refactor it).
# TODO (next time): build the DOSDP generate pipeline to create the OWL - ensure that ... (the original note ends mid-sentence here)


Unnamed: 0,pato_id,pato_id_name,original_id,original_label,affected_entity_1_super,affected_entity_1_super_name
0,PATO:0002301,decreased quality,APO:0000003,decreased,APO:0000087,resistance to chemicals
1,PATO:0002301,decreased quality,APO:0000003,decreased,APO:0000087,resistance to chemicals
2,PATO:0002301,decreased quality,APO:0000003,decreased,APO:0000110,competitive fitness
3,PATO:0002301,decreased quality,APO:0000003,decreased,APO:0000110,competitive fitness
4,PATO:0002301,decreased quality,APO:0000003,decreased,APO:0000095,chemical compound accumulation


In [9]:


# Spot-check the mapping lookup for a single term.
# Dedicated names are used so this cell does not overwrite the
# `phenotype_term_id_order1` / `pato_id` variables set by the record loop.
example_term_id = "APO:0000003"

# get_mapped_pato_id returns an (id, label) pair, so unpack both values.
example_pato_id, example_pato_label = get_mapped_pato_id(example_term_id, df_sgd_pato_mapping)
example_pato_id, example_pato_label

0    PATO:0002301
Name: object_id, dtype: object

In [4]:
# Requires `df` from the record-collection cell; run the notebook top to bottom.

# Full table with exact duplicate rows removed.
# NOTE(review): sgd_dosdp_path ('../data/sgd_dosdp.tsv') is also read into
# df_sgd_dosdp by the first cell, so this overwrites one of the notebook's inputs.
df_dedup = df.drop_duplicates()
df_dedup.to_csv(sgd_dosdp_path, sep="\t", index=False)

# Distinct (pato_id, pato_id_name) pairs.
df_pato = df[['pato_id', 'pato_id_name']].drop_duplicates()
df_pato.to_csv("../data/sgd_modifiers.tsv", sep="\t", index=False)

# Distinct (affected_entity_1_super, affected_entity_1_super_name) pairs.
# NOTE(review): records without an order-2 term contribute one row of missing
# values here - confirm whether that row should be dropped.
df_entities = df[['affected_entity_1_super', 'affected_entity_1_super_name']].drop_duplicates()
df_entities.to_csv("../data/sgd_entities.tsv", sep="\t", index=False)

# NOTE: we need more thinking on how to patternise SGD, as they use "normal", "abnormal", etc. a bit differently than uPheno.


NameError: name 'df' is not defined