In [None]:
import pandas as pd
from rdflib import Graph, RDF, Namespace

In [None]:
# Merge files
# The three GWAS text-mining association tables share one URL layout in the
# nih-cfde/ReproToxTables repository and differ only in the disease group
# embedded in the file name.
GWAS_TABLE_URL = (
    'https://raw.githubusercontent.com/nih-cfde/ReproToxTables/main/'
    'Susceptibility%20Scores%20and%20GWAS%20Gene%20Lists/GWAS%20Phenotype%20Gene/'
    'Text%20Mining-GWAS-{}%20Disease%20Phenotype%20Gene%20Associations.csv'
)


def _read_gwas_table(disease_group):
    """Download the association table for one (URL-encoded) disease group."""
    return pd.read_csv(GWAS_TABLE_URL.format(disease_group))


df1 = _read_gwas_table('CNS')
df2 = _read_gwas_table('Great%20Vessels')
df3 = _read_gwas_table('Heart')
df_c = pd.concat([df1, df2, df3])

In [None]:
# Stack the three association tables into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it every
# source file contributes its own 0..k labels, so row labels repeat.
df_c = pd.concat([df1, df2, df3], ignore_index=True)
# Normalise term casing (first character upper-case, the rest lower-case) so
# later grouping on 'HPO Term' is not split by casing differences.
df_c['HPO Term'] = df_c['HPO Term'].str.capitalize()
df_c

In [None]:
# Extract the HPO term -> HPO accession mapping (one row per distinct pair).
# DataFrame.to_csv returns None when it is given a path, so the frame is kept
# in df_dis and written out in a separate statement.
df_dis = df_c[['HPO Term', 'HPO Accession']].drop_duplicates()
df_dis.to_csv('hpo_mappings.tsv', sep='\t', index=False)

In [None]:
# Aggregate genes by HPO term and write one gene set per line.
#
# Rows without a term or a gene are dropped ('\t'.join cannot handle NaN), the
# term casing is normalised *before* grouping so case variants land in the same
# group, and repeated (term, gene) pairs are collapsed so a gene is listed once
# per set.
term_gene_pairs = (
    df_c[['HPO Term', 'Gene Symbol']]
    .dropna()
    .assign(**{'HPO Term': lambda pairs: pairs['HPO Term'].str.capitalize()})
    .drop_duplicates()
)
# df_c is rebound to the aggregated frame: one row per term, with the genes of
# that term joined by tabs in 'Gene Symbol'.
df_c = (
    term_gene_pairs
    .groupby('HPO Term')
    .agg({'Gene Symbol': '\t'.join})
    .reset_index()
)

# The file is written by hand rather than with DataFrame.to_csv: with
# sep='\t' the csv writer wraps the tab-joined gene field in double quotes
# (it contains the separator) and also emits a header row, neither of which
# belongs in a GMT file. Each line is: term, empty description, genes...
with open('Text_Mining_GWAS.gmt', 'w', encoding='utf-8') as gmt_file:
    for term, genes in zip(df_c['HPO Term'], df_c['Gene Symbol']):
        gmt_file.write(f'{term}\t\t{genes}\n')

In [None]:
# Manual prerequisite: merge hpo_mappings.tsv into defects_hpo_mappings.tsv and
# de-duplicate the result by hand before running this cell.
# The first column of the file becomes the index of the frame.
defects_map = pd.read_csv(
    'defects_hpo_mappings.tsv',
    sep='\t',
    index_col=0,
)

In [None]:
# IRI roots for the vocabularies used when building triples.
# NOTE(review): all four roots use https://; the ontologies' own published term
# IRIs may use http:// instead -- confirm against the consumers of the .ttl
# before relying on IRI equality with those ontologies.
gene_symbol = Namespace("https://identifiers.org/hgnc.symbol/")
relationship = Namespace("https://semanticscience.org/resource/")
phenotype = Namespace("https://purl.obolibrary.org/obo/")
disease = Namespace("https://www.orpha.net/ORDO/")

# Empty graph that the following cells populate and serialise.
g = Graph()

In [None]:
# Register a short prefix for each namespace on the graph, in a fixed order.
for prefix, namespace in (
    ('gene_symbol', gene_symbol),
    ('disease', disease),
    ('relationship', relationship),
    ('phenotype', phenotype),
):
    g.bind(prefix, namespace)

In [None]:
# Turn every gene set in Text_Mining_GWAS.gmt into RDF triples on `g`:
#   <subject> rdf:type <SIO class>        (once per gene set)
#   <subject> SIO_000983 <gene symbol>    (once per gene)
# The subject is an Orphanet IRI when the mapped id starts with 'ORPHA',
# otherwise an IRI in the `phenotype` namespace.
with open('Text_Mining_GWAS.gmt', 'r') as gmt_file:
    for raw_line in gmt_file:
        fields = raw_line.rstrip('\r\n').split('\t')
        term = fields[0].strip()
        # Skip blank lines and a 'HPO Term' header row if the file has one;
        # neither is a gene set.
        if not term or term == 'HPO Term':
            continue

        # Everything after the term is a gene symbol. Tokens are stripped of
        # surrounding whitespace and of double quotes (a csv writer quotes a
        # field that contains tabs); empty tokens -- e.g. a blank GMT
        # description column -- are dropped so they never become an IRI.
        genes = [
            token
            for token in (field.strip().strip('"').strip() for field in fields[1:])
            if token
        ]

        try:
            hp_id = defects_map.loc[term, 'hp']
        except KeyError as err:
            raise KeyError(
                f"No entry for {term!r} in defects_hpo_mappings.tsv"
            ) from err

        if hp_id.startswith('ORPHA'):
            # 'ORPHA:123' -> 'Orphanet_123' in the disease namespace.
            local_id = hp_id.replace(':', '_').replace('ORPHA', 'Orphanet')
            subject = disease[local_id]
            subject_class = relationship['SIO_010299']
        else:
            # e.g. 'HP:0001234' -> 'HP_0001234' in the phenotype namespace.
            subject = phenotype[hp_id.replace(':', '_')]
            subject_class = relationship['SIO_010056']

        g.add((subject, RDF.type, subject_class))
        # Both kinds of subject get the same gene handling, so the last gene
        # on a line no longer keeps its trailing newline in either case.
        for gene in genes:
            g.add((subject, relationship['SIO_000983'], gene_symbol[gene]))

In [None]:
# Write the populated graph to disk in Turtle syntax.
g.serialize(
    destination="Text_Mining_GWAS.ttl",
    format="turtle",
)