# Create training triples from the SNOMED ontology

## 0 | Setup

In [None]:
import pandas as pd
# Location of the SNOMED CT release files. The file names below are appended
# directly to this string, so it must end with a path separator.
snomed_dir = '---------' # Point this to the SNOMED source files

# Load the three tab-separated snapshot tables used by the rest of the notebook.
# All three belong to the same GB release (20240410), so the description file
# carries the `en_GB` suffix like its siblings.
concepts = pd.read_csv(snomed_dir + 'sct2_Concept_MONOSnapshot_GB_20240410.txt', sep='\t')
description = pd.read_csv(snomed_dir + 'sct2_Description_MONOSnapshot-en_GB_20240410.txt', sep='\t')
relationship = pd.read_csv(snomed_dir + 'sct2_Relationship_MONOSnapshot_GB_20240410.txt', sep='\t')

In [None]:
# Create dict to map SNOMED concept IDs to text terms (active descriptions only)
description = description[ description['active']==1 ]

# The description table's concept column is spelled `conceptId`, with the same
# casing as `sourceId` / `typeId` / `destinationId` in the relationship table.
# When a concept has several active descriptions, the last row read wins,
# because later keys overwrite earlier ones when the dict is built.
id_to_concept = dict( zip( description['conceptId'], description['term'] ) )

## 1 | Get transitive closure

The transitive closure file is useful for finding all entities of a given type, and therefore filtering the entire input set.

Transitive closure relations aren't included in SNOMED distributions. Download the transitive closure script from https://confluence.ihtsdotools.org/display/DOCRELFMT/Transitive+closure+file and run it against the SNOMED files before continuing.

In [None]:
# Read in output of the transitive closure script.
# The file has no header row; each line is read as a (head, tail) pair.
transitive = pd.read_csv('transitive/transitive.txt', sep='\t', header=None, names=['head','tail'])

# Categories of entity to include in training:
# Anything that has an IS A relationship to one of these entities (directly or transitively) will be included in training
inclusion_categories = [
    123037004,
    260787004,
    404684003,
    272379006,
    71388002,
    373873005,
    49062001,
    105590001
]
print('Including ', [id_to_concept[i] for i in inclusion_categories])

# Find all the children of these categories: every closure row whose tail is
# one of the selected categories.
transitive_children = transitive[ transitive['tail'].isin(inclusion_categories) ]

# Collect both ends of those rows (the children and the categories themselves).
# A plain set union is used because `Series.append` was removed in pandas 2.0.
entities_to_include = set(transitive_children['head']) | set(transitive_children['tail'])
print(len(entities_to_include),' child entities found')

In [None]:
# Turn every transitive-closure pair into a (head, relation, tail) triple.
# The relation column is the same text label on every row: the term stored
# for concept 116680003.
transitive_out = (
    transitive[['head', 'tail']]
    .assign(relation=id_to_concept[116680003])
    [['head', 'relation', 'tail']]
)

transitive_out

In [None]:
# Keep the closure triples that touch the selected entities.
# NOTE(review): a row is kept when EITHER end is in `entities_to_include`,
# whereas the filters in section 2 require BOTH ends - confirm this is intended.
transitive_out_selected = transitive_out.loc[
    transitive_out['tail'].isin(entities_to_include)
    | transitive_out['head'].isin(entities_to_include)
]

# Row counts before and after the selection
print(transitive_out.shape[0])
print(transitive_out_selected.shape[0])

# Write the selected triples as tab-separated text, without header or index
transitive_out_selected.to_csv(
    'out/triples_transitive.txt',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Human-readable copy of the full closure table: both IDs replaced by their terms.
# An ID that is missing from `id_to_concept` raises KeyError.
transitive_named = pd.DataFrame({
    'head': list(map(id_to_concept.__getitem__, transitive['head'])),
    'tail': list(map(id_to_concept.__getitem__, transitive['tail'])),
})

# Same translation, restricted to the rows under the inclusion categories.
transitive_children_named = pd.DataFrame({
    'head': list(map(id_to_concept.__getitem__, transitive_children['head'])),
    'tail': list(map(id_to_concept.__getitem__, transitive_children['tail'])),
})

# Number of rows per category term
transitive_children_named['tail'].value_counts()

In [None]:
# Reshape transitive relations to a dataframe with the same column names as
# the SNOMED relationship table (sourceId / typeId / destinationId), so the
# same filters can be applied to both.

transitive_relations_df = pd.DataFrame(
    data = {
        'sourceId' : transitive['head'],
        # 116680003 is the concept ID this notebook uses for the closure
        # relation label (see the `id_to_concept[116680003]` lookup).
        'typeId' : 116680003,
        'destinationId' : transitive['tail']
    }
)
transitive_relations_df.head()


## 2 | Filter the set of triples

In [None]:
# Filter the source SNOMED table to only contain entities in the selected categories
print('Source SNOMED table has ',len(relationship),' relationships')

# Keep active rows only, reduced to the three triple columns.
filtered_relationship = relationship.loc[
    relationship['active']==1,
    ['sourceId','typeId','destinationId']
]
# Both ends of a relationship must belong to the selected entities.
filtered_relationship = filtered_relationship[
    filtered_relationship['sourceId'].isin(entities_to_include)
    & filtered_relationship['destinationId'].isin(entities_to_include)
]

# Apply the same to the transitive closure file
print('Transitive closure table table has ',len(transitive_relations_df),' relationships')
filtered_transitive_relations_df = transitive_relations_df[
    transitive_relations_df['sourceId'].isin(entities_to_include)
    & transitive_relations_df['destinationId'].isin(entities_to_include)
]

# Only the stated relationships are carried forward from here.
# NOTE(review): `filtered_transitive_relations_df` is computed above but is not
# added to `all_relationships` - confirm whether the closure rows were meant to
# be merged in at this point.
# `.copy()` gives an independent frame, so the column rewrite below does not
# write into a filtered slice (avoids pandas' SettingWithCopyWarning).
all_relationships = filtered_relationship.copy()

# Replace the numeric relation type with its text term
all_relationships['typeId'] = [id_to_concept[i] for i in all_relationships['typeId']]
all_relationships

In [None]:
# Show the number of rows in all_relationships for each relation type
all_relationships['typeId'].value_counts()

In [None]:
# Relation types that are bidirectional and therefore get no inverse triple
exclusion_relations = [
    'Associated with (attribute)',
    'Temporally related to',
]

# Everything else is eligible for inversion
all_relationships_to_invert = all_relationships.loc[
    ~all_relationships['typeId'].isin(exclusion_relations)
]

In [None]:
# Build the inverse of every invertible triple: swap source and destination
# and prefix the relation name with 'INVERSE_'.
# The column names must match `all_relationships` exactly (`destinationId`),
# otherwise combining the two frames creates an extra, mostly-empty column.
inverse_relationships = pd.DataFrame(data = {
    'sourceId': all_relationships_to_invert['destinationId'],
    'typeId' : 'INVERSE_'+all_relationships_to_invert['typeId'],
    'destinationId' : all_relationships_to_invert['sourceId']
})
inverse_relationships

In [None]:
# Stack the original triples and their inverses into one table.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement
# and, like `append`, keeps each frame's original index labels.
all_relationships_and_inverse = pd.concat([all_relationships, inverse_relationships])

# Report, then drop, rows that are exact repeats of an earlier row
print('Removed ', int(all_relationships_and_inverse.duplicated().sum()),' duplicates')
all_relationships_and_inverse = all_relationships_and_inverse.drop_duplicates()

print('Final length: ', len(all_relationships_and_inverse))

## 3 | Output the triples 

In [None]:
triples = all_relationships_and_inverse

# Both files are written as tab-separated text with no header row and no index.

# Case 1: all original triples
case1 = all_relationships
case1.to_csv('out/triples_case1.txt', sep='\t', header=None, index=None)

# Case 2: all original triples, plus inverse relations
case2 = triples
case2.to_csv('out/triples_case2.txt', sep='\t', header=None, index=None)