In [23]:
# %load_ext autoreload
%reload_ext autoreload
%autoreload 2

### Preparing the BERT model used to convert terms to embeddings

In [5]:
# Define the BERT model used to convert sentences to embeddings
from sentence_transformers import SentenceTransformer, util

# Model identifier handed to SentenceTransformer below
# (presumably a pretrained BioBERT NLI checkpoint -- confirm).
BiobertModel = 'gsarti/biobert-nli'
# Sentence-embedding model instance; it is passed to Adan(...) in a later cell.
biobert_model = SentenceTransformer(BiobertModel)



### Creating Adan object

In [24]:
from fc_adan_alignment import Adan

# Build the Adan helper around the sentence-embedding model loaded earlier.
adan = Adan(biobert_model)

In [25]:
# Load the ADAN data and generate its embedding and textual features.
# The call below passes no argument, so (per the original author's note) the
# default file 'resources/adan/adavn_vessels_with_connections.csv' is loaded.
# To index a different file, pass its path instead:
# adan.create_adan_index(adan_file)

adan.create_adan_index()

Batches: 100%|██████████| 29/29 [00:05<00:00,  5.46it/s]
100%|██████████| 921/921 [01:44<00:00,  8.84it/s]
Batches: 100%|██████████| 39/39 [00:07<00:00,  5.53it/s]


### Aligning FC Artery to ADAN

In [10]:
import json
# Path of the FC artery definitions (JSON); a relative path, so it is resolved
# against the kernel's current working directory.
artery_file = 'resources/fc/arteries.json'
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can fail or mis-decode non-ASCII text in the JSON.
with open(artery_file, 'r', encoding='utf-8') as f:
    arteries = json.load(f)

In [11]:
# Record, on every artery, the names of the arteries that list it as their
# 'Parent artery'. The result is stored under the 'Children' key of the
# parent's first metadata entry.
#
# The cell is idempotent: a child is only added when it is not already listed,
# so re-running the cell without reloading `arteries` no longer duplicates
# children (and, downstream, rows of the formatted table).
for artery_name, artery_meta in arteries.items():
    parent_name = artery_meta[0]['Parent artery']
    # Parents that are not themselves keys of `arteries` are ignored.
    if parent_name in arteries:
        known_children = arteries[parent_name][0].setdefault('Children', [])
        if artery_name not in known_children:
            known_children.append(artery_name)

In [34]:
import pandas as pd

# Flatten the artery dictionary into ADAN-like columns: one row per
# (artery, child) pair, or a single row without a child for a leaf artery.
formatted_arteries = {'id': [], 'name': [], 'parent': [], 'children': [], 'ontology id': []}
for artery_name, artery_meta in arteries.items():
    meta = artery_meta[0]
    child_names = meta.get('Children', [])
    # An artery without children still contributes one row, with None as child.
    row_children = child_names if len(child_names) != 0 else [None]
    for child in row_children:
        # The id is the running row count, stored as a string.
        formatted_arteries['id'].append(str(len(formatted_arteries['id'])))
        formatted_arteries['name'].append(artery_name)
        formatted_arteries['parent'].append(meta['Parent artery'])
        formatted_arteries['children'].append(child)
        formatted_arteries['ontology id'].append(meta['Ontology ID'])

df_artery = pd.DataFrame(formatted_arteries)

In [49]:
## Try to align based on name and parent only

from tqdm import tqdm

# A top candidate scoring at least this much is accepted on its own.
CONFIDENT_SCORE = 0.99

allign_columns = ['name', 'parent', 'ontology id', 'adan (name, parent)']
# One row per distinct FC artery; the 'children' column plays no part here.
df = df_artery[['name', 'parent', 'ontology id']].drop_duplicates(ignore_index=True)

# Rows are collected in a plain list and the frame is built once at the end.
# Writing into the row copies yielded by iterrows() and growing a DataFrame
# with `.loc[len(df)] = row` re-allocates on every insertion (quadratic), and
# the extra reset_index() column was only being dropped silently on assignment.
alligned_records = []
for row in tqdm(df.to_dict('records')):
    # get candidate
    if row['parent'] != 'NA':
        candidates = adan.get_similar_adan_name_parent((row['name'], row['parent']))
    else:
        # No FC parent: match on the name alone, then attach each candidate's
        # ADAN parent so both branches yield ((name, parent), score) entries.
        candidates = [
            ((c[0], adan.get_parent_name(c[0])), c[1])
            for c in adan.get_similar_adan_term(row['name'])
        ]
    # remove other candidate when confident with the first one
    # NOTE(review): assumes the lookup always returns at least one candidate.
    if candidates[0][1] >= CONFIDENT_SCORE:
        candidates = (candidates[0],)
    alligned_records.append({**row, 'adan (name, parent)': candidates})

df_alligned = pd.DataFrame(alligned_records, columns=allign_columns)
    
    

255it [00:50,  5.06it/s]


In [51]:
## Completing the dataframe with ADAN's name, parent, and confidence level

def get_adan_name(x):
    """Return the ADAN name of a ((name, parent), confidence) candidate."""
    name_parent = x[0]
    return name_parent[0]

def get_adan_parent(x):
    """Return the ADAN parent of a ((name, parent), confidence) candidate."""
    name_parent = x[0]
    return name_parent[1]

def get_adan_confidence(x):
    """Return the confidence of a ((name, parent), confidence) candidate."""
    return x[1]

# One candidate per row, exact repeats removed, FC columns given explicit names.
data = (
    df_alligned
    .explode('adan (name, parent)')
    .drop_duplicates()
    .rename(columns={'name':'FC name', 'parent':'FC parent', 'ontology id':'FC ontology'})
)
# Split each candidate into its three components.
candidate_col = data['adan (name, parent)']
data['ADAN name'] = candidate_col.apply(get_adan_name)
data['ADAN parent'] = candidate_col.apply(get_adan_parent)
data['Confidence'] = candidate_col.apply(get_adan_confidence)

In [52]:
## Adding some notes

# Start with an empty 'Note' column; it is overwritten further down in this
# cell once _get_note has been applied to every row.
data['Note'] = ''
def _get_note(row):
    if row['Confidence'] >= 0.99:
        name_fc = adan.get_similar_adan_term(row['FC name'], top_k=1)[0]
        name_adan = adan.get_similar_adan_term(row['ADAN name'], top_k=1)[0]
        if name_fc[0] == name_adan[0]:
            note = 'align'
        else:
            parent_fc = adan.get_similar_adan_term(row['FC parent'], top_k=1)[0]
            parent_adan = adan.get_similar_adan_term(row['ADAN parent'], top_k=1)[0]
            if parent_fc[0] == parent_adan[0]:
                note = 'align'
            else:
                note = 'need to check parent-child relation'
        return note
    return ''
# Evaluate _get_note row by row (axis=1) and store the results in 'Note'.
data['Note'] = data.apply(_get_note, axis=1)

In [54]:
## Formatting the generated results and storing them in FC_ADAN.csv in the output folder

import os

out_columns = ['ID', 'FC name', 'FC parent', 'FC ontology', 'ADAN name', 'ADAN parent', 'Confidence', 'Note']
out_path = 'output/FC_ADAN.csv'

# Consecutive rows that share the previous row's FC name and FC parent are
# further ADAN candidates for the same FC artery. Their FC columns are left
# blank so that each artery's details are printed only on its first row.
# Rows are gathered in a list and the frame is built once, instead of growing
# it with `.loc[len(df_out)] = ...` on every iteration (quadratic).
out_rows = []
prev_row = {'FC name':None, 'FC parent':None}
for idx, row in data.iterrows():
    if row['FC name'] != prev_row['FC name'] or row['FC parent'] != prev_row['FC parent']:
        prev_row['FC name'] = row['FC name']
        prev_row['FC parent'] = row['FC parent']
        fc_fields = [row['FC name'], row['FC parent'], row['FC ontology']]
    else:
        fc_fields = ['', '', '']
    # 'ID' is the row's index label in `data`.
    out_rows.append([idx, *fc_fields, row['ADAN name'], row['ADAN parent'], row['Confidence'], row['Note']])

df_out = pd.DataFrame(out_rows, columns=out_columns)

# Create the output folder if needed so the export also works on a fresh checkout.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_out.to_csv(out_path, index=False)