In [113]:
import requests
import simplejson as json
import pandas as pd
import numpy as np
import os
import json
import math

In [3]:
# Locate the config file next to the notebook. The notebook name is relative,
# so abspath() resolves it against the current working directory.
notebook_path = os.path.abspath("OLS matching.ipynb")
notebook_dir, _notebook_file = os.path.split(notebook_path)
config_path = os.path.join(notebook_dir, "Data/config0.json")

In [41]:
# Load the run configuration. The encoding is given explicitly so the JSON is
# decoded the same way on every platform instead of using the locale default.
with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)
config

{'asctb_sid': '1tK916JyG5ZSXW_cXfsyZnzXfjyoN-8B2GXLbYD6_vF0',
 'references': [{'name': 'lung',
   'url': 'https://hubmapconsortium.github.io/asctb-azimuth-data-comparison/lung.csv',
   'organ_name': 'lung',
   'asctb_sheet_name': 'Lung_v1.1'},
  {'name': 'pancreas',
   'url': 'https://hubmapconsortium.github.io/asctb-azimuth-data-comparison/pancreas.csv',
   'organ_name': 'pancreas',
   'asctb_sheet_name': 'Pancreas_v1.0'},
  {'name': 'kidney',
   'url': 'https://hubmapconsortium.github.io/asctb-azimuth-data-comparison/kidney.csv',
   'organ_name': 'kidney',
   'asctb_sheet_name': 'Kidney_v1.1'},
  {'name': 'brain',
   'url': 'https://hubmapconsortium.github.io/asctb-azimuth-data-comparison/motor_cortex.csv',
   'organ_name': 'brain',
   'asctb_sheet_name': 'Brain_v1.1'},
  {'name': 'bone_marrow',
   'url': 'https://hubmapconsortium.github.io/asctb-azimuth-data-comparison/bone_marrow.csv',
   'organ_name': 'bone_marrow',
   'asctb_sheet_name': 'Bone_Marrow_v1.1'},
  {'name': 'blood_pmb

In [51]:
# Spreadsheet ID read from the config; it is passed to fetch_asctb(), which
# places it in the docs.google.com CSV export URL.
asctb_sheet_id = config["asctb_sid"]

In [53]:
# Fetch Azimuth Data
def fetch_azimuth(az_url):
    """Read an Azimuth CSV and flatten it into (CT/ID, CT/LABEL) rows.

    The first 10 lines of the CSV are skipped. Every column whose name
    contains "ID" is stacked, in column order, into one "CT/ID" column, and
    every column whose name ends in "AS/<digit>" is stacked into one
    "CT/LABEL" column. The two stacked columns are then placed side by side.

    Parameters
    ----------
    az_url : anything accepted by ``pandas.read_csv`` (URL, path, file object)

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        All stacked rows, and the same rows with duplicates dropped and the
        index renumbered from 0.
    """
    reference = pd.read_csv(az_url, skiprows=10)

    def _stack(columns, target_name):
        # Concatenate the values of every selected column into one long column.
        stacked = [value for name in columns for value in columns[name].tolist()]
        return pd.DataFrame(stacked).rename(columns={0: target_name})

    id_column = _stack(reference.filter(regex="ID"), "CT/ID")
    label_column = _stack(reference.filter(regex="AS/[0-9]$"), "CT/LABEL")

    all_pairs = pd.concat([id_column, label_column], axis=1)
    unique_pairs = all_pairs.drop_duplicates().reset_index(drop=True)

    return all_pairs, unique_pairs
    

In [93]:
# Fetch Asctb Data
def fetch_asctb(sheet_id,asctb_sheet_name):
    """Read one sheet of a Google spreadsheet as CSV and flatten it into
    (CT/ID, CT/LABEL) rows.

    The first 3 lines of the CSV are skipped. Every column matching
    ``^CT.*ID$`` is stacked, in column order, into one "CT/ID" column, and
    every column ending in "CT/<digit>" is stacked into one "CT/LABEL"
    column. The two stacked columns are then placed side by side.

    Parameters
    ----------
    sheet_id : str
        Spreadsheet ID placed in the docs.google.com URL.
    asctb_sheet_name : str
        Name of the sheet to export. It is percent-encoded before being put
        in the query string, so names containing spaces, "&" or "#" produce
        a valid URL. Existing "%" escapes are left untouched.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        All stacked rows, and the same rows with duplicates dropped and the
        index renumbered from 0.
    """
    # Local import so this cell does not depend on a change to the import cell.
    from urllib.parse import quote

    # safe="%" keeps already-encoded names (e.g. "My%20Sheet") unchanged.
    encoded_sheet_name = quote(str(asctb_sheet_name), safe="%")
    sheet_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    asctb_df = pd.read_csv(sheet_url, skiprows=3)

    asctb_ct = asctb_df.filter(regex="^CT.*ID$")
    asctb_label = asctb_df.filter(regex="CT/[0-9]$")

    # Stack the values of every selected column into one long list each.
    all_ids = [value for col in asctb_ct for value in asctb_ct[col].tolist()]
    all_labels = [value for col in asctb_label for value in asctb_label[col].tolist()]

    asctb_all_cts = pd.DataFrame(all_ids).rename(columns={0: "CT/ID"})
    asctb_all_label = pd.DataFrame(all_labels).rename(columns={0: "CT/LABEL"})

    asctb_all_cts_label = pd.concat([asctb_all_cts, asctb_all_label], axis=1)
    asctb_all_cts_label_unique = asctb_all_cts_label.drop_duplicates().reset_index(drop=True)

    return asctb_all_cts_label, asctb_all_cts_label_unique

In [209]:
# Find incorrect azimuth CT
def incorrect_ct(azimuth_all_cts_label):
    """Return the rows whose cell-type ID does not start with "CL:".

    The ID is taken from the first column whose name matches ``^CT.*ID$``.
    Each value is converted with ``str()`` before the prefix check, so a
    missing ID (NaN) is also reported as incorrect.

    Rows are selected by position, so the function works for any index on
    the input frame, not only the default 0..n-1 index.

    Parameters
    ----------
    azimuth_all_cts_label : DataFrame
        Frame containing a CT ID column (e.g. "CT/ID").

    Returns
    -------
    DataFrame
        The offending rows with duplicates dropped and the index renumbered
        from 0.
    """
    id_rows = azimuth_all_cts_label.filter(regex="^CT.*ID$").values.tolist()

    # Positions (not index labels) of rows whose ID lacks the "CL:" prefix.
    bad_positions = [
        pos for pos, row in enumerate(id_rows) if str(row[0])[:3] != "CL:"
    ]

    return (
        azimuth_all_cts_label.iloc[bad_positions]
        .drop_duplicates()
        .reset_index(drop=True)
    )

In [229]:
def check_in_asctb(cl_az,i,asctb_kidney_all_cts_label_unique,az_row_all,asctb_row_all,not_matching_all):
    """Record every row of the ASCTB table whose "CT/ID" equals ``cl_az``.

    For each matching row ``j``, ``i`` is appended to ``az_row_all`` and
    ``j`` to ``asctb_row_all``. If nothing matches, ``i`` is appended to
    ``not_matching_all``. The three lists are modified in place and nothing
    is returned.
    """
    asctb_ids = asctb_kidney_all_cts_label_unique['CT/ID']
    hits = [j for j in range(len(asctb_ids)) if cl_az == asctb_ids[j]]

    if not hits:
        not_matching_all.append(i)
        return

    # One (i, j) pair per hit, kept as two parallel lists.
    az_row_all.extend([i] * len(hits))
    asctb_row_all.extend(hits)

In [251]:
def check_in_az(cl_asctb,i,az_kidney_all_cts_label_unique,az_row,asctb_row,not_matching):
    """Record the first row of the Azimuth table whose "CT/ID" equals ``cl_asctb``.

    On a match at row ``j``, ``j`` is appended to ``az_row`` and ``i`` to
    ``asctb_row``; later matches are ignored. If nothing matches, ``i`` is
    appended to ``not_matching``. The three lists are modified in place and
    nothing is returned.
    """
    az_ids = az_kidney_all_cts_label_unique['CT/ID']
    first_hit = next(
        (j for j in range(len(az_ids)) if cl_asctb == az_ids[j]),
        None,
    )

    if first_hit is None:
        not_matching.append(i)
    else:
        az_row.append(first_hit)
        asctb_row.append(i)

In [289]:
def perfect_match_for_azimuthct_in_asctb(azimuth_all_cts_label_unique,asctb_all_cts_label_unique):
    """Pair every Azimuth cell type with the ASCTB rows that share its ID.

    An Azimuth row takes part in matching only when its "CT/ID" is a string
    starting with "CL:". It is paired with every ASCTB row whose "CT/ID" is
    equal to it. Azimuth rows with a missing or non-"CL:" ID, or with no
    equal ASCTB ID, are reported as mismatches.

    Rows are addressed by position, so both frames may carry any index.

    Parameters
    ----------
    azimuth_all_cts_label_unique : DataFrame with "CT/ID" and "CT/LABEL"
    asctb_all_cts_label_unique : DataFrame with "CT/ID" and "CT/LABEL"

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``perfect_matches_all``: one row per distinct (Azimuth, ASCTB) pair,
        with columns "AZ.CT/ID", "AZ.CT/LABEL", "ASCTB.CT/ID" and
        "ASCTB.CT/LABEL".
        ``az_mismatches_all``: the distinct unmatched Azimuth rows.
        Both have their index renumbered from 0.
    """
    az_ids = azimuth_all_cts_label_unique['CT/ID'].tolist()
    asctb_ids = asctb_all_cts_label_unique['CT/ID'].tolist()

    # Map each ASCTB ID to all positions where it occurs, so each Azimuth ID
    # needs one lookup instead of a scan over the whole ASCTB table.
    asctb_positions_by_id = {}
    for pos, ct_id in enumerate(asctb_ids):
        if isinstance(ct_id, str):
            asctb_positions_by_id.setdefault(ct_id, []).append(pos)

    az_row_all = []
    asctb_row_all = []
    not_matching_all = []
    for pos, ct_id in enumerate(az_ids):
        # isinstance() rejects NaN, None and any other non-string value.
        if isinstance(ct_id, str) and ct_id.startswith("CL:"):
            hits = asctb_positions_by_id.get(ct_id, [])
        else:
            hits = []

        if hits:
            az_row_all.extend([pos] * len(hits))
            asctb_row_all.extend(hits)
        else:
            not_matching_all.append(pos)

    az_matches_all = (
        azimuth_all_cts_label_unique.iloc[az_row_all]
        .reset_index(drop=True)
        .rename(columns={"CT/ID": "AZ.CT/ID", "CT/LABEL": "AZ.CT/LABEL"})
    )
    asctb_matches_all = (
        asctb_all_cts_label_unique.iloc[asctb_row_all]
        .reset_index(drop=True)
        .rename(columns={"CT/ID": "ASCTB.CT/ID", "CT/LABEL": "ASCTB.CT/LABEL"})
    )

    perfect_matches_all = (
        pd.concat([az_matches_all, asctb_matches_all], axis=1)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    az_mismatches_all = (
        azimuth_all_cts_label_unique.iloc[not_matching_all]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return perfect_matches_all, az_mismatches_all

In [290]:
def perfect_match_for_asctbct_in_azimuth(azimuth_all_cts_label_unique,asctb_kidney_all_cts_label_unique):
    """Return the ASCTB rows that have no matching cell-type ID in Azimuth.

    An ASCTB row counts as matched only when its "CT/ID" is a string
    starting with "CL:" and an equal "CT/ID" exists in the Azimuth frame.
    Every other row (missing ID, non-"CL:" ID, or ID absent from Azimuth)
    is returned.

    Rows are addressed by position, so both frames may carry any index.

    Parameters
    ----------
    azimuth_all_cts_label_unique : DataFrame with a "CT/ID" column
    asctb_kidney_all_cts_label_unique : DataFrame with a "CT/ID" column

    Returns
    -------
    DataFrame
        The unmatched ASCTB rows, in their original order, with the index
        renumbered from 0.
    """
    # Only string IDs can ever equal an ASCTB "CL:..." string.
    azimuth_ids = {
        ct_id
        for ct_id in azimuth_all_cts_label_unique['CT/ID'].tolist()
        if isinstance(ct_id, str)
    }

    not_matching = [
        pos
        for pos, ct_id in enumerate(asctb_kidney_all_cts_label_unique['CT/ID'].tolist())
        if not (isinstance(ct_id, str) and ct_id.startswith("CL:") and ct_id in azimuth_ids)
    ]

    asctb_mismatches = asctb_kidney_all_cts_label_unique.iloc[not_matching].reset_index(drop=True)

    return asctb_mismatches
    

In [304]:
def incorrect_cts_ebi(mismatches):
    """Return the rows whose "CT/ID" could not be confirmed by the EBI OLS API.

    Each string ID has ":" replaced by "_" and is appended to the OLS
    term-lookup URL. A row is kept when the response status is anything
    other than 200, or when the ID is not a string (e.g. NaN), in which
    case no request is made.

    Each distinct ID is requested only once, and every request has a
    timeout so an unresponsive server cannot stall the notebook. Rows are
    addressed by position, so the input frame may carry any index.

    Parameters
    ----------
    mismatches : DataFrame with a "CT/ID" column

    Returns
    -------
    DataFrame
        The rows not found in OLS, in their original order, with the index
        renumbered from 0.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails or times out; it is not caught here.
    """
    url = "http://www.ebi.ac.uk/ols/api/ontologies/cl/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2F"
    headers = {
        'Accept': 'application/json'
    }
    request_timeout_seconds = 30

    found_by_id = {}  # ID -> True if OLS answered 200 for it
    not_found_in_ols = []

    for pos, ct_id in enumerate(mismatches['CT/ID'].tolist()):
        if not isinstance(ct_id, str):
            not_found_in_ols.append(pos)
            continue

        if ct_id not in found_by_id:
            response = requests.get(
                url + ct_id.replace(":", "_"),
                headers=headers,
                timeout=request_timeout_seconds,
            )
            # NOTE(review): any non-200 status (including server errors) is
            # treated as "not found", as in the original logic.
            found_by_id[ct_id] = response.status_code == 200

        if not found_by_id[ct_id]:
            not_found_in_ols.append(pos)

    az_not_found_in_ols = mismatches.iloc[not_found_in_ols].reset_index(drop=True)

    return az_not_found_in_ols

In [305]:
# parse config file
# Process the references listed in the config and compare Azimuth with ASCTB.
# The results are plain variables that are overwritten on every iteration, so
# after the loop they hold the values for the last reference processed.

# How many references to process before stopping. The original stop condition
# had lost its number (`if ct==:`), which is a syntax error; 1 matches the
# recorded output, which lists only the first reference. Set to None to
# process every reference.
max_references = 1

ct=0
for ref in config['references']:
    name= ref['name']
    asctb_sheet_name = ref['asctb_sheet_name']
    az_url= ref['url']

    # Fetch Azimuth data
    azimuth_all_cts_label,azimuth_all_cts_label_unique = fetch_azimuth(az_url)

    # Fetch ASCTB data
    asctb_all_cts_label,asctb_all_cts_label_unique = fetch_asctb(asctb_sheet_id,asctb_sheet_name)

    # Azimuth rows that have a label but no ID
    azimuth_missing_cts=azimuth_all_cts_label_unique[azimuth_all_cts_label_unique['CT/ID'].isna() & ~azimuth_all_cts_label_unique['CT/LABEL'].isna()].reset_index(drop=True)

    # ASCTB rows that have a label but no ID
    asctb_missing_cts=asctb_all_cts_label_unique[asctb_all_cts_label_unique['CT/ID'].isna() & ~asctb_all_cts_label_unique['CT/LABEL'].isna()].reset_index(drop=True)

    # Incorrect CT ID in Azimuth
    azimuth_incorrect_ct_ids=incorrect_ct(azimuth_all_cts_label)

    # Incorrect CT ID in Asctb
    asctb_incorrect_ct_ids=incorrect_ct(asctb_all_cts_label)

    # Perfect Match and Mismatch for Azimuth CT in ASCTB (AZ - ASCTB)
    azimuth_perfect_matches,azimuth_mismatches=perfect_match_for_azimuthct_in_asctb(azimuth_all_cts_label_unique,asctb_all_cts_label_unique)

    # Mismatch for ASCTB CT in Azimuth (ASCTB - Azimuth)
    asctb_mismatches=perfect_match_for_asctbct_in_azimuth(azimuth_all_cts_label_unique,asctb_all_cts_label_unique)

    # Incorrect CT ID in Azimuth (EBI)
    incorrect_ct_azimuth_ebi=incorrect_cts_ebi(azimuth_mismatches)

    # Incorrect CT ID in Asctb (EBI)
    incorrect_ct_asctb_ebi=incorrect_cts_ebi(asctb_mismatches)

    ct+=1
    print(name)
    if max_references is not None and ct>=max_references:
        break
    

lung


In [307]:
# Display the ASCTB rows returned by incorrect_cts_ebi() for the last
# reference processed by the loop.
incorrect_ct_asctb_ebi

Unnamed: 0,CT/ID,CT/LABEL
0,CL:1000388,brush
1,LMHA:00142,submucosal gland ciliated duct cells
2,LMHA:00087,submucosal gland basal cells
3,LMHA:00143,submucosal gland secretory
4,LMHA:00693,submucosal gland collecting duct epithelium
5,LMHA:00238,submucosal gland mucous cells
6,LMHA:00340,submucosal gland serous cells
7,LMHA:00805,venous endothelial cell
8,LMHA:00805,Pulmonary vein endothelial cell
9,,Pulmonary vein smooth muscle cell
