### Goals:
1. Extract the BioPortal Ethnicity Ontology
2. Process the data
3. Extract the SNOMED Ethnic Group Ontology 
4. Process the data
5. Combine the ontologies
6. Compare the gold standard labeled data to the combined ontology

#### BioPortal Ethnicity Ontology
This ethnicity ontology was downloaded as an \*\.owl file from [BioPortal](https://bioportal.bioontology.org/ontologies/EO/?p=summary) on ~Jan 6, 2022 and as there was errors in how it was saved by BioPortal, it was not able to be opened by [OwlReady2](https://pypi.org/project/Owlready2/) so I copied the raw text from the \*\.owl file to a \*\.txt file and processed it here.

In [1]:
import re
with open("ethnicity ontology in .txt", "r") as f:
    raw_ontology = f.read()
f.close()

In [2]:
#replace this error
raw_ontology = re.sub("Born_in_Democratic_People&apos;","",raw_ontology)
#all of the ethnicities are between these two character sets described by this regular expression
search_pattern = re.compile(r'"#.+"|#.+<')
raw_ontology_list = search_pattern.findall(raw_ontology)


In [3]:
len(raw_ontology_list)

5324

In [4]:
#This is to remove the identifying character sets
replace_pattern = re.compile(r'"#|<|#|"')

#this time we only want lowercase ethnicity/nationality data and strings less than 50 characters
raw_ontology_list = [replace_pattern.sub("",ont).lower() for ont in raw_ontology_list if len(ont)<50]

In [5]:
raw_ontology_list[:50]

['africa',
 'main_language_spoken',
 'asia',
 'main_language_spoken',
 'australaisa',
 'main_language_spoken',
 'europe',
 'main_language_spoken',
 'first_language_not_english',
 'main_language_spoken',
 'language_spoken_nos',
 'main_language_spoken',
 'main_language_spoken',
 'main_language_spoken',
 'language',
 'north_america',
 'main_language_spoken',
 'south_america',
 'main_language_spoken',
 'main_language_spoken',
 'born_in_france',
 'born_in_france',
 'european_country_of_birth',
 'hassnomedcode',
 'born_in_france',
 'hasctv3code',
 'born_in_france',
 'reads_german',
 'reads_german',
 'european_languages',
 'hasread2code',
 'reads_german',
 'born_in_hungary',
 'born_in_hungary',
 'european_country_of_birth',
 'hassnomedcode',
 'born_in_hungary',
 'hasctv3code',
 'born_in_hungary',
 'born_in_brunei',
 'born_in_brunei',
 'asian_country_of_birth',
 'hassnomedcode',
 'born_in_brunei',
 'hasctv3code',
 'born_in_brunei',
 'kinyarwanda',
 'kinyarwanda',
 'africa',
 'hasread2code']

As you can see there are a lot of things that we need to remove/replace, so will process them into a dictionary with counts because many errata are repeated often so they will likely show up in a frequency ordered list.

In [6]:
len(raw_ontology_list)

5322

In [7]:
raw_ontology_dict = {}
for ont in raw_ontology_list:
    if r"_"in ont:
        ont = re.sub(r'_',r' ',ont)
    if ont in raw_ontology_dict.keys():
        raw_ontology_dict[ont]+=1
    else:
        raw_ontology_dict[ont]=0

In [8]:
from collections import OrderedDict
def top_counts(dictionary,num_hits=100,head = True):
    return OrderedDict(sorted(dictionary.items(), key=lambda x: x[1], reverse=head)[:num_hits])

In [9]:
top_entries = top_counts(raw_ontology_dict)

In [10]:
top_entries

OrderedDict([('hassnomedcode', 634),
             ('hasctv3code', 627),
             ('hasread2code', 294),
             ('asia', 68),
             ('asian country of birth', 59),
             ('african country of birth', 53),
             ('european country of birth', 52),
             ('europe', 51),
             ('asian nationality', 51),
             ('european nationality', 43),
             ('the americas', 42),
             ('african nationality', 42),
             ('australasia', 34),
             ('africa', 31),
             ('american country of birth', 30),
             ('asian languages', 20),
             ('european languages', 16),
             ('atlantic country of birth', 15),
             ('pacific country of birth', 13),
             ('country of birth', 11),
             ('main language spoken', 10),
             ('nationality', 10),
             ('asian/asian asian british', 10),
             ('australaisa', 9),
             ('other black black asian origin', 9),
  

In [11]:
#set will do automatic duplicate filtering
final_ontology = set()
for ont in raw_ontology_dict.keys():
    add = True
    if ont in {"hassnomedcode","hasctv3code","hasread2code","country of birth","nationality","ethnic group","or","no"}:
        add=False
    for subword in ["language","read"]:
        if subword in ont:
            add=False
            break
    if add:
        for remove_subword in ["asian/asian","mixed/mixed",r"^or "]:
            if remove_subword in ont:
                ont = re.sub(remove_subword,"",ont).strip()
                final_ontology.add(ont)
            else:
                final_ontology.add(ont)
        for subword in ["ethnicity","nationality","origin","born in","the","other","country of birth"]:
            if subword in ont:
                extra_ont = re.sub(subword,"",ont).strip()
                final_ontology.add(extra_ont)

In [12]:
final_ontology

{'',
 'abkhazian',
 'abyssinians',
 'admiralty islanders',
 'afar',
 'afghanistan',
 'africa',
 'african',
 'african american',
 'african country',
 'african country of birth',
 'african nationality',
 'afrikaans',
 'ainu',
 'akan',
 'alacaluf',
 'albania',
 'albanian',
 'albanian nationality',
 'aleuts',
 'algeria',
 'american',
 'american country of birth',
 'american indian',
 'americas',
 'amharic',
 'andamanese',
 'andorra',
 'angola',
 'antigua and barbuda',
 'apache',
 'arab',
 'arabic',
 'aragonese',
 'argentina',
 'armenia',
 'armenian',
 'armenian nationality',
 'asia',
 'asian',
 'asian british',
 'asian country of birth',
 'asian nationality',
 'asian origin',
 'assamese',
 'atacamenos',
 'athabascans',
 'atlantic',
 'atlantic country of birth',
 'australaisa',
 'australasia',
 'australasian',
 'australasian country of birth',
 'australia',
 'australian abe',
 'australian aborigine',
 'austria',
 'austrian',
 'aymara',
 'aymara nationality',
 'azerbaijan',
 'azerbaijani',
 

In [13]:
len(final_ontology)

937

In [14]:
#Save
import json
with open('BioPortal ethnicities.json', 'w') as f:
    json.dump({'ethnicities':list(final_ontology)}, f)
f.close()

#### SNOMED Ethnicity Ontology
This ethnicity ontology was downloaded as an \*\.csv file from [BioPortal](https://bioportal.bioontology.org/ontologies/SNOMED-Ethnic-Group/?p=summary) on March 7, 2022.

In [15]:
import pandas as pd
df = pd.read_csv('SNOMED-Ethnic-Group.csv')
df

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,CONCEPTSTATUS,CTV3ID,...,SNOMEDID,SY,SYNONYM FN,SYNONYM IS,SYNONYM MTH_FN,SYNONYM MTH_PT,SYNONYM OF,SYNONYM SY,TUI,UMLS_CUI
0,http://purl.bioontology.org/ontology/SNOMED-Et...,North African Arab (NMO),,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XM1SB,...,F-0085C,,North African Arab (NMO) (ethnic group),,,,,,T098,C0554961
1,http://purl.bioontology.org/ontology/SNOMED-Et...,Solomon Islanders,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XUEyV,...,S-65230,,Solomon Islanders (ethnic group),,,,,,T098,C0337932
2,http://purl.bioontology.org/ontology/SNOMED-Et...,Naiars,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XULbS,...,S-66060,,Naiars (ethnic group),,,,,,T098,C0337943
3,http://purl.bioontology.org/ontology/SNOMED-Et...,Czechs,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XUB0u,...,S-61070,,Czechs (ethnic group),,,,,,T098,C0337799
4,http://purl.bioontology.org/ontology/SNOMED-Et...,Egyptians,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XU8qN,...,S-61100,,Egyptians (ethnic group),,,,,,T098,C0337801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,http://purl.bioontology.org/ontology/SNOMED-Et...,Other Asian (NMO),,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,9SA8.,...,F-00198,Other Asian (NMO) (Id: 154190006),Other Asian (NMO) (ethnic group),,,,,,T098,C0422793
257,http://purl.bioontology.org/ontology/SNOMED-Et...,Admiralty Islanders,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XUARe,...,S-65010,,Admiralty Islanders (ethnic group),,,,,,T098,C0337913
258,http://purl.bioontology.org/ontology/SNOMED-Et...,Senoy,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XUFOW,...,S-64170,,Senoy (ethnic group),,,,,,T098,C0337906
259,http://purl.bioontology.org/ontology/SNOMED-Et...,Athabascans,,,False,,,http://purl.bioontology.org/ontology/SNOMED-Et...,0,XUCuT,...,S-63050,,Athabascans (ethnic group),,,,,,T098,C0337857


In [16]:
df_ethnicities = set(df['Preferred Label'])
df_ethnicities

{'Abyssinians (Amharas)',
 'Admiralty Islanders',
 'African American',
 'Afro-Caribbean',
 'Afro-Caucasian',
 'Ainu',
 'Alacaluf',
 'Aleuts',
 'Amerind',
 'Andamanese',
 'Apache',
 'Arabs',
 'Armenians',
 'Asian - ethnic group',
 'Atacamenos',
 'Athabascans',
 'Australian Aborigines',
 'Austrians',
 'Aymara',
 'Aztec',
 'Badagas',
 'Bangladeshi',
 'Bantu',
 'Barundi',
 'Basques',
 'Batutsi',
 'Belgians',
 'Bhutanese',
 'Black - ethnic group',
 'Black - other African country',
 'Black - other Asian',
 'Black - other, mixed',
 'Black African',
 'Black African and White',
 'Black Arab',
 'Black British',
 'Black Caribbean',
 'Black Caribbean and White',
 'Black Caribbean/W.I./Guyana',
 'Black East African Asian',
 'Black East African Asian/Indo-Caribbean',
 'Black Guyana',
 'Black Indian sub-continent',
 'Black Indo-Caribbean',
 'Black Iranian',
 'Black Jews',
 'Black N African/Arab/Iranian',
 'Black North African',
 'Black West Indian',
 'Black, other, non-mixed origin',
 'Blackfeet',
 '

In [17]:
snomed_ethnicities = set()
for ethn in df_ethnicities:
    if "(NMO)" in ethn:
        extra_ethn = re.sub("(NMO)","",ethn).strip()
        snomed_ethnicities.add(extra_ethn)
    
    if ' - ethnic group' in ethn:
        extra_ethn = re.sub(' - ethnic group',"",ethn).strip()
        snomed_ethnicities.add(extra_ethn)
        
    if 'N ' in ethn:
        extra_ethn = re.sub('N ',"North ",ethn).strip()
        snomed_ethnicities.add(extra_ethn)
        
    if '-' in ethn:
        extra_ethn = re.sub('-'," ",ethn).strip()
        snomed_ethnicities.add(extra_ethn)

snomed_ethnicities = snomed_ethnicities.union(df_ethnicities)
snomed_ethnicities = {ethn.lower() for ethn in snomed_ethnicities}
len(snomed_ethnicities)

309

In [18]:
#Save
with open('SNOMED ethnicities.json', 'w') as f:
    json.dump({'ethnicities':list(snomed_ethnicities)}, f)
f.close()

#### Combine the Ontologies

In [19]:
#Re-open ethnicities
import json
with open('BioPortal ethnicities.json', 'r') as f:
    data = json.load(f)
    bioportal_ethnicities = {ethnicity for ethnicity in data['ethnicities']}
f.close()

In [20]:
#Re-open ethnicities
import json
with open('SNOMED ethnicities.json', 'r') as f:
    data = json.load(f)
    snomed_ethnicities = {ethnicity for ethnicity in data['ethnicities']}
f.close()

In [21]:
ethnicity_ontologies = bioportal_ethnicities.union(snomed_ethnicities)

In [22]:
import csv
tokens, labels= [],[]
with open("epi_gold_set.tsv", 'r') as f:
    reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    tokens = [row[0] for row in reader if len(row)>1]
    f.seek(0)
    labels = [row[1] for row in reader if len(row)>1]
f.close()

In [23]:
len(tokens),len(labels)

(159141, 159141)

In [24]:
ethnicity_phrases = []
for i in range(len(tokens)):
    if labels[i]=='B-ETHN':
        j=1
        while True:
            if labels[i+j]!='I-ETHN':
                break
            else:
                j+=1 
        if j>1:
            phrase = ' '.join(tokens[i:i+j])
        else:
            phrase = tokens[i]
        
        phrase = re.sub(r' - ',r'-',phrase)
        ethnicity_phrases.append(phrase.lower())

In [25]:
ethnicity_phrases

['african american',
 'afro-caribbean',
 'african american',
 'afro-caribbean',
 'african americans',
 'afro-caribbean',
 'iranian',
 'chinese',
 'chinese',
 'chinese',
 'korean',
 'japanese',
 'japanese',
 'japanese',
 'malaysian orang asli',
 'hong kong chinese',
 'arab',
 'arab',
 'swedish',
 'european',
 'swedish',
 'black',
 'mexican',
 'iranian',
 'european',
 'moroccan',
 'moroccan',
 'american',
 'european',
 'lebanese',
 'danish',
 'german',
 'german',
 'german',
 'caucasian',
 'quebec',
 'japanese',
 'japanese',
 'north american',
 'north american',
 'turkish',
 'arab',
 'qataris',
 'arab',
 'arab',
 'qataris',
 'arab',
 'finnish',
 'finnish',
 'qatari',
 'qatari',
 'qatari',
 'qatari',
 'portuguese',
 'asian',
 'iranian',
 'brazilian',
 'brazilian',
 'european',
 'asian',
 'chinese',
 'indian',
 'indian',
 'indian',
 'indian',
 'indian',
 'indian',
 'israeli',
 'norwegian',
 'black',
 'white',
 'black',
 'white',
 'black',
 'black',
 'brazilian',
 'xikrin tribe',
 'kayapo',


In [26]:
yes = 0
for labeled_ethnicity in ethnicity_phrases:
    if labeled_ethnicity in ethnicity_ontologies:
        yes+=1
    else:
        print(labeled_ethnicity)
print("Total coverage: ",yes/len(ethnicity_phrases))

african americans
malaysian orang asli
hong kong chinese
mexican
lebanese
quebec
qataris
qataris
qatari
qatari
qatari
qatari
brazilian
brazilian
brazilian
xikrin tribe
kayapo
kayapo
brazilian
brazilian
indians
libyan
caucasians
asians
africans
turkish population
french-canadian population
sicilian
sicilian
burundian
non-hispanic whites
africans
europeans
chilean
greenlandic
greenland
canadian manitoba mennonite
manitoba mennonite
nepalese
nepalese
brazilian
chilean
mestizos
australian
iñupiaq
yupik
roma
roma
roma
roma
roma
roma
roma
roma
roma
Total coverage:  0.7429906542056075
