In [1]:
import re
import os
import glob

os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting', 'ner_assets')))

first step involves using an algorithm to tag the sequences based on curated lists
sequence tags: B-IL for illness, B-SY for symptom, B-BP for body part
symptoms and body parts overlap: body parts are found within symptoms

therefore, need to do each of three sequences separately on same document
step 1: load three curated list files as lists (done)
step 2: for each document in raw_processing, do tagging three times, 
     once for illness, then symptom, then body part
     
if overlapping duplicates in each type, need to match the longest one--it's best to sort candidate
    lists by how many spaces they contain (done)

the linking process will involve two steps:
step 1: set up linking database using MySQL through sqlalchemy
step 2: do linking of tagged elements based on the following rule:
    if illness, then link symptoms, where these do not overlap; domain is entire file
    if symptom, then link body parts, where these do overlap; domain is symptom itself

Semantic types in the UMLS (metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt):
bpoc "body part, organ, or organ component"
bact "bacterium"
dsyn "disease or syndrome"
orgm "organism"
sosy "sign or symptom"
tisu "tissue"
virs "virus"

before I begin to actually tag anything, I have separated out dsyn, virs, and bact.

In [2]:
def load_names(filename):
    '''Loads named entity candidates from curated lists.
    param filename : name of file containing the curated list
    returns names : list of strings representing named entity candidates
    Note: This returns the names as untokenized strings, as untokenized strings are
        more easily handled in Python when matching than tokenized sentences as lists.
    '''
    with open(filename, 'r') as f:
        names = [line.strip() for line in f.readlines()]
    names = sorted(names, key=lambda x: x.count(' '), reverse = True)
    return names

# load illness names
illness_names = load_names('disease_names.txt')

# load symptom names
symptom_names = load_names('symptom_names.txt')

# load body part names
bodypart_names = load_names('body_part_names.txt')

In [13]:
print(illness_names)

['TUS MOB GROUP B STREPTOCOCCAL , KIS MUS RAU LWM QHOV UAS TSIS MUAJ MOB ( Tus mob GBS , Group B Strep , Tus mob Group B streptococcal ua mob rau cov mos liab yug los tshiab )', 'Cov Kab Mob Arboviral ( arthropod - borne encephalitis , California ( La Crosse ) encephalitis , St . Louis encephalitis , Eastern equine encephalitis , Western equine encephalitis )', 'Tus Mob - Viral Hemorrhagic Fever ( Ebola Virus Hemorrhagic Fever , Marburg Virus Hemorrhagic Fever )', 'Mob Tes , Taw thiab Qhov Ncauj ( Hand , Foot and Mouth Disease ) HFMD', 'Ua Npaws Tshaj Cum - Dengue Fever ( breakbone fever , dengue hemorrhagic fever )', 'Kab Mob Npluag Paj Hlwb - Viral Meningitis ( aseptic meningitis , nonbacterial meningitis )', 'Mob Plab Zawv / Thoj Plab Cholera ( Vibrio cholerae O1 or 0139 )', 'Tus Mob - Rocky Mountain Spotted Fever ( tick - borne typhus fever )', 'Mob Plab Ntswj / Plab Zawv Liab Campylobacteriosis ( Campylobacter sp . )', 'Qaug Zog / Si Nkees Ntev Ntev Chronic Fatigue Syndrome ( CFS 

place tag at each space
use re to find matches, and use start and end locations of matches to determine 
     where to insert /B-IL or /I-IL, and insert them
if illness, there should be only one complete match, and then several partial matches
remember that matches may not align with syllable boundaries
once entire list of candidates has been checked, insert /O after all others not already with tags
then tokenize

In [3]:
# this is producing exactly the right results
# I just need to modify this to apply O labels for non-entities
# then have it save each output as a file in a specific folder
def return_ne_tags(input_sequences, candidate_names, tag_str):
    '''
    Returns input sequences with named entity tags inserted before spaces.
    param input_sequences : the input sequences to tag, as a list of strings
    param candidate_names : the candidate names to consider when tagging
    param tag_str : string of tag representing candidate type
    returns tagged_sequences : input sequences with tags inserted
    '''
    tagged_candidates = {}
    processed_sequences = []
    for item in input_sequences:
        sequence = item
        for name in candidate_names:
            name_pattern = re.compile('(?:(?<=^)|(?<= ))' + re.escape(name) + '(?:(?=$)|(?= ))', re.I)
            if name_pattern.search(item):
                if name not in tagged_candidates.keys():
                    name_list = name.split(' ')
                    for i, word in enumerate(name_list):
                        if i == 0:
                            prefix = '▹B-'
                        elif i == len(name_list) - 1:
                            prefix = '▹E-'
                        else:
                            prefix = '▹I-'
                        name_list[i] = word + prefix + tag_str
                    tagged_name = ' '.join(name_list)
                    tagged_candidates[name] = tagged_name
                sequence = name_pattern.sub(tagged_candidates[name], sequence)
        processed_sequences.append(sequence)
    return processed_sequences

In [4]:
def tag_default(input_sequences):
    output_sequences = []
    for line in input_sequences:
        tokenized_line = line.split(' ')
        processed_line = []
        for item in tokenized_line:
            if '▹' not in item:
                processed_line.append(item + '▹O')
            else:
                processed_line.append(item)
        output_sequences.append(' '.join(processed_line))
    return output_sequences

In [17]:
#print(data)
#print(illness_names)
# this process has a problem: this isn't getting them all--modify disease_names
illness_tagged = tag_default(return_ne_tags(data, illness_names, 'DSYN'))
print(illness_tagged)

['WISCONSIN▹O DIVISION▹O OF▹O PUBLIC▹O HEALTH▹O Department▹O of▹O Health▹O Services▹O Vibriosis▹B-DSYN (▹I-DSYN non▹I-DSYN -▹I-DSYN cholera▹I-DSYN )▹E-DSYN Disease▹O Fact▹O Sheet▹O Series▹O', 'Vibriosis▹B-DSYN yog▹O ib▹O hom▹O mob▹O los▹O ntawm▹O tus▹O kab▹O mob▹O Vibrio▹O ,▹O feem▹O ntau▹O yog▹O tus▹O kab▹O mob▹O Vibrio▹O parahemolyticus▹O los▹O yog▹O tus▹O kab▹O mob▹O Vibrio▹O vulnificus▹O .▹O Tus▹O kab▹O mob▹O Vibrio▹O ua▹O kom▹O raws▹O plab▹O ,▹O ua▹O mob▹O rau▹O daim▹O tawv▹O nqaij▹O ,▹O thiab▹O ua▹O mob▹O rau▹O cov▹O ntshav▹O .▹O Hom▹O kab▹O mob▹O Vibrio▹O (▹O non▹O -▹O cholera▹O )▹O yog▹O dab▹O tsi▹O ?▹O Hom▹O kab▹O mob▹O Vibrio▹O (▹O non▹O -▹O cholera▹O )▹O yog▹O cov▹O mob▹O uas▹O los▹O ntawm▹O tib▹O hom▹O kab▹O mob▹O Vibrio▹O uas▹O ua▹O kom▹O muaj▹O cov▹O mob▹O cholera▹O ,▹O tab▹O sis▹O ho▹O tsis▹O ua▹O kom▹O mob▹O cholera▹O .▹O Cov▹O kab▹O mob▹O ua▹O kom▹O mob▹O cholera▹O tsuas▹O muaj▹O V▹O .▹O cholerae▹O O1▹O thiab▹O V▹O .▹O cholerae▹O O139▹O xwb▹O .▹O Kab▹O mob▹B-DSYN Vibri

In [21]:
symptom_tagged = return_ne_tags(data, symptom_names, 'SY')

In [22]:
print(illness_tagged)
print(symptom_tagged)

['TUS▹B-IL MOB▹I-IL ENTEROTOXIGENIC▹I-IL E▹I-IL .▹I-IL COLI▹I-IL (▹I-IL ETEC▹I-IL )▹E-IL', 'Escherichia coli ( E . coli ) yog cov kab mob uas pom muaj rau ntawm tej chaw nyob ib ncig yus , hauv tej khoom noj , thiab hauv cov hnyuv ntawm tej tsiaj txhu thiab tib neeg . Hom mob E . coli feem ntau yeej tsis ua teeb meem dab tsi thiab kuj yog ib feem tseem ceeb ntawm txoj kev zom zaub mov , tab sis muaj ib txhia kuj ua rau koj mob . Enterotoxigenic E . coli ( ETEC ) yog ib hom ntawm cov kab mob E . coli uas ua tau rau koj raws plab . Tsis hais leej twg los yeej kis tau tus mob ETEC . Nws yog ib tus mob raws plab uas kheev pom muaj nyob rau cov teb chaws tseem tsis tau vam meej txaus , tshwj xeeb yog ua mob rau cov me nyuam thiab nyob neeg uas mus ncig rau cov teb chaws ntawd . Tab sis , tab txawm cov uas yeej tsis tawm lub Teb Chaws Meskas mus qhov twg li los yeej kis tau tus mob ETEC no thiab .', 'Dab tsi ua rau tau tus mob no ? Tus mob ETEC kis tau los ntawm tej khoom noj thiab dej uas p

In [81]:
# this might be better served with a machine learning approach ultimately to avoid homonyms
# for now, this is OK because it'll be processed within symptoms
# however, body parts are associated with symptoms, so I don't really need to build this
#     part of the ontology using illness text files
print(return_ne_tags(data, bodypart_names, 'BP'))

['TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 'Tus Mob Acute flaccid myelitis ( AFM ) los yog kuj muab hu ua acute flaccid paralysis with anterior myelitis los yog polio like syndrome ( cov tsos mob zoo li tus mob tuag npab tuag ceg▹B-BP ) . Nws yog ib tus mob uas tsis tshua pom muaj , feem tau yog pom muaj rau cov me nyuam , uas ua teeb meem rau cov leeg▹B-BP xa xov hauv lub cev , uas yog muab hais kom tseeb ces yog tus txha▹B-BP nqaj qaum . Tus mob AFM yog tau los ntawm ib cov kab mob ntawm cov yoov chaj tsum los yog muaj nyob hauv tej chaw uas nyob ib ncig yus ( environment ) . Dab tsi ua rau muaj tus mob ? Tus mob no yog tau los ntawm cov kab mob xws li cov enteroviruses ( tuag npab tuag ceg▹B-BP thiab tsis yog tuag npab tuag ceg▹B-BP ) thiab cov kab mob aviviruses xws li tus kab mob West Nile Virus , tus kab mob Japanese Encephalitis virus , los yog tus kab mob St . Louis encephalitis virus . Lwm cov kab mob uas tej zaum yuav ua rau muaj tus mob AFM yog cov herpesviruses ( piv txwv l

In [33]:
illness_tagged_tokenized = [w for line in illness_tagged for w in line.split(' ') if w[-3:] == '-IL']
symptom_tagged_tokenized = [w for line in symptom_tagged for w in line.split(' ') if w[-3:] == '-SY']

def get_separated(tagged_list):
    separated = []
    current = []
    for i, item in enumerate(tagged_list):
        if item[-4] == 'B':
            if len(current) == 1:
                if tagged_list[i - 1][-4] == 'B':
                    if current not in separated:
                        separated.append(current)
                    current = [item[:-5]]
            else:
                current.append(item[:-5])
        elif item[-4] != 'E':
            current.append(item[:-5])
        else:
            current.append(item[:-5])
            if current not in separated:
                separated.append(current)
            current = []
    return separated
        
illness_separated = get_separated(illness_tagged_tokenized)
symptom_separated = get_separated(symptom_tagged_tokenized)


In [34]:
print(illness_separated)

[['TUS', 'MOB', 'ENTEROTOXIGENIC', 'E', '.', 'COLI', '(', 'ETEC', ')']]


In [35]:
print(symptom_separated)

[['raws', 'plab'], ['mob', 'plab', 'ntswj'], ['ntuav'], ['muaj', 'dej', 'tsis', 'txaus', 'hauv', 'lub', 'cev'], ['tsis', 'muaj', 'zog'], ['ua', 'npaws'], ['kub', 'ib', 'ce']]


In [36]:
import sqlite3
os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting', 'ner_assets')))

conn = sqlite3.Connection('ontology.db')

crsr = conn.cursor()

#sql_query = """CREATE TABLE illness(
#illness_id INTEGER PRIMARY KEY AUTOINCREMENT,
#name TEXT);"""

#crsr.execute(sql_query)

In [12]:
sql_query = """CREATE TABLE symptom(
symptom_id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT);"""

crsr.execute(sql_query)

OperationalError: table symptom already exists

In [14]:
sql_query = """CREATE TABLE illness_symptom (
illness_id INTEGER NOT NULL,
symptom_id INTEGER NOT NULL,
PRIMARY KEY(illness_id, symptom_id));"""

crsr.execute(sql_query)

OperationalError: table illness_symptom already exists

In [38]:
for item in illness_separated:
    illness_name = ' '.join(item)
    crsr.execute("INSERT INTO illness (name) VALUES ('{il_name}');".format(il_name=illness_name))

In [16]:
for item in symptom_separated:
    symptom_name = ' '.join(item)
    crsr.execute("INSERT INTO symptom (name) VALUES ('{sy_name}');".format(sy_name=symptom_name))

In [17]:
for item in symptom_separated:
    symptom_name = ' '.join(item)
    illness_id = crsr.execute("SELECT illness_id FROM illness WHERE name='{il_name}';".format(il_name=illness_name)).fetchone()[0]
    symptom_id = crsr.execute("SELECT symptom_id FROM symptom WHERE name='{sy_name}'".format(sy_name=symptom_name)).fetchone()[0]
    crsr.execute("INSERT INTO illness_symptom VALUES ({il}, {sy});".format(il=illness_id, sy=symptom_id))

In [18]:
sql_query = """SELECT illness.illness_id, illness.name, symptom.symptom_id, symptom.name FROM illness_symptom
INNER JOIN illness ON illness.illness_id=illness_symptom.illness_id
INNER JOIN symptom ON symptom.symptom_id=illness_symptom.symptom_id;"""
print(crsr.execute(sql_query).fetchall())

[(1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 1, 'tuag npab tuag ceg'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 2, 'tsis muaj zog'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 3, 'raws sab npab los yog sab ceg tsis muaj zog cia li tshwm sim sai heev'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 4, 'tej nqaij tsis nruj li qub thiab xoob tuaj'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 5, 'lub ntsej muag nrwg los yog tsis muaj zog'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 6, 'tsis qab los'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 7, 'daim tawv muag nrwg'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 8, 'nqos tsis tau khoom noj los yog dej haus'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 9, 'txav tsis tau lub qhov muag mus los'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 10, 'hais lus txha'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 11, 'tsis muaj lub muaj log'), (1, 'TUS MOB ACUTE FLACCID MYELITIS ( AFM )', 12, 'ua tsis taus pa')]


In [37]:
sql_query = """BEGIN TRANSACTION;"""
crsr.execute(sql_query)

<sqlite3.Cursor at 0x7f4f5c1182d0>

In [42]:
for item in symptom_separated:
    symptom_name = ' '.join(item)
    sql_query = """INSERT INTO symptom (name) SELECT '{sy_name}' 
    WHERE NOT EXISTS(SELECT * FROM symptom WHERE name='{sy_name}');""".format(sy_name=symptom_name)
    crsr.execute(sql_query)

In [43]:
print(crsr.execute("SELECT * FROM symptom;").fetchall())

[(1, 'tuag npab tuag ceg'), (2, 'tsis muaj zog'), (3, 'raws sab npab los yog sab ceg tsis muaj zog cia li tshwm sim sai heev'), (4, 'tej nqaij tsis nruj li qub thiab xoob tuaj'), (5, 'lub ntsej muag nrwg los yog tsis muaj zog'), (6, 'tsis qab los'), (7, 'daim tawv muag nrwg'), (8, 'nqos tsis tau khoom noj los yog dej haus'), (9, 'txav tsis tau lub qhov muag mus los'), (10, 'hais lus txha'), (11, 'tsis muaj lub muaj log'), (12, 'ua tsis taus pa'), (13, 'raws plab'), (14, 'mob plab ntswj'), (15, 'ntuav'), (16, 'muaj dej tsis txaus hauv lub cev'), (17, 'ua npaws'), (18, 'kub ib ce')]


In [44]:
crsr.execute("COMMIT;")

<sqlite3.Cursor at 0x7f4f5c1182d0>

In [45]:
conn.commit()
conn.close()

In [8]:
# this is deprecated
def get_locations(sequence, to_match):
    end = 0
    locations = []
    while (end != len(sequence)):
        out = re.search(to_match, sequence[end:])
        if type(out) == re.Match:
            locations.append(end + out.span()[0])
            end += out.span()[1]
        else:
            end = len(sequence)
    return locations

print(get_locations(data[0], 'U'))
# the truth is, I only need one match to know whether I need to do sub, then I can do sub


[1, 10]


In [None]:
return_ne_tags(data, illness_names, 'IL')

In [30]:
basedir = os.path.expanduser(os.path.join('~', 'hmong_medical', 'raw_processing'))
os.chdir(basedir)

In [31]:
filenames = glob.glob('p*h.txt')
datasets = {}

In [32]:
for file in filenames:
    with open(os.path.join(basedir, file), 'r') as test_f:
        datasets[file] = [line.strip() for line in test_f.readlines()]

In [17]:
datasets['p42024h.txt']

['WISCONSIN DIVISION OF PUBLIC HEALTH Department of Health Services Mob Cab - Amebiasis ( amebic dysentary ) Disease Fact Sheet Series',
 'Mob Cab - Amebiasis zoo li cas ? Mob Cab Amebiasis yog tus kab mob cab me me hu ua Entamoeba histolytica ua mob cab rau hauv cov nhyuv . Nyob hauv lub xeev Wisconsin muaj kwv yees li 10 - 15 tus neeg raug tus kab mob cab no txhua xyoo . Leej twg thiaj raug hom cab amebiasis no ? Cov cab no yeej tsis xyeej leej twg , tab sis pom raug ntau rau cov neeg mus tsham teb chaws , los yog cov neeg tuaj ntawm cov teb chaws huab cua sov tuaj . Kuj pom muaj ntau rau cov teb chaws txom nyem nyob tsis huv . Tsis tas li , kuj pom muaj rau cov neeg ua nyob hauv cov tsev koom uas nyob tsis huv . Cov txiv neej ua deev txiv neej kuj yuav kis ua mob taus , tab sis , qee zaum mas kuj tsis pom lawv muaj tsos mob tshwm sim xwb . Cov cab Amebiasis ri tau li cas ? Thaum noj tau tej khoom noj tsis huv muaj cov qe cab mas yuav kis tau tus kab mob no . Thaum nphav raug tus nee

In [5]:
# folder must be ner_assets
def tag_files(datasets):
    disease_tag = 'DSYN'
    anatomy_tag = 'BPOC'
    symptom_tag = 'SOSY'
    disease_folder = 'disease_tagged'
    anatomy_folder = 'anatomy_tagged'
    symptom_folder = 'symptom_tagged'
    
    for item in datasets.keys():
        
        if type(datasets[item]) == str:
            datasets[item] = [datasets[item]]
            
        illness_tagged = tag_default(return_ne_tags(datasets[item], illness_names, disease_tag))
        if type(illness_tagged) == list:
            illness_tagged = '\n'.join(illness_tagged)
        if item not in os.listdir(disease_folder):
            with open(os.path.join('.', disease_folder, item), 'w') as f:
                f.write(illness_tagged)
            
        anatomy_tagged = tag_default(return_ne_tags(datasets[item], bodypart_names, anatomy_tag))
        if type(anatomy_tagged) == list:
            anatomy_tagged = '\n'.join(anatomy_tagged)
        if item not in os.listdir(anatomy_folder):
            with open(os.path.join('.', anatomy_folder, item), 'w') as f:
                f.write(anatomy_tagged)
            
        symptom_tagged = tag_default(return_ne_tags(datasets[item], symptom_names, symptom_tag))
        if type(symptom_tagged) == list:
            symptom_tagged = '\n'.join(symptom_tagged)
        if item not in os.listdir(symptom_folder):
            with open(os.path.join('.', symptom_folder, item), 'w') as f:
                f.write(symptom_tagged)

In [89]:
tag_files(raw)

In [67]:
os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting', 'ner_assets')))

In [68]:
import sqlalchemy

['WISCONSIN DIVISION OF PUBLIC HEALTH Department of Health Services Mob Cab - Amebiasis ( amebic dysentary ) Disease Fact Sheet Series',
 'Mob Cab - Amebiasis zoo li cas ? Mob Cab Amebiasis yog tus kab mob cab me me hu ua Entamoeba histolytica ua mob cab rau hauv cov nhyuv . Nyob hauv lub xeev Wisconsin muaj kwv yees li 10 - 15 tus neeg raug tus kab mob cab no txhua xyoo . Leej twg thiaj raug hom cab amebiasis no ? Cov cab no yeej tsis xyeej leej twg , tab sis pom raug ntau rau cov neeg mus tsham teb chaws , los yog cov neeg tuaj ntawm cov teb chaws huab cua sov tuaj . Kuj pom muaj ntau rau cov teb chaws txom nyem nyob tsis huv . Tsis tas li , kuj pom muaj rau cov neeg ua nyob hauv cov tsev koom uas nyob tsis huv . Cov txiv neej ua deev txiv neej kuj yuav kis ua mob taus , tab sis , qee zaum mas kuj tsis pom lawv muaj tsos mob tshwm sim xwb . Cov cab Amebiasis ri tau li cas ? Thaum noj tau tej khoom noj tsis huv muaj cov qe cab mas yuav kis tau tus kab mob no . Thaum nphav raug tus nee

In [83]:
import urllib.request

prefix = "http://github.com/nathanmwhite/hmong-medical-corpus/raw/master/medical_corpus_finalized/"
filenames = [str(n) + '.txt' for n in range(1, 12)]

content = {}

for item in filenames:
    print(item)
    with urllib.request.urlopen(prefix + item) as response:
        content[item] = response.read().decode()
        


1.txt
2.txt
3.txt
4.txt
5.txt
6.txt
7.txt
8.txt
9.txt
10.txt
11.txt


In [77]:
local_file = "source_locs.ind"
with urllib.request.urlopen(prefix + local_file) as response:
    file_ = response.read()
data = file_.decode()
data = [w.rstrip().split(' ') for w in data.split('\n') if len(w) > 0]
datanames = [os.path.basename(w[1])[:-3]+'txt' for w in data]
print(datanames)

['p42090h.txt', 'p00119h.txt', 'p00356h.txt', 'p00486h.txt', 'p00688h.txt', 'p00873h.txt', 'p42053ah.txt', 'activehmo.txt', 'p01298h.txt', 'p01581h.txt', 'p01709h.txt']


In [78]:
content['1.txt']

'Tus/B-CL Mob/B-NN –/O-PU Shigellosis/B-FW Disease/B-FW Fact/B-FW Sheet/B-FW Series/B-FW Tus/B-CL mob/B-NN shigellosis/B-FW zoo/B-VV li/B-PP cas/I-NN ?/O-PU Shigellosis/B-FW yog/B-VV ib/B-QU tug/I-CL mob/B-NN los/B-VV ntawm/B-LC cov/B-CL kab/B-NN mob/I-VV bacteria/B-FW los/B-VV ./O-PU Muaj/B-VV txog/B-PP li/B-PP 300/B-QU rau/B-PP 400/B-QU leej/B-CL neeg/B-NN tau/B-AD raug/B-VV tus/B-CL mob/B-NN no/B-DT txhua/B-QU xyoo/B-CL hauv/B-LC lub/B-CL xeev/B-NN Wisconsin/B-FW ./O-PU Feem/B-CL ntau/B-VV muaj/B-VV tshwm/B-VV sim/I-VV rau/B-PP lub/B-CL caij/B-NN ntuj/I-NN sov/I-VV thiab/B-CC lub/B-CL caij/B-NN nplooj/I-NN ntoo/I-NN zeeg/I-VV ./O-PU Nyob/B-VV nyob/I-VV mam/B-AD pom/B-VV tej/B-QU tus/B-CL neeg/B-NN raug/B-VV ,/O-PU los/B-CC puas/I-CC ,/O-PU ib/B-QU pab/B-CL pawg/I-CL neeg/B-NN coob/B-QU raug/B-VV ua/B-VV ib/B-QU vuag/B-CV ua/B-AD ke/I-AD kuj/B-AD muaj/B-VV ./O-PU Leej/B-CL twg/B-RL thiaj/B-AD li/I-AD yuav/B-AD kis/B-VV tau/B-VV tus/B-CL mob/B-NN shigellosis/B-FW ?/O-PU Leej/B-CL twg/

In [86]:
raw = {}
for item in content.keys():
    raw_data = content[item].replace('\n', '')
    raw_data = raw_data.replace('\r\n', '')
    raw_data = raw_data.replace('\\/', '***')
    raw_data = raw_data.replace('//', '***/')
    raw_data = ' '.join([w.split('/')[0] for w in raw_data.split(' ')])
    raw_data = raw_data.replace('***', '/')
    raw[datanames[int(item.split('.')[0]) - 1]] = [raw_data]

In [87]:
raw['p01581h.txt']

['TUS KAB MOB ZIKA : COV LUS POM ZOO SIV RAU COV NEEG UAS TEJ ZAUM TAU RAUG TUS KAB MOB ZIKA 1 . Nyob twj ywm hauv tsev kom txhob raug yoov tshaj cum tom los yog siv cov tshuaj pleev yoov kom txhob tom ntev li peb lim piam ( weeks ) . * Yog koj nyuam qhuav kis tau tus kab mob Zika tsis ntev los no , tej zaum yuav muaj tus kab mob no nyob hauv koj cov ntshav thiab yuav kis tau mus rau ib tus tshaj cum uas tom koj . Ces tus tshaj cum uas muaj tus mob yuav kis tau tus kab mob rau lwm tus . Qhov tiv thaiv koj kom tshaj cum txhob tom uas yog nyob twj ywm sab hauv tsev los yog siv cov tshuaj pleev yoov kom txhob tom mus ntev tsawg kawg peb lub lim piam tom qab kis tau tus kab mob tab txawm koj yuav tsis hnov tias koj mob . * Yog xav paub ntxiv seb yuav tiv thaiv koj tus kheej li cas kom tshaj cum txhob tom , thov mus saib http:/ . 2 . Zam txhob sib deev los yog siv hnab looj thaum sib deev . * Yog koj nyuam qhuav kis tau tus kab mob Zika tsis ntev los no , koj yuav kis tau tus kab mob mus ra

In [58]:
os.chdir(os.path.expanduser(os.path.join('~', 'python_workspace', 'medical_corpus_scripting', 'ner_assets')))

In [9]:
f = open(os.path.expanduser(os.path.join('~', 'hmong_medical', 'raw_processing', 'p02094h.txt')), 'r')
data = f.read()
raw_data = [data.replace('\n', ' ')]
f.close()
print(raw_data)

['ENTEROPATHOGENIC E . COLI ( EPEC ) Escherichia coli ( E . coli ) yog cov kab mob uas pom muaj nyob ib ncig yus , hauv tej khoom noj , thiab hauv tej tsiaj thiab tib neeg cov hnyuv . Feem ntau ntawm cov kab mob E . coli kuj tsis ua rau yus mob thiab tseem yog ib feem tseem ceeb ntawm txoj kev zom zaub mov , tab sis muaj ib cov kuj ua rau koj mob tau . Enteropathogenic E . coli ( EPEC ) yog ib hom ntawm cov kab mob E . coli uas ua tau rau koj mob raws plab . Dab tsi ua rau tau tus mob no ? EPEC yuav kis rau hauv tej khoom noj los yog dej uas muaj quav nyob hauv . Quav yuav nkag tau rau hauv cov khoom noj los yog dej thaum neeg tsis ntxuav lawv ob txhais tes kom zoo tom qab siv chav dej , ces cia los mus npaj khoom noj los yog dej haus . Nws kuj tseem tshwm sim tau rau tej qoob loo yog hais tias muab cov dej uas muaj quav nyob hauv los ywg rau . EPEC tseem kis tau los ntawm qhov mus kov tau ib tus tsiaj los yog ib tus neeg uas muaj cov kab mob EPEC , los yog kov tau ib yam khoom uas mua

In [11]:
tag_files({'p02094h.txt': raw_data})