In [None]:
from IPython.display import Markdown

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

from pprint import pprint

In [None]:
# Load the raw export, then report its dimensions and preview the first rows.
data = pd.read_csv(filepath_or_buffer='../../data/data_raw/data.csv')
print(data.shape)
data.head(5)

In [None]:
# Strip surrounding whitespace from the species labels, then chart how many
# records each species has.
data['species'] = data['species'].str.strip()
data['species'].value_counts().plot.bar()

### <font color='red'> Adding column record_number for consistency </font>
We will use 'record_number' for labelling.

In [None]:
# Expose the row index as an explicit `record_number` column and move it to the
# front of the frame.
# The column is dropped from the column list before being re-inserted, so
# re-running this cell does not produce a duplicated `record_number` column.
data['record_number'] = data.index
cols = data.columns.drop('record_number').insert(0, 'record_number')
data = data.loc[:, cols]
data.head()

### Filtering Out Non-Canine Records

In [None]:
# Keep only the rows whose species label is exactly 'Canine'.
data_canine = data[data['species'] == 'Canine']
data_canine.shape

### View a Note

In [None]:
def view_note(record_number = None):
    """Render the note of one canine record as Markdown.

    Parameters
    ----------
    record_number : int, optional
        Record to display. When omitted (``None``) a record is drawn at random
        from ``data_canine.record_number``. ``0`` is a valid record number and
        is honoured as such.

    Returns
    -------
    IPython.display.Markdown
        The record's ``comment_text`` wrapped for rich display.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # record 0 as "not provided" and silently show a random note instead.
    if record_number is None:
        i = np.random.choice(data_canine.record_number.values)
    else:
        i = record_number
    print(f'Record Number # {i}')
    return Markdown(f'{data_canine.comment_text[i]}')

#view_note()

-----
-----

In [None]:
# Keywords relevant to each category of note section
search_words = dict(
    diagnoses=['suspect', 'diagnosis', 'diagnose'],
    problems=['comment', 'comments', 'problem', 'problems', 'questions', 'concerns'],
    diagnostic_test=['test', 'diagnostic', 'procedure'],
    physical_exam=['exam'],
    history=['history'],
)

# Boilerplate fragments that are removed from the (lower-cased) note HTML.
# NOTE(review): two entries below contain a run of four extra spaces in the
# middle of the text. They are kept exactly as they were (the original literal
# was continued across two indented source lines); confirm that the raw notes
# really contain that spacing, otherwise these two fragments never match.
removed_strings = [
    '[x if abnormal]',
    'please continue the same medication regimine for her future doctor visits to the hospital.',
    '<b>please contact us through phone 608-263-7600, or email at <b>primarycare@vetmed.wisc.edu</b> for any updates, questions, or concerns.</b>',
    (
        'remember to either: <ul> <li>click the add rx button to insert medications dispensed today or '
        '<li>click the cpy comment button to insert the '
        '    current medication form for on-going medications</li></ul></font><!--  --></td></tr></tbody></table>'
    ),
    (
        '[expand on above diagnoses as needed. do not simply reiterate. may include: summary of client discussion, '
        'differentials, and patient care.'
        '    *include alternative/future options for testing or treatment based on patient response.]'
    ),
    '<b>if you have a local veterinarian who referred you to uw veterinary care, we will send him/her a report for this visit.</b>',
    'please contact us via phone',
]

In [None]:
def find_features(cleaned_text,titles,keywords):
    """Collect the text that follows every heading matching one of ``keywords``.

    ``cleaned_text`` is split on newlines. A line counts as a matching heading
    when it contains a keyword and either the line itself is one of ``titles``
    or the keyword itself is one of ``titles``. For each matching heading, the
    lines after it are collected until the next line that is a title (or the
    end of the text).

    Parameters
    ----------
    cleaned_text : str
        Newline-separated note text.
    titles : list of str
        Heading strings; lines are compared against them exactly.
    keywords : list of str
        Substrings that mark a heading as relevant.

    Returns
    -------
    str
        The collected lines, de-duplicated, in the order they first appear in
        the note, concatenated without a separator. Empty string when nothing
        matches.
    """
    lines = cleaned_text.split("\n")

    # A dict is used as an ordered set: it drops repeated lines but, unlike
    # np.unique, keeps them in document order instead of sorting them
    # alphabetically (which scrambled the extracted section).
    collected = {}

    for i, line in enumerate(lines):
        is_heading = any(
            keyword in line and (line in titles or keyword in titles)
            for keyword in keywords
        )
        if not is_heading:
            continue

        # Everything up to the next title belongs to this section.
        for following in lines[i + 1:]:
            if following in titles:
                break
            collected.setdefault(following, None)

    return "".join(collected)

def process_random(idx = None, verbose = False):
    """Clean one canine note and print the sections extracted from it.

    Reads the module-level ``data_canine``, ``removed_strings`` and
    ``search_words`` and uses ``find_features`` for the extraction.

    Parameters
    ----------
    idx : int, optional
        Record number to process. When ``None`` a record is drawn at random
        from ``data_canine.record_number``.
    verbose : bool, default False
        Also print the raw note, the detected titles and the full cleaned text.

    Returns
    -------
    IPython.display.Markdown or None
        The original note for rich display when ``verbose`` is true, otherwise
        ``None``. If the selected record has no note text, another record is
        drawn at random (even when ``idx`` was given) and its result returned.
    """
    if idx is None:
        i = np.random.choice(data_canine.record_number.values)
    else:
        i = idx

    print(f'record_number # {i}','\n')
    comment_html_o = data_canine.comment_text[i]

    # A missing note is not a string (it is read back as NaN). Resample until a
    # note is found and *return* that result; without the return, execution
    # would continue below and fail on `.lower()`.
    if not isinstance(comment_html_o, str):
        return process_random(verbose = verbose)

    if verbose: print(comment_html_o)

    comment_html = comment_html_o.lower()
    # Put the content of bold tags on its own line so that, after text
    # extraction, each bold title can be matched as a whole line.
    comment_html = comment_html.replace("<b>","<b>\n")
    comment_html = comment_html.replace("</b>","\n</b>")
    comment_html = comment_html.replace("dr.","dr")
    # Drop <div> blocks whose text starts with "[ ] "
    # (presumably unticked checklist entries -- TODO confirm).
    comment_html = re.sub(r'<div>\[\s\]\s.*?</div>','',comment_html)

    # TODO: remove the entire line, not only the matched boilerplate fragment
    for r_string in removed_strings:
        comment_html = comment_html.replace(r_string,'')

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(comment_html, 'html.parser')

    titles = [t.text.strip().lower() for t in soup.find_all('b')]

    # Remove bracketed spans and non-breaking spaces, then turn runs of two or
    # more spaces into line breaks.
    cleaned_text = re.sub(r'\[.*?\]', '', soup.text)
    cleaned_text = re.sub(r'\xa0', '', cleaned_text)
    cleaned_text = re.sub(r'[ ]{2,}', '\n', cleaned_text)
    cleaned_text = cleaned_text.lower()

    if verbose:
        print('titles')
        print(titles,'\n')

    # (printed heading, key into search_words), in display order
    sections = (
        ('History <Extracted>::', 'history'),
        ('Physical Exam <Extracted>::', 'physical_exam'),
        ('diagnoses <Extracted>::', 'diagnoses'),
        ('Problems <Extracted>::', 'problems'),
        ('Diagnostic Test <Extracted>::', 'diagnostic_test'),
    )
    for heading, key in sections:
        print('\n')
        print(heading)
        print(find_features(cleaned_text,titles,search_words[key]))

    if verbose:
        print('\n')
        print("All Text Cleaned ::")
        print(cleaned_text)
        # Repeat the record number and hand back the original note for rendering
        print('\n')
        print(f'record_number # {i}','\n')
        return Markdown(f'{comment_html_o}')

    return None

In [None]:
# Run the extractor on record number 0 without the verbose dump.
process_random(idx=0, verbose=False)

------
## Rest of the code

Single script for data cleaning

------

In [None]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup

# Keywords relevant to each category of note section.
# NOTE(review): this keyword set is smaller than the one defined in the
# exploratory cell earlier in the notebook and replaces it once this cell
# runs -- confirm which set is intended.
search_words = dict(
    diagnoses=['suspect', 'diagnosis', 'diagnose'],
    problems=['comment', 'comments', 'problem'],
    diagnostic_test=['test', 'diagnostic'],
    physical_exam=['exam'],
    history=['history'],
)

# Boilerplate fragments that are removed from the (lower-cased) note HTML.
# NOTE(review): two entries below contain a run of four extra spaces in the
# middle of the text. They are kept exactly as they were (the original literal
# was continued across two indented source lines); confirm that the raw notes
# really contain that spacing, otherwise these two fragments never match.
removed_strings = [
    '[x if abnormal]',
    'please continue the same medication regimine for her future doctor visits to the hospital.',
    '<b>please contact us through phone 608-263-7600, or email at <b>primarycare@vetmed.wisc.edu</b> for any updates, questions, or concerns.</b>',
    (
        'remember to either: <ul> <li>click the add rx button to insert medications dispensed today or '
        '<li>click the cpy comment button to insert the '
        '    current medication form for on-going medications</li></ul></font><!--  --></td></tr></tbody></table>'
    ),
    (
        '[expand on above diagnoses as needed. do not simply reiterate. may include: summary of client discussion, '
        'differentials, and patient care.'
        '    *include alternative/future options for testing or treatment based on patient response.]'
    ),
    '<b>if you have a local veterinarian who referred you to uw veterinary care, we will send him/her a report for this visit.</b>',
    'please contact us via phone',
]


def process_data(train_path, output_path):
    """Extract note sections for every canine record and write them to CSV.

    Parameters
    ----------
    train_path : str
        Path of the raw CSV. It must provide the ``species`` and
        ``comment_text`` columns.
    output_path : str
        Path of the CSV to write. It contains the canine rows only, with
        ``record_number`` (the row index of the raw file) as first column and
        the columns ``Cleaned_Text``, ``History``, ``Physical_Exam``,
        ``Diagnoses``, ``Problems`` and ``Diagnostic_Test`` filled from
        ``process_row``.
    """
    # reading raw data
    data = pd.read_csv(train_path)
    data['species'] = data['species'].str.strip()

    # Expose the row index as the leading `record_number` column. Dropping the
    # label before re-inserting it avoids a duplicated column if the raw file
    # already has one.
    data['record_number'] = data.index
    cols = data.columns.drop('record_number').insert(0, 'record_number')
    data = data.loc[:, cols]

    # .copy(): the new columns are added to an independent frame, not to a
    # slice of `data` (which triggers SettingWithCopyWarning).
    data_canine = data.loc[data['species'] == 'Canine'].copy()

    new_cols = ["Cleaned_Text", "History", "Physical_Exam", "Diagnoses", "Problems", "Diagnostic_Test"]
    # Build the extracted columns in one frame with explicit column names, so
    # an empty canine subset still yields the six (empty) columns.
    extracted = pd.DataFrame(
        [process_row(text) for text in data_canine['comment_text']],
        columns=new_cols,
        index=data_canine.index,
    )
    for col in new_cols:
        data_canine[col] = extracted[col]

    data_canine.to_csv(output_path, index = False)
    
def find_features_row(cleaned_text,titles,keywords):
    """Collect the text that follows every heading matching one of ``keywords``.

    ``cleaned_text`` is split on newlines. A line counts as a matching heading
    when it contains a keyword and either the line itself is one of ``titles``
    or the keyword itself is one of ``titles``. For each matching heading, the
    lines after it are collected until the next line that is a title (or the
    end of the text).

    Parameters
    ----------
    cleaned_text : str
        Newline-separated note text.
    titles : list of str
        Heading strings; lines are compared against them exactly.
    keywords : list of str
        Substrings that mark a heading as relevant.

    Returns
    -------
    str
        The collected lines, de-duplicated, in the order they first appear in
        the note, concatenated without a separator. Empty string when nothing
        matches.
    """
    lines = cleaned_text.split("\n")

    # A dict is used as an ordered set: it drops repeated lines but, unlike
    # np.unique, keeps them in document order instead of sorting them
    # alphabetically (which scrambled the extracted section).
    collected = {}

    for i, line in enumerate(lines):
        is_heading = any(
            keyword in line and (line in titles or keyword in titles)
            for keyword in keywords
        )
        if not is_heading:
            continue

        # Everything up to the next title belongs to this section.
        for following in lines[i + 1:]:
            if following in titles:
                break
            collected.setdefault(following, None)

    return "".join(collected)

def process_row(comment_html_o):
    """Clean one note and extract its sections.

    Reads the module-level ``removed_strings`` and ``search_words`` and uses
    ``find_features_row`` for the extraction.

    Parameters
    ----------
    comment_html_o : str or missing value
        Raw HTML of the note. Anything that is not a string (NaN, ``None``,
        ...) is treated as "no note".

    Returns
    -------
    tuple of str
        ``(cleaned_text, history, physical_exam, diagnoses, problems,
        diagnostic_test)``; six empty strings when there is no note.
    """
    # A type check instead of `is np.nan`: the identity test only recognises
    # the np.nan object itself, so other NaN floats or None slipped through
    # and crashed on `.lower()`.
    if not isinstance(comment_html_o, str):
        return ('','','','','','')

    comment_html = comment_html_o.lower()
    # Put the content of bold tags on its own line so that, after text
    # extraction, each bold title can be matched as a whole line.
    comment_html = comment_html.replace("<b>","<b>\n")
    comment_html = comment_html.replace("</b>","\n</b>")
    comment_html = comment_html.replace("dr.","dr")
    # Drop <div> blocks whose text starts with "[ ] "
    # (presumably unticked checklist entries -- TODO confirm).
    comment_html = re.sub(r'<div>\[\s\]\s.*?</div>','',comment_html)

    # TODO: remove the entire line, not only the matched boilerplate fragment
    for r_string in removed_strings:
        comment_html = comment_html.replace(r_string,'')

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser is installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(comment_html, 'html.parser')

    titles = [t.text.strip().lower() for t in soup.find_all('b')]

    # Remove bracketed spans and non-breaking spaces, then turn runs of two or
    # more spaces into line breaks.
    cleaned_text = re.sub(r'\[.*?\]', '', soup.text)
    cleaned_text = re.sub(r'\xa0', '', cleaned_text)
    cleaned_text = re.sub(r'[ ]{2,}', '\n', cleaned_text)
    cleaned_text = cleaned_text.lower()

    # One extraction per category, in the order of the returned tuple
    categories = ('history', 'physical_exam', 'diagnoses', 'problems', 'diagnostic_test')
    extracted = tuple(
        find_features_row(cleaned_text,titles,search_words[category])
        for category in categories
    )

    return (cleaned_text,) + extracted
    
    
# Run the full cleaning pipeline: raw CSV in, processed canine CSV out.
if __name__ == '__main__':
    process_data(
        train_path='../../data/data_raw/data.csv',
        output_path='../../data/data_processed/data_processed.csv',
    )

In [None]:
# Read the processed file back in and report its dimensions.
out_data = pd.read_csv(filepath_or_buffer='../../data/data_processed/data_processed.csv')
out_data.shape

In [None]:
# Preview the last rows of the processed output.
out_data.tail(5)