# Step 3: Quality Control

## Setup

In [1]:
pip install nltk



In [None]:
import re
import pandas as pd
import numpy as np
from spellchecker import SpellChecker
from master_functions import *


In [None]:
# `nltk` is otherwise only imported in the Keyword Extraction section further
# down, so on a fresh kernel (Restart & Run All) this cell would fail with a
# NameError unless `master_functions` happens to re-export it. Import it here.
import nltk

# Tokenizer models and stopword lists used by the keyword-extraction cells
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Suppress false positive warnings
# NOTE(review): this filter ignores *every* warning category for the rest of
# the session, not only the false positives. That includes pandas warnings
# about assigning into filtered frames, which several later cells would
# otherwise trigger -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

#### Load Datasets

In [None]:
# Pipe-delimited output of Step 2, read straight from the project repository.
# Every column is kept as a string (dtype=object) and empty cells stay as ''
# rather than NaN (na_filter=False).
step2_source = 'https://raw.githubusercontent.com/lisavirginia/clinical-abbreviations/master/code/Step2Output.csv'
df = pd.read_csv(
    step2_source,
    dtype=object,
    na_filter=False,
    index_col=False,
    header=0,
    sep='|',
)

In [None]:
# Reproducible three-row preview of the crosswalk
df.sample(n=3, random_state=0)

In [None]:
# (rows, columns) of the crosswalk as loaded
df.shape

## Identify Errors

#### Heuristic 1: Duplicates

Identify which records exactly duplicate another record from the same source.

In [None]:
# Flag every repeat of an (SF, LF, Source) combination; the first occurrence
# of each combination is not flagged.
is_repeat = df.duplicated(subset=['SF', 'LF', 'Source'])
Extract1 = df[is_repeat]
Extract1.shape

#### Heuristic 2: Punctuation

Identify excess punctuation in the long form (e.g. "nitric oxide;").

In [None]:
# Punctuation after LF (excludes .+%()[])
# The pattern is a raw string: the original non-raw literal relied on invalid
# escape sequences such as "\/" and "\$", which newer Python versions warn
# about. The resulting regex text is unchanged.
trailing_punct = r""".*[,\/#!\$\^&@\?<>\*:;{}=\-_'~"]$"""
# na=False treats any non-string cell as "no match", same as the old `== True`
Extract2_1 = df[df['LF'].str.contains(trailing_punct, na=False)]
Extract2_1.shape

In [None]:
# Punctuation before LF (excludes .+%()[])
# The pattern is a raw string: the original non-raw literal relied on invalid
# escape sequences such as "\/" and "\$", which newer Python versions warn
# about. The resulting regex text is unchanged.
leading_punct = r"""^[,\/#!\$\^&@\?<>\*:;{}=\-_'~"].*"""
# na=False treats any non-string cell as "no match", same as the old `== True`
Extract2_2 = df[df['LF'].str.contains(leading_punct, na=False)]
Extract2_2.shape

Identify excess punctuation in the short form (e.g. "..IVF").

In [None]:
# Excess periods before SF
# Raw string avoids the invalid "\." escape of the original non-raw literal;
# the regex text itself is unchanged.
leading_periods = r'^[\.]+.*'
# na=False treats any non-string cell as "no match", same as the old `== True`
Extract2_3 = df[df['SF'].str.contains(leading_periods, na=False)]
Extract2_3.shape

#### Heuristic 3: Spelling

The long form contains spelling errors (e.g. "cncer").

In [None]:
# Configure the spell checker
spell = SpellChecker(distance=1)

# Extend its vocabulary with the medical word corpus (UMLS Metathesaurus)
spell.word_frequency.load_text_file('data/ClinSpell.txt')

# Records sourced from UMLS or ADAM are left out of the spelling check
skipped_sources = ['UMLS', 'ADAM']
subset = df[~df['Source'].isin(skipped_sources)]

# Accumulators filled by the spelling loop:
# RecordIDs with unknown words, and the unknown words found for each of them
misspelled_rows, misspelled_data = [], []

In [None]:
# Iterate over subset
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every hit (which would break the one-entry-per-row match
# relied on when `misspelled_data` is later assigned as a column).
misspelled_rows = []
misspelled_data = []

# Compiled once, outside the loop. Raw strings avoid the invalid "\s" / "\-"
# escapes of the original non-raw literal; the regex text is unchanged.
non_word_chars = re.compile(r'[^A-Za-z\s\-]+')
word_breaks = re.compile(r'[\s\-]+')

for record_id, long_form in zip(subset['RecordID'], subset['LF']):

    # Format LF for spellchecker: keep letters, whitespace and hyphens,
    # lower-case, then split on whitespace/hyphens and drop empty pieces
    cleaned = non_word_chars.sub('', long_form).lower()
    tokens = [piece for piece in word_breaks.split(cleaned) if piece]

    # Identify misspelled LFs
    misspelled = spell.unknown(tokens)
    if misspelled:
        misspelled_rows.append(record_id)
        misspelled_data.append(misspelled)

In [None]:
# Rows of the crosswalk whose RecordID was collected by the spelling loop
has_misspelling = df['RecordID'].isin(misspelled_rows)
Extract3 = df[has_misspelling]
Extract3.shape

#### Heuristic 4: Content

The alphabetic characters in the short form don't occur anywhere in the long form.

In [None]:
# Only these two sources are checked by this heuristic
problem_sources = ['Vanderbilt Clinic Notes', 'Vanderbilt Discharge Sums']
subset = df[df['Source'].isin(problem_sources)]

# Accumulators filled by the content loop:
# RecordIDs with missing letters, and the missing letters for each of them
missing_character, missing_char_data = [], []

In [None]:
# Iterate over dataframe
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every hit (which would break the one-entry-per-row match
# relied on when `missing_char_data` is later assigned as a column).
missing_character = []
missing_char_data = []

# Compiled once, outside the loop
non_letters = re.compile('[^A-Za-z]+')

for record_id, short_form, long_form in zip(subset['RecordID'], subset['SF'], subset['LF']):

    # Extract alphabetic characters (case-insensitive, as sets of letters)
    alph_SF = set(non_letters.sub('', short_form).lower())
    alph_LF = set(non_letters.sub('', long_form).lower())

    # Letters of the short form that never appear in the long form
    absent = alph_SF - alph_LF

    # A lone missing 'x' is tolerated -- presumably because 'x' is used as a
    # shorthand letter in clinical abbreviations; confirm with the data owners.
    if absent and absent != {'x'}:
        missing_character.append(record_id)
        missing_char_data.append(absent)

In [None]:
# Rows of the crosswalk whose RecordID was collected by the content loop
lacks_letters = df['RecordID'].isin(missing_character)
Extract4 = df[lacks_letters]
Extract4.shape

#### Heuristic 5: User-Identified

In [None]:
# Markers that appear in long forms flagged by users
user_markers = ["#000066", "typo", "not an abbreviation", "not an acronym"]

# A row is selected when its long form contains at least one marker
user_flagged = df['LF'].str.contains(user_markers[0])
for marker in user_markers[1:]:
    user_flagged = user_flagged | df['LF'].str.contains(marker)

Extract5 = df[user_flagged]
Extract5.shape

## Format

#### Add Columns

In [None]:
# Error type, decision, modification
# One entry per extract: (frame, value for 'error', value for 'action').
# For the spelling and content extracts the 'error' value is the per-row list
# collected by the corresponding loop.
extract_labels = [
    (Extract1, "duplicate", "retire"),
    (Extract2_1, "punctuation after LF", "modify"),
    (Extract2_2, "punctuation before LF", "modify"),
    (Extract2_3, "punctuation before SF", "modify"),
    (Extract3, misspelled_data, "modify"),
    (Extract4, missing_char_data, "modify"),
    (Extract5, "user identified", "retire"),
]
for extract, error_value, action_value in extract_labels:
    extract['error'] = error_value
    extract['action'] = action_value

#### Merge

In [None]:
# Stack all extracts; a record caught by several heuristics appears once per hit
all_extracts = [
    Extract1,
    Extract2_1,
    Extract2_2,
    Extract2_3,
    Extract3,
    Extract4,
    Extract5,
]
errors = pd.concat(all_extracts)
errors.shape

In [None]:
# Keep one row per RecordID. The first occurrence wins, so the label of the
# earliest heuristic in the concatenation order is the one retained.
errors = errors.drop_duplicates(subset=["RecordID"], keep="first")
errors.shape

#### Export

In [None]:
# Write the automatically detected errors as a pipe-delimited file with a
# header row and without the index column
errors.to_csv('data/Errors_Automated.csv', sep='|', header=True, index=False)

## Import Errors

#### Import Annotated

In [None]:
# Replace `errors` with the annotated file. Same read options as the
# crosswalk: pipe-delimited, all columns as strings, empty cells kept as ''.
errors = pd.read_csv(
    'data/Errors_Annotated.csv',
    dtype=object,
    na_filter=False,
    index_col=False,
    header=0,
    sep='|',
)

In [None]:
# Reproducible three-row preview of the annotated errors
errors.sample(n=3, random_state=0)

In [None]:
# (rows, columns) of the annotated errors
errors.shape

In [None]:
# Number of annotated records per action value
errors['action'].value_counts()

#### Remove None

In [None]:
# Drop the records annotated with action 'none'
needs_action = errors['action'].ne('none')
errors = errors[needs_action]
errors.shape

#### Subset Crosswalk

In [None]:
# Keep a handle on the complete crosswalk before filtering.
# NOTE(review): running this cell a second time would bind `df_all` to the
# already-filtered frame; run it once per kernel session.
df_all = df

# Crosswalk without any record that still appears in `errors`
flagged_ids = errors['RecordID']
df = df_all[~df_all['RecordID'].isin(flagged_ids)]
df.shape

#### Subset Errors

In [None]:
# Original (unfiltered) rows of every record annotated as 'retire'
retire_ids = errors.loc[errors['action'] == 'retire', 'RecordID']
retire = df_all[df_all['RecordID'].isin(retire_ids)]
retire.shape

In [None]:
# Annotated rows marked 'modify', restricted to the first 19 columns
# (presumably the crosswalk columns, i.e. without the added error/action
# columns -- verify against the annotated file's layout)
to_modify = errors['action'] == 'modify'
modify = errors[to_modify].iloc[:, :19]
modify.shape

## Modify

#### Retire Duplicates

In [None]:
# Identify duplicates
# The crosswalk is stacked first, so a modified record that repeats an
# (SF, LF, Source) combination already seen above it is the one flagged.
stacked = pd.concat([df, modify])
repeats = stacked.duplicated(subset=['SF', 'LF', 'Source'])
dups = stacked[repeats]
dups.shape

In [None]:
# Remove from modify, then renumber the rows from 0
not_duplicate = ~modify['RecordID'].isin(dups['RecordID'])
modify = modify[not_duplicate].reset_index(drop=True)
modify.shape

In [None]:
# Add to retire: append the original rows of the duplicate records and
# renumber the combined frame from 0
duplicate_originals = df_all[df_all['RecordID'].isin(dups['RecordID'])]
retire = pd.concat([retire, duplicate_originals], ignore_index=True)
retire.shape

#### Strip Source Data

The source-derived fields below are cleared because their values may no longer be valid once a record has been modified.

In [None]:
# Blank out these columns for every record being modified
stale_columns = [
    'SFUI', 'NormSF', 'NSFUI', 'PrefSF',
    'LFUI', 'NormLF', 'PrefLF', 'SFEUI',
    'LFEUI', 'Type', 'Score', 'Count',
    'Frequency', 'UMLS.CUI',
]
for column in stale_columns:
    modify[column] = ''

In [None]:
# Reproducible three-row preview after blanking the columns
modify.sample(n=3, random_state=0)

#### Reassign Normalized Short Form

In [None]:
# Recompute NormSF from each record's SF.
# `normalized_short_form` is not defined in this notebook; presumably it comes
# from the `master_functions` star import -- confirm.
modify['NormSF'] = modify['SF'].apply(normalized_short_form)

In [None]:
# Reproducible three-row preview after recomputing NormSF
modify.sample(n=3, random_state=0)

#### Reassign SFUI

In [None]:
# Search existing SFUIs
# For each short form, take the SFUI of its first occurrence in the complete
# crosswalk. This replaces a row-by-row loop that wrote through
# `modify['SFUI'].iat[...]`: that chained assignment is not guaranteed to
# reach `modify` (it is a no-op under pandas Copy-on-Write) and it also
# required the index labels to equal row positions.
existing_sfui = df_all.drop_duplicates(subset='SF').set_index('SF')['SFUI']

# Short forms that do not exist in the crosswalk keep an empty SFUI
modify['SFUI'] = modify['SF'].map(existing_sfui).fillna('')

In [None]:
# If none, add SFUI
# `add_new_SFUI` is not defined in this notebook (presumably provided by the
# `master_functions` star import); its result replaces `modify`.
modify = add_new_SFUI(modify)
# Reproducible three-row preview of the result
modify.sample(3, random_state=0)

#### Reassign LFUI

In [None]:
# Search existing LFUIs
# For each long form, take the LFUI of its first occurrence in the complete
# crosswalk. This replaces a row-by-row loop that wrote through
# `modify['LFUI'].iat[...]`: that chained assignment is not guaranteed to
# reach `modify` (it is a no-op under pandas Copy-on-Write) and it also
# required the index labels to equal row positions.
existing_lfui = df_all.drop_duplicates(subset='LF').set_index('LF')['LFUI']

# Long forms that do not exist in the crosswalk keep an empty LFUI
modify['LFUI'] = modify['LF'].map(existing_lfui).fillna('')

In [None]:
# If none, add LFUI
# `add_new_LFUI` is not defined in this notebook (presumably provided by the
# `master_functions` star import); its result replaces `modify`.
modify = add_new_LFUI(modify)
# Reproducible three-row preview of the result
modify.sample(3, random_state=0)

# Keyword Extraction
Here we use keyword extraction. In natural language processing (NLP), keyword extraction serves a variety of purposes, including:

- Document Summary: Summarize content by identifying and extracting important topics.
- Information Retrieval: Improve search engine performance by indexing and retrieving documents based on keywords.
- Content Tagging: Categorize and tag content for better organization and structure.
- Content Recommendations: Improve personalized content recommendations by analyzing user requests.
- Social Media Monitoring: Monitor trends, sentiment, and popular topics on social media platforms.
- SEO: Optimize your web content by identifying and incorporating relevant keywords to improve search engine rankings.
- Market Research: Analyze customer feedback and opinions to understand market trends.
- Legal Analytics: Extract keywords from legal documents for faster analysis and compliance monitoring.
- Text Clustering: Group similar documents based on extracted keywords.
- Question Answering: Improve the accuracy of question answering systems by identifying relevant keywords.
- Healthcare Text Mining: Extract keywords from biomedical literature and medical records for research and analysis.


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist

In [None]:
def extract_keywords(df, min_freq=2):
    """Return the words of a text that occur at least ``min_freq`` times.

    The text is split into sentences and then into word tokens. Tokens are
    lower-cased; tokens that are not purely alphanumeric (e.g. punctuation)
    and English stopwords are discarded before counting.

    Parameters
    ----------
    df : str
        The text to analyse. Despite its name this is a plain string, not a
        DataFrame; the name is kept so existing keyword calls keep working.
    min_freq : int, default 2
        Minimum number of occurrences for a word to be reported. The default
        keeps the original behaviour of reporting words seen more than once.

    Returns
    -------
    list of str
        The lower-cased words whose count is at least ``min_freq``.
    """
    # English stopwords to exclude from the count
    stop_words = set(stopwords.words('english'))

    # Sentence-split first, then tokenize each sentence into words
    words = [
        token.lower()
        for sentence in sent_tokenize(df)
        for token in word_tokenize(sentence)
        if token.isalnum() and token.lower() not in stop_words
    ]

    # Frequency distribution over the remaining words
    word_freq = FreqDist(words)

    return [word for word, freq in word_freq.items() if freq >= min_freq]

In [None]:
# Demonstration of extract_keywords on a hard-coded sample paragraph.
# NOTE(review): inside a notebook kernel `__name__` is "__main__", so this
# guard is expected to always pass and the demo runs whenever the cell runs.
if __name__ == "__main__":
    # Example text (replace this with your actual text)
    text = """
    Hey there! I've been diving into Natural Language Processing (NLP), which is this super cool field in artificial intelligence all about how computers and humans can chat using regular language. NLP techniques help computers analyze, understand, and even generate human language in a way that makes sense and fits the context.
    I'm particularly excited about keyword extraction – it's this really important job in NLP. What it does is figure out the most important words or phrases in a piece of text. """

    keywords = extract_keywords(text)  # Words that occur more than once in the sample
    print("Keywords:", keywords)     # Show the extracted keywords


#### Add "Modified" Column

In [None]:
# Tag the corrected records as "modified"; untouched crosswalk rows get ""
for frame, flag in ((modify, "modified"), (df, "")):
    frame["Modified"] = flag

#### Append to Crosswalk

In [None]:
# Put the corrected records back into the crosswalk, order by RecordID and
# renumber the rows from 0.
# NOTE(review): RecordID was read as a string column, so this ordering is
# textual rather than numeric -- confirm that is the intended order.
df = (
    pd.concat([df, modify])
    .sort_values(by=['RecordID'])
    .reset_index(drop=True)
)
df.shape

## Export

#### Export Modify

In [None]:
# Get original rows: `modify` now becomes the pre-correction version of the
# records that were modified, taken from the complete crosswalk
modified_ids = modify['RecordID']
modify = df_all[df_all['RecordID'].isin(modified_ids)]
modify.shape

In [None]:
# Pipe-delimited export with a header row and without the index column
modify.to_csv('ModifiedRecords.csv', sep='|', header=True, index=False)

#### Export Retire

In [None]:
# Pipe-delimited export with a header row and without the index column
retire.to_csv('RetiredRecords.csv', sep='|', header=True, index=False)

#### Export Crosswalk

In [None]:
# Pipe-delimited export with a header row and without the index column
df.to_csv('Step3Output.csv', sep='|', header=True, index=False)