# Analyzing Medical Questions – Team 6

This notebook analyzes questions asked by health care providers ('providers') and the general public ('patients') to glean insight from both the provider and patient viewpoints.

* <a href="https://raw.githubusercontent.com/nina-lin/NLTK-analysis/main/data/providers.txt" target="_blank">providers.txt</a> contains data <a href="https://github.com/nina-lin/NLTK-analysis/blob/main/xml-extract.ipynb" target="_blank">extracted</a> from the <a href="https://datadiscovery.nlm.nih.gov/dataset/Clinical-Questions-Collection/i3a4-n4ma" target="_blank">National Institutes of Health's Clinical Questions Collection (1999 - 2003)</a>, a “repository of questions that have been collected between 1991 – 2003 from healthcare providers in clinical settings across the country.”


* <a href="https://raw.githubusercontent.com/nina-lin/NLTK-analysis/main/data/patients.txt" target="_blank">patients.txt</a> contains questions of a medical nature asked by the general public from two sources:
    1. The <a href="https://raw.githubusercontent.com/curai/medical-question-pair-dataset/master/mqp.csv" target="_blank">Medical Question Pairs (MQP) Dataset</a>, a list of patient-asked questions randomly sampled from a crawl of HealthTap
    2. Patient-asked questions <a href="https://github.com/nina-lin/NLTK-analysis/blob/main/scraper.ipynb" target="_blank">scraped</a> from www.thecorrect.com

<em>Code snippets adapted from <a href="https://github.com/gcrocetti" target="_blank">Giancarlo Crocetti's</a> CUS635 web scraping materials.</em>

In [None]:
import nltk
from nltk import FreqDist
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Read datasets
path = 'data/'
filePrefix = ''
categories = ['patients', 'providers']
dataset = {}         # category -> nltk.Text built from all lower-cased questions
dataset_raw = {}     # category -> list of lower-cased lines (newlines kept)
allFeatures = set()
questions = 0        # total number of lines (questions) over all categories
question_count = {}  # category -> number of lines (questions)

N = {}  # Number of questions in each corpus

for category in categories:
    fileName = path + filePrefix + category.lower() + '.txt'
    # The context manager closes the handle even if reading fails.
    with open(fileName, 'r') as f:
        lines = f.readlines()

    questions += len(lines)
    question_count[category] = len(lines)
    N[category] = len(lines)
    dataset_raw[category] = [line.lower() for line in lines]

    # Replace every newline with a space so the file becomes one lower-cased
    # string for the tokenizer; join avoids repeated string concatenation.
    text = ''.join(line.replace('\n', ' ').lower() for line in lines)

    # create tokens
    tokens = nltk.word_tokenize(text)
    dataset[category] = nltk.Text(tokens)

## Preprocessing

At this point, the dataset of questions asked by patients has been spellchecked.

In [None]:
# Token frequency distributions over the corpora stored in `dataset`
# (before punctuation removal, stopwording and lemmatization).
providersFD, patientsFD = (
    FreqDist(dataset[name]) for name in ('providers', 'patients')
)

#### Removing Punctuation & Stopwording

In [None]:
# Punctuation marks to look up, each listed exactly once. The literal holds
# the same set of characters as before but without invalid escape sequences
# (such as \^ or \*), which Python warns about, and without the repeated
# backslash and hyphen that printed duplicate rows.
punctuations = '.,"-\\/#!?$%^&*;:{}=_\'~()'
print ('Punctuation FD[providers] FD[patients]')
for punct in punctuations:
    print ('   {}  {:3d}   {:3d}'.format(punct,providersFD[punct], patientsFD[punct]))


In [None]:
from nltk.corpus import stopwords
dsCleaned = dict()  # cleaned dataset: category -> tokens after punctuation removal and stopwording

def removePunctuation(corpus):
    """Return the tokens of ``corpus`` that are not punctuation.

    A token is dropped when it is empty or when every one of its characters
    is a punctuation mark (for example ``'?'``, ``'--'`` or ``'...'``).
    Tokens that merely contain punctuation, such as ``'co-pay'``, are kept.

    Args:
        corpus: iterable of string tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # A set of single characters gives a true per-character membership test;
    # testing against the raw string would be a substring match instead.
    punctuations = frozenset('.,"-\\/#!?$%^&*;:{}=_\'~()')
    return [token for token in corpus if not punctuations.issuperset(token)]

# #note that token.decode('utf-8') was dropped for Python 3. decode ('utf-8') will throw an error. Please use the below snippet only with Python 2.
# def apply_stopwording(corpus, min_len):
#     filtered_corpus = [token.decode('utf-8') for token in corpus if (not token.decode('utf-8') in stopwords.words('english') and len(token)>min_len)]
#     return filtered_corpus

def stopwording(corpus, min_len):
    """Drop English stopwords and short tokens from ``corpus``.

    Args:
        corpus: iterable of string tokens.
        min_len: a token is kept only if it is strictly longer than this.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    # Load the stopword list once and keep it in a set; calling
    # stopwords.words() inside the loop reloaded and scanned it per token.
    stop_set = set(stopwords.words('english'))
    return [token for token in corpus if token not in stop_set and len(token) > min_len]


In [None]:
# Clean every corpus: drop punctuation tokens first, then drop stopwords and
# every token of three characters or fewer.
for category in categories:
    print ('Processing %s' % category)
    without_punct = removePunctuation(dataset[category])
    dsCleaned[category] = stopwording(without_punct, 3)
    print (dsCleaned[category])

#### Lemmatization

In [None]:
#nltk.download('wordnet')
dsFinal = dict()  # working dataset (final): category -> lemmatized tokens

# def stemming(corpus):
#     stemmer = nltk.PorterStemmer()
#     normalized_corpus = [stemmer.stem(token) for token in corpus]
#     return normalized_corpus

def lemmatization(corpus):
    """Lemmatize every token of ``corpus`` with NLTK's WordNetLemmatizer.

    Returns a new list holding one lemmatized token per input token, in the
    same order as the input.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, corpus))

# Lemmatize each cleaned corpus and store the result as the final dataset.
for category in categories:
    print ('Processing %s' % category)
    lemmas = lemmatization(dsCleaned[category])
    dsFinal[category] = lemmas
    print (lemmas)

#### Build Vocabulary

In [None]:
# Vocabulary of a category = its distinct final tokens, in sorted order.
vocabulary = {}
for category in categories:
    distinct_tokens = list(set(dsFinal[category]))
    distinct_tokens.sort()
    vocabulary[category] = distinct_tokens
    print ('vocabulary for %s = [%s]' % (category, distinct_tokens))

### Simple Analysis

#### Get Lexical Diversity

In [None]:
# Report corpus size (token count) and vocabulary size for both categories.
patient_tokens = dsFinal['patients']
patient_vocab = vocabulary['patients']
print ('Patient questions have a total of %s tokens and a vocabulary size of %s' % (len(patient_tokens), len(patient_vocab)))
provider_tokens = dsFinal['providers']
provider_vocab = vocabulary['providers']
print ('Provider questions have a total of %s tokens and a vocabulary size of %s' % (len(provider_tokens), len(provider_vocab)))

def lexical_diversity(text):
    """Return the average number of occurrences per distinct token.

    Computed as ``len(text) / len(set(text))``: 1.0 means no token repeats,
    and larger values mean more repetition.

    Args:
        text: sized collection of hashable tokens.

    Returns:
        The ratio as a float, or 0.0 when ``text`` is empty (instead of
        raising ZeroDivisionError).
    """
    total = len(text)
    if total == 0:
        return 0.0
    return total / len(set(text))

# Lexical diversity of each final corpus, keyed by category.
lexDiversity = {}
for category in categories:
    diversity = lexical_diversity(dsFinal[category])
    lexDiversity[category] = diversity
    print ('Lexical Diversity in %s = %s' % (category, diversity))

#### Counting Words

In [None]:
# print ('# time "disease" is used by health care providers %s' % ds['providers'].count('disease'))
# print ('# time "disease" is used by patients %s' % ds['patients'].count('disease'))

In [None]:
# Count how often each vocabulary term occurs in the provider corpus.
# One FreqDist pass replaces a list.count() call per term, which rescanned
# the whole corpus for every vocabulary entry.
providerCounts = FreqDist(dsFinal['providers'])
count = {token: providerCounts[token] for token in vocabulary['providers']}

# Most frequent first; ties keep the vocabulary's order because sorted() is stable.
for w in sorted(count, key = count.get, reverse=True):
    print (w, count[w])

In [None]:
# Count how often each vocabulary term occurs in the patient corpus.
# One FreqDist pass replaces a list.count() call per term, which rescanned
# the whole corpus for every vocabulary entry.
patientCounts = FreqDist(dsFinal['patients'])
count = {token: patientCounts[token] for token in vocabulary['patients']}

# Most frequent first; ties keep the vocabulary's order because sorted() is stable.
for w in sorted(count, key = count.get, reverse=True):
    print (w, count[w])

#### Frequency Distribution

In [None]:
def getFrequent(freq, n):
    """Return the ``n`` highest-count entries of ``freq`` as a dict.

    Args:
        freq: mapping of item -> count that supports iteration and ``.get``
            (for example a FreqDist or a plain dict).
        n: maximum number of entries to return.

    Returns:
        A dict ordered from highest to lowest count; items with equal counts
        keep the iteration order of ``freq``. Holds fewer than ``n`` entries
        when ``freq`` is smaller, and is empty when ``n`` is zero or negative.
    """
    # Guard explicitly: a counter compared with `== n` never matches for
    # n <= 0, which made the function return every entry.
    if n <= 0:
        return {}
    ranked = sorted(freq, key = freq.get, reverse = True)
    return {item: freq[item] for item in ranked[:n]}

# Show the 50 most frequent tokens of every corpus. Looping over `categories`
# explicitly removes the reliance on whatever value `category` was left with
# by an earlier cell. After the loop, `frequency` and `topTokens` describe the
# last category in `categories`.
for category in categories:
    frequency = nltk.FreqDist(dsFinal[category])
    topTokens = getFrequent(frequency, 50)
    print(category, topTokens)

#### Finding Important Words

In [None]:
# Rebind both distributions to the final corpora stored in `dsFinal`.
providersFD, patientsFD = map(FreqDist, (dsFinal['providers'], dsFinal['patients']))

In [None]:
# Print the FreqDist summary of every corpus; iterating explicitly avoids
# depending on a `category` value left over from an earlier cell.
for category in categories:
    print(category, FreqDist(dsFinal[category]))

In [None]:
# Plot the 20 most frequent tokens of each distribution (non-cumulative).
for fd, title in ((providersFD, 'Provider Tokens'), (patientsFD, 'Patient Tokens')):
    fd.plot(20, cumulative = False, title = title)

Extract important words and phrases, filtering out words shorter than 5 characters to ignore prepositions, determiners, and interrogative words ("who", "what").

In [None]:
# Print every patient token of five or more characters with its count.
for token in dsFinal['patients']:
    if len(token) < 5:
        continue
    print ('%s [%s]' % (token, patientsFD[token]))

In [None]:
# Print every provider token of five or more characters with its count.
# Counts are read from providersFD so the number beside each token is its
# frequency in the provider corpus, not in the patient corpus.
for token in dsFinal['providers']:
    if (len(token) >= 5):
        print ('%s [%s]' % (token, providersFD[token]))
        

#### Collocations, 2-grams & Co-Occurrences

In [None]:
# Wrap the patient tokens in an nltk.Text (stored back into dsFinal) and
# display its collocations as the cell's result.
patientText = nltk.Text(dsFinal['patients'])
dsFinal['patients'] = patientText
patientText.collocation_list()

In [None]:
# Wrap the provider tokens in an nltk.Text (stored back into dsFinal) and
# display its collocations as the cell's result.
providerText = nltk.Text(dsFinal['providers'])
dsFinal['providers'] = providerText
providerText.collocation_list()

In [None]:
from nltk.collocations import *
from nltk.util import ngrams

# Print every pair of adjacent tokens in the patient corpus.
print ('Generating bigrams')
for bigram in ngrams(dsFinal['patients'], 2):
    print (bigram)

In [None]:
# Print every pair of adjacent tokens in the provider corpus.
print ('Generating bigrams')
for bigram in ngrams(dsFinal['providers'], 2):
    print (bigram)

In [None]:
bigram = nltk.collocations.BigramAssocMeasures()
trigram = nltk.collocations.TrigramAssocMeasures()

# One pass per n-gram size: (heading to print, finder class, scoring function).
# For each category the finder gets a frequency filter of 5 and the 20 best
# n-grams by PMI are printed.
ngram_passes = (
    ('Finding frequent 2-grams', BigramCollocationFinder, bigram.pmi),
    ('\nFinding frequent 3-grams', TrigramCollocationFinder, trigram.pmi),
)

for heading, finder_cls, score_fn in ngram_passes:
    print (heading)
    for category in categories:
        finder = finder_cls.from_words(dsFinal[category])
        finder.apply_freq_filter(5)
        tokens = finder.nbest(score_fn, 20)
        print (tokens)

#### Lexical Resource

In [None]:
def proportion_cleantext(corpus, language):
    """Return the fraction of tokens in ``corpus`` that are not stopwords.

    Args:
        corpus: sized collection of string tokens.
        language: language name passed to ``nltk.corpus.stopwords.words``.

    Returns:
        A float between 0 and 1, or 0.0 when ``corpus`` is empty (instead of
        raising ZeroDivisionError).
    """
    total = len(corpus)
    if total == 0:
        return 0.0
    # A set gives constant-time membership tests; the local name also avoids
    # shadowing the imported `stopwords` module inside this function.
    stop_set = set(nltk.corpus.stopwords.words(language))
    kept = sum(1 for token in corpus if token not in stop_set)
    return kept / total

# Share of non-stopword tokens in each corpus stored in `dataset`.
language = 'english'
for category in categories:
    clean_share = proportion_cleantext(dataset[category], language)
    print ("Proportion of clean terms in the [%s] is %s" % (category, clean_share))


## Classification

### Naive Bayes

Define all functions needed for the implementation of the Naive Bayes classifier

<em>(Note that 

* tot_articles = questions 
* articles_count = question_count

as adapted from the NB classifier in <strong>Improving Classification of Web Articles</strong> for our own use)</em>

In [None]:
feature_count = {}
category_count = {}
probCat = {}

# Probability of each category = its question count divided by the total
# number of questions.
for category in categories:
    prior = float(question_count[category]) / questions
    probCat[category] = prior
    print ("%s - p(%s)=%s" % (category, category, prior))

In [None]:
freqWord = {}    # category -> FreqDist of that category's tokens
wordCounts = {}  # category -> number of tokens in that category

def buildFrequencies(dsFinal):
    """Fill the module-level ``freqWord`` and ``wordCounts`` tables in place.

    For every name in the module-level ``categories`` list, stores a FreqDist
    of ``dsFinal[category]`` and the length of that corpus. Returns None.
    """
    for category in categories:
        corpus = dsFinal[category]
        freqWord[category] = FreqDist(corpus)
        wordCounts[category] = len(corpus)

buildFrequencies(dsFinal)

In [None]:
sampleWord = 'blood'

# The labels are built from sampleWord so they stay correct when the sample
# word is changed; the output for 'blood' is unchanged.
print (f"Checking Frequencies for the word '{sampleWord}':")
print (f"F('{sampleWord}'|'providers') = {freqWord['providers'][sampleWord]}")
print (f"F('{sampleWord}'|'patients') = {freqWord['patients'][sampleWord]}")