# NLP file for medical consultations

## Part 1: prepare for loop over all .txt files
1. set up Stanford dependency parser.
2. create the positive & negative word lists:  sentiment_general_inquirer_import.ipynb
3. create functions for: 1: tokenize // 2: tokenize & stem
## Part 2: LOOP - once for every .txt file
Loop over every text file and calculate results
## Part 3: After loop - summarise results
And correlate with speaker type

### User Input

In [None]:
#### Words to match on
list_words1 = [
    "i",
    "you",
    "we",
    "decide",
    "decision",
    "option",
    "options",
]
list_words2 = []

#### Speaker identifiers (these are followed by a colon at the start of each turn)
speaker_ids = [
    "P1",
    "N1",
    "D1",
    "D2",
    "R1",
    "R2",
]

#### Convert speaker identities into lower case (the text file will also be converted to lower case)

In [None]:
# Lower-case every speaker identifier, then echo the resulting list.
speaker_ids = list(map(str.lower, speaker_ids))
speaker_ids

### import dependencies

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.parse.stanford import StanfordDependencyParser
from nltk.parse.stanford import StanfordParser
from bs4 import BeautifulSoup
import re
import os
import glob
import codecs
from sklearn import feature_extraction
import mpld3
from matplotlib import pyplot as plt
from itertools import zip_longest
import spacy
from collections import Counter
from nltk import sent_tokenize
from nltk import word_tokenize
import io
import xlrd
import pprint


### Python files used by this script

**sentiment_tokenize.py** <br>
**Output:** tokeniser functions e.g. tokenize_only // open_and_tokenize <br>

**sentiment_general_inquirer_import.ipynb** <br>
**Output:** produce 2 x lists of pos / neg words (called positive & negative)  <br>

**count_num_speakers.py** <br>
**Output:** 1x List: Num_speakers  <br>

               
**turns_per_speaker.py** <br>
**Output:** 1x Dict = Num_turns <br>
               

**pos_neg_score_append.py** <br>
**Output:** 2x Lists: positive_score_list   <br>
negative_score_list  <br>


**pos_neg_words_set_freq.py** <br>
**Output:** 2x Dicts = pos_FreqDist_all_dialogue  &  neg_FreqDist_all_dialogue   <br>
2x Sets = pos_set_all_dialogue  &  neg_set_all_dialogue  <br>


**parts_of_speech.ipynb**  <br>
**Output:**  1x dictionary:  pos_consultation <br>


**spacy_dep_parse.ipynb**  <br>
**Output:** 1x dictionary:  dep_parse_consultation  <br>

**count_med_terms.ipynb** <br>
**Output:**  2x Dictionaries:  clin_concepts_number_per_consultation  clin_classes_type_number_per_consultation   <br>
2x Sets:  clin_classes_set_final  clin_classes_set

**loop_per_speaker.ipynb** <br>
**Output:** 3x dictionaries: 1: per_speaker (contains most of the per-speaker results)   <br>
2: clin_concepts_number_per_consultation_speaker    3: clin_classes_type_number_per_consultation_speaker  <br>

**loop_per_speaker_inner_loop.ipynb** <br>
**Output:** Produces the POS / sentiment / active passive verb analysis for the foregoing file (loop_per_speaker.ipynb)

#### configure to display results of all content of cells

In [None]:
# Make IPython echo the value of every top-level expression in a cell,
# rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### display the environment variables

In [None]:
# Show the CLASSPATH environment variable; this lookup raises KeyError when it is not set.
print(os.environ['CLASSPATH'])

### Methodology to create dictionary for results. i.e. link participant identifiers with metrics.


In [None]:
#### The 2 x metric prefixes recorded for every speaker:
metrics = ["Num_turns_", "Num_words_"]

#### One key per (metric, speaker) pair, grouped by metric:
metrics_speaker = [prefix + speaker for prefix in metrics for speaker in speaker_ids]

#### Pair every key with a starting value of zero
values = np.zeros(len(metrics_speaker))
metrics_speaker_dict = dict(zip(metrics_speaker, values))

print("Print Dictionary Contents")
metrics_speaker_dict




### spacy preparation & demo

In [None]:
# Load the spaCy pipeline registered under the shortcut name 'en'.
# NOTE(review): shortcut names such as 'en' are no longer accepted by spaCy 3 — confirm the installed version.
nlp = spacy.load('en')

In [None]:
# Demo: parse a few short sentences and print, for every token, its tag,
# syntactic head, dependency relation and children.
doc = nlp("I cured the illness. The illness was cured by me. A red cup. I ran quickly.")
### Note: apply to a tokenised document:   new_doc = nlp(str(tokens))

# Walk the sentence spans directly; the previous index-based loop rebuilt
# list(doc.sents) for every condition check and every lookup.
for sentence in doc.sents:
    for word in sentence:
        print("Word:", word.text)
        print("Tag:", word.tag_)
        print("Head:", word.head.text)
        print("Dependency relation:", word.dep_)
        print("Children:", list(word.children))
        print("")


In [None]:
# With this function in our toolbox, we can write a loop that prints out the subtree for each word in a sentence:

def flatten_subtree(st):
    """Join the ``text_with_ws`` of every item in *st* and strip outer whitespace.

    *st* is any iterable of objects exposing a ``text_with_ws`` string
    attribute; the concatenated text is returned as a single string.
    """
    pieces = (token.text_with_ws for token in st)
    return ''.join(pieces).strip()

In [None]:
# Retrieve the noun subjects relating to active and passive phrases.

# Print each token's dependency label and collect the subtree text of every
# active (nsubj) or passive (nsubjpass) subject, in document order.
subjects_list = []
for word in doc:
    print(word.dep_)
    if word.dep_ in ('nsubj', 'nsubjpass'):
        subject_text = flatten_subtree(word.subtree)
        subjects_list.append(subject_text)
print("subjects_list: {}".format(subjects_list))

In [None]:
#### Retrieve the noun subjects for active / passive phrases SEPARATELY (and count the number of each)

# Subjects of active phrases (dependency label 'nsubj') and how many there are
nsubj_subjects = [
    flatten_subtree(word.subtree) for word in doc if word.dep_ == 'nsubj'
]
nsubj_count = len(nsubj_subjects)
print("nsubj_subjects: {}".format(nsubj_subjects))
print("nsubj_count: {}".format(nsubj_count))

# Subjects of passive phrases (dependency label 'nsubjpass') and how many there are
nsubjpass_subjects = [
    flatten_subtree(word.subtree) for word in doc if word.dep_ == 'nsubjpass'
]
nsubjpass_count = len(nsubjpass_subjects)
print("nsubjpass_subjects: {}".format(nsubjpass_subjects))
print("nsubjpass_count: {}".format(nsubjpass_count))

### Demo of lexical dispersion plot and collocation / hapaxes

In [None]:
# Read the demo transcript; the `with` block closes the file handle as soon
# as the contents have been read (it was previously left open).
with open('text4.txt') as ref:
    raw = ref.read()
# The type() echoes stay at top level so the notebook still displays them.
type(ref)
type(raw)
tokens = word_tokenize(raw)
type(tokens)

# Wrap the token list in an nltk.Text for the exploration cells that follow.
text = nltk.Text(tokens)
type(text)

#### Produce a dispersion plot of some key words (can change the words as you like, e.g. include key clinical terms)

In [None]:
# Draw a lexical dispersion plot for the listed words over `text`.
text.dispersion_plot(["name", "sugar", "family", "exercise"])


#### Look at collocations (words that frequently occur beside each other)
#### look at words that appear in a similar context to 'sugar'
#### look at shared contexts between the words 'sugar' and 'exercise'

In [None]:
# Explore `text` with three nltk.Text helpers; each section is preceded by a printed label.
print("Collocations")
text.collocations()

print()
print("Words in similar contexts")
text.similar("sugar")

print()
print("Common contexts")
text.common_contexts(["sugar", "exercise"])
# TODO: how to examine just the contexts that are SHARED by TWO or more words?

#### Examine amount of vocabulary and the repetition of vocabulary

In [None]:
# Number of tokens in the text (punctuation tokens are included)
print("Length of text")
text_length = len(text)
text_length

# Number of distinct tokens
print()
print("Size of vocabulary")
vocab_size = len(set(text))
vocab_size

# Lexical richness: distinct tokens as a fraction of all tokens
print()
print("Lexical richness")
vocab_size / text_length

# Share of the text (in percent) made up of one specific word
print("What percentage of text is taken up by the word 'exercise'?")
freq_word = text.count('exercise') * 100 / text_length
freq_word


#### Look at the most common words, the frequency of specific words; and words that only appear once ('hapaxes')

In [None]:
# Build a frequency distribution over the tokens of `text` and print its summary line.
fdist1 = nltk.FreqDist(text)  
print(fdist1)  
# Example of the summary format: <FreqDist with 19317 samples and 260819 outcomes>

# The 50 most common tokens
fdist1.most_common(50) 

# Frequency of one particular word
fdist1['doctor']

# Cumulative frequency plot for the top 50 tokens
fdist1.plot(50, cumulative=True)

# Hapaxes, i.e. words occurring only once
fdist1.hapaxes()
	

#### Count the number of questions: assume that each '?' indicates one question

In [None]:
# The count of '?' tokens is reported as the number of questions.
print('Number of questions: ', fdist1['?'])

### Create the 2 x lists of positive & negative words from General Inquirer


In [None]:
# Run the General Inquirer import notebook inside this notebook's namespace (-i).
%run -i sentiment_general_inquirer_import.ipynb

#### print sample of first and last positive & negative words

In [None]:
# Echo the first five and last five entries of the `positive` and `negative` word lists.
print("Positive sample")
positive[:5]
positive[-5:]

print()
print("Negative sample")
negative[:5]
negative[-5:]

### Define tokenizer and stemmer functions

In [None]:
# Run sentiment_tokenize.py inside this notebook's namespace (-i).
%run -i sentiment_tokenize.py


#### Assert tokenizer works.

In [None]:
# Smoke-test the two tokenizer helpers: once on a literal string, once on the
# first file (index 0) of the data folder.
print("1: tokenize_only")
test = "Hi my friends"
tokenize_only(test)

print("")
print("2: open_and_tokenize")
# In the main loop this call is repeated with counter incremented by 1 each time.
counter = 0
folder  = "sentiment_text_data"
open_and_tokenize(counter, folder)


In [None]:
# Run create_scores.py inside this notebook's namespace (-i).
%run -i create_scores.py

#### Check number of text files in the directory

In [None]:
# Count only the .txt transcripts. os.listdir() also counts sub-directories,
# hidden files and any other non-transcript entries, which would make the
# per-file loop run more times than there are transcripts.
# NOTE(review): assumes open_and_tokenize() indexes the .txt files of `folder` — confirm.
no_files = len(glob.glob(os.path.join(folder, "*.txt")))
print("Number of files in the directory: {}".format(no_files))

# Part 2: Loop (i.e. for every text file in folder)


### Analyse content - use procedural coding to avoid issues re local / global variables in functional approach

In [None]:
# Reset the file index. The remaining lines of this cell are disabled code.
counter = 0

# %run -i loop_per_speaker.ipynb

#while counter < no_files:
#    tokens = open_and_tokenize(counter, folder)
#    counter += 1


In [None]:


#########################################################
#########################################################  SCRIPTS WITH THEIR OWN INTERNAL LOOP.
#########################################################

# Both notebooks below are run once, inside this notebook's namespace (-i).

############################################################  Medical terminology
print("count_med_terms.ipynb")
%run -i  count_med_terms.ipynb

# The string below describes count_med_terms.ipynb, which has just been run.
"""
Count the number of medical concepts:
Output: 2x Dict:
 clin_concepts_number_per_consultation
 clin_classes_type_number_per_consultation
"""

############################################################  Loop over each speaker.
print("loop_per_speaker.ipynb")
%run  -i loop_per_speaker.ipynb


In [None]:




#########################################################
#########################################################  SCRIPTS THAT GO INSIDE THE LOOP.
#########################################################

# Main per-file loop: for each index below no_files, tokenize that file and
# then run every analysis script in turn. Each script is run with -i, i.e.
# inside this notebook's namespace.
# NOTE(review): the scripts presumably read `tokens` / `counter` from that
# shared namespace — confirm before renaming either variable.
# The triple-quoted strings describe the script run immediately above them.
counter = 0

while counter < no_files:
    tokens = open_and_tokenize(counter, folder)

    ############################################################  Number speakers & words & turns
    %run -i count_num_speakers.py
    """
    Output = List    Num_speakers
    if the speaker id appears in the text ('tokens') - +1 to last value in list
    """

    %run -i turns_per_speaker.py
    """
    Output = dictionary    Num_turns
    Each time speaker id appears at start of sentence: +1 to final value for the speaker 
    """

    ############################################################  Positive & Negative sentiment
    %run -i pos_neg_score_append.py
    """
    Output = 2xLists   positive_score_list & negative_score_list
    count number of pos & neg words in each transcript 'tokens' - add 1 number to relevant list
    """
    
    %run -i pos_neg_words_set_freq.py
    """
    Output = 2xLists & 2xDicts   
    Lists:  pos_set_all_dialogue          neg_set_all_dialogue
    Dicts:  pos_FreqDist_all_dialogue     neg_FreqDist_all_dialogue
    1. count freq of all pos / neg words across all files.
    2. create a set showing each pos / neg word once
    """

    ############################################################  Parts of speech
    %run -i parts_of_speech.ipynb
    """
    Output: 1x dictionary
    pos_consultation
    """
    
    ############################################################  Active & Passive phrases
    %run -i spacy_dep_parse.ipynb
    """
    Output: 1x dictionary
    dep_parse_consultation
    """

    
    ############################################################  Count specific words & phrases specified in user input
    %run -i count_words_phrases.py
 
    # Move on to the next file.
    counter += 1


# Part 3: after loop: create summary metrics & show results

## Per Consultation

In [None]:

### total no. pos / neg sentiment words per consultation
# Print or echo the per-consultation results gathered by the loop and scripts above.
print(positive_num)
print(negative_num)


### pos / neg score lists
print(positive_score_list)
print(negative_score_list)


### no. of unique pos and neg sentiment words
print(len(pos_set_all_dialogue))
print(len(neg_set_all_dialogue))


### sets of unique pos and neg sentiment words
print(pos_set_all_dialogue)
print(neg_set_all_dialogue)


### Freq Dist pos and neg sentiment words
pos_FreqDist_all_dialogue
neg_FreqDist_all_dialogue


### speakers & turns
Num_speakers
Num_turns


### clin concepts & classes
clin_concepts_number_per_consultation
clin_classes_type_number_per_consultation

#### Print results of: Positive & Negative word set, Pos & Neg frequency distribution

In [None]:
# Echo the set of positive words, then run the results script inside this notebook's namespace (-i).
pos_set_all_dialogue
%run -i sentiment_pos_neg_results.py

#### Print results of: Turn taking: number of words and sentences per speaker

In [None]:
%run sentiment_turn_taking_results.py

#### Print results of: Active & Passive noun dependencies

In [None]:
%run sentiment_active_passive.py

## Per Speaker

In [None]:
# Echo the two per-speaker clinical-term dictionaries, each under a printed label.
print("Number of clinical concepts per consultation")
clin_concepts_number_per_consultation_speaker

print("How many times each classification appeared in each transcript")
clin_classes_type_number_per_consultation_speaker


In [None]:
### Active & passive verbs /// Parts of speech (adjective, adverb, verb, personal pronoun) 
    ###     /// Positive and negative sentiment words
# Echo the per_speaker results dictionary.
per_speaker