#### Introductory code: configure this notebook to display all output

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell rather than only
# the last one; the cells below each contain several bare expressions.
InteractiveShell.ast_node_interactivity = "all"

#### Introductory code: import functions

In [None]:
import nltk, re, pprint
from nltk import word_tokenize
from nltk import sent_tokenize

#### Open a transcript file called "text1.txt": this should be saved in the current working directory. The next few lines then "tokenise" this, i.e. convert into the right format for analysis.

In [None]:
# Read the whole transcript. The `with` block closes the file handle as soon
# as the contents have been read; `ref` stays bound afterwards, so its type
# can still be displayed.
with open('text1.txt') as ref:
    raw = ref.read()
type(ref)
type(raw)

# Split the raw string into tokens (punctuation marks become tokens too).
tokens = word_tokenize(raw)
type(tokens)

# Wrap the token list in an nltk.Text, the object the later cells analyse.
text = nltk.Text(tokens)
type(text)




#### Produce a dispersion plot of some key words (can change the words as you like, e.g. include key clinical terms)

In [None]:
# Key words to locate across the transcript; edit this list as needed.
key_words = ["name", "pain", "medicine"]
text.dispersion_plot(key_words)



#### Look at collocations (words that frequently occur beside each other)
#### look at words that appear in a similar context to 'pain'
#### look at shared contexts between the words pressure and pain

In [None]:
# Words that frequently occur beside each other.
print("Collocations")
text.collocations()

# Words that appear in the same kind of context as the target word.
target_word = "pain"
print("", "Words in similar contexts", sep="\n")
text.similar(target_word)

# Contexts that the compared words have in common.
compared_words = ["pressure", "pain"]
print("", "Common contexts", sep="\n")
text.common_contexts(compared_words)
# Open question from the original author: how to examine just the contexts
# that are SHARED by TWO or more words?




#### Examine amount of vocabulary and the repetition of vocabulary

In [None]:
# Total number of tokens; punctuation tokens are included in the count.
token_count = len(text)
print("Length of text")
token_count

# Size of the vocabulary: the number of distinct tokens.
print()
print("Size of vocabulary")
vocab_size = len(set(text))
vocab_size

# Lexical richness: distinct tokens as a fraction of all tokens.
print()
print("Lexical richness")
vocab_size / token_count

# Percentage of all tokens accounted for by one chosen word (here 'a').
print("What percentage of the text is taken up by a specific word?")
freq_word = text.count('a') * 100 / token_count
freq_word
	


#### Look at the most common words, the frequency of specific words; and words that only appear once ('hapaxes')

In [None]:
# Frequency distribution over every token in the text.
fdist1 = nltk.FreqDist(text)
# Printing it gives a one-line summary of the form
# <FreqDist with N samples and M outcomes>.
print(fdist1)

# The 50 most common tokens together with their counts.
top_n = 50
fdist1.most_common(top_n)

# Frequency of one particular word.
fdist1['doctor']

# Cumulative frequency plot for the same top 50 tokens.
fdist1.plot(top_n, cumulative=True)

# Hapaxes, i.e. tokens occurring only once.
fdist1.hapaxes()
	


#### Produce a list of modal words (e.g. 'could', 'may', 'might') and count their frequency

In [None]:
# Case-insensitive frequency distribution: every token is lower-cased first.
fdist = nltk.FreqDist(token.lower() for token in text)

# Print each modal verb with its count, all on one line.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for modal in modals:
    print(f"{modal}:", fdist[modal], end=' ')

		
		


#### Count the number of questions: assume that each '?' indicates one question

In [None]:
# Each '?' token is assumed to mark one question. `fdist` is the lower-cased
# frequency distribution built in the modal-words cell.
print('Number of questions: ', fdist['?'])

#### Stemming versus Lemmatisation (part 1: stemming)

In [None]:
	##### STEMMING V LEMMATISATION:
# Create both stemmers up front; the Lancaster one is used in the next cell.
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

# Stem every token with the Porter stemmer and show the first 10 results.
porter_stems = list(map(porter.stem, tokens))
porter_stems[:10]

# 50 most common Porter stems.
porter_freq = nltk.FreqDist(porter_stems)
porter_freq.most_common(50)


In [None]:
# Stem every token with the Lancaster stemmer created in the previous cell.
lancaster_stems = list(map(lancaster.stem, tokens))

# First 10 items in the list of Lancaster stems.
lancaster_stems[:10]

# 50 most common items in the Lancaster stem frequency distribution.
lancaster_freq = nltk.FreqDist(lancaster_stems)
lancaster_freq.most_common(50)

"""
    Stemming = strip off the affixes (e.g. "lying" becomes "lie" // government becomes govern // strange becomes strang)
	Here we try 2 of them. Porter & Lancaster.
	Stemming is NOT a well-defined process, and we typically pick the stemmer that best suits the application we have in mind. 
    The Porter Stemmer is a good choice if you are indexing some texts and want to SUPPORT SEARCH USING ALTERNATIVE FORMS OF WORDS.
"""



#### Stemming versus Lemmatisation (part 2: lemmatisation)

In [None]:
# Lemmatise every token with the WordNet lemmatizer and show the first 50.
print("50 x first lemmas in the text.")
wnl = nltk.WordNetLemmatizer()
wnl_list = list(map(wnl.lemmatize, tokens))
wnl_list[:50]

# Frequency distribution over the lemmas; show the 50 most common.
print("", "50 x most common lemmas", sep="\n")
wnl_freq = nltk.FreqDist(wnl_list)
wnl_freq.most_common(50)

"""
	Lemmatisation = ensure the word is recognised in the dictionary (lemma / headword)
	The WordNet lemmatizer is a good choice if you want to compile the vocabulary of some texts and want a list of valid lemmas (or lexicon headwords).
"""
	
	



#### Attach the Part of speech to each word (e.g. nouns, adjectives)

In [None]:
# Tag every token: the result is a list of (word, part-of-speech) tuples.
text_pos = nltk.pos_tag(text)
type(text_pos)
text_pos[:10]

# Collapse the pairs into a {word: tag} dictionary. Keys are unique, so a word
# that was tagged more than once keeps only the tag of its last occurrence.
text_pos_dict = {word: tag for word, tag in text_pos}
type(text_pos_dict)
text_pos_dict

# Look up information on the POS tags: first without an argument, then for
# the "JJ" tag specifically.
nltk.help.upenn_tagset()
nltk.help.upenn_tagset("JJ")

#### Count the frequency of each part of speech: e.g. adjectives and adverbs

In [None]:
# Counter is not imported by any of the cells above, so import it here;
# without this the cell fails with a NameError.
from collections import Counter

# Tally how many entries of the {word: tag} dictionary carry each tag. The
# dictionary holds each distinct word once, so this counts distinct words per
# part of speech rather than every occurrence in the text.
pos_count = Counter(text_pos_dict.values())
pos_count


#### Create separate dictionaries for some parts of speech: facilitate visual scrutiny

In [None]:

def _entries_with_tags(tagged_words, tags):
    """Return the {word: tag} items of *tagged_words* whose tag is in *tags*."""
    return {word: tag for word, tag in tagged_words.items() if tag in tags}


# Each section below prints a blank line and a label, then displays the
# {word: tag} dictionary for one group of tags followed by the list of its
# words, so that each part of speech can be inspected by eye.

print("")
print("CC coordinating conjunction (and but either both neither)")
coordinating_conjunction_dict = _entries_with_tags(text_pos_dict, {"CC"})
coordinating_conjunction_dict
coordinating_conjunction_list = list(coordinating_conjunction_dict)
coordinating_conjunction_list


print("")
print("JJ JJR JJS  adjective")
adjective_dict = _entries_with_tags(text_pos_dict, {"JJ", "JJR", "JJS"})
adjective_dict
adjective_list = list(adjective_dict)
adjective_list


print("")
print("NN NNP NNPS NNS noun")
noun_dict = _entries_with_tags(text_pos_dict, {"NN", "NNP", "NNPS", "NNS"})
noun_dict
noun_list = list(noun_dict)
noun_list


print("")
print("PRP personal pronoun")
personal_pronoun_dict = _entries_with_tags(text_pos_dict, {"PRP"})
personal_pronoun_dict
personal_pronoun_list = list(personal_pronoun_dict)
personal_pronoun_list


print("")
print("PRP$ pronoun possessive (her mine my)")
possessive_pronoun_dict = _entries_with_tags(text_pos_dict, {"PRP$"})
possessive_pronoun_dict
possessive_pronoun_list = list(possessive_pronoun_dict)
possessive_pronoun_list


print("")
print("RB RBR RBS adverb")
adverb_dict = _entries_with_tags(text_pos_dict, {"RB", "RBR", "RBS"})
adverb_dict
adverb_list = list(adverb_dict)
adverb_list


print("")
print("UH: interjection   Goodbye Goody Gosh Wow Jeepers Jee-sus")
interjection_dict = _entries_with_tags(text_pos_dict, {"UH"})
interjection_dict
interjection_list = list(interjection_dict)
interjection_list


