In [185]:
# An example of commonly used NLTK functions.
# More info at http://www.nltk.org/py-modindex.html

import nltk
# Uncomment the download dialogue below if you don't have the corpora installed
#nltk.download()

from nltk.book import * #This says "from NLTK's book module, load all items."
# It is just a small module for educational purposes, used in chapter 1 of the NLTK tutorial.
# In this code we replicate the classes that are used in the nltk.book module.
from nltk.corpus import gutenberg, webtext
from __future__ import print_function

from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams


#Loading
###########################################

# Note: this first part is mostly useful for working with the NLTK built-in corpora,
# e.g. num_sents = len(gutenberg.sents('austen-emma.txt')).
# To use external text corpora, see the "Text objects" cell further down.
######################
######################
# List the files of each corpus once and reuse the lists below.
gutenberg_fileids = gutenberg.fileids()
webtext_fileids = webtext.fileids()

print('File ids of the gutenberg corpus:\n{}\n'.format(gutenberg_fileids))
print('File ids of the webtext corpus:\n{}\n'.format(webtext_fileids))

# Selecting two sample texts
text1_id = gutenberg_fileids[0]
text2_id = webtext_fileids[1]
print('Text1 id is: ', text1_id)
print('Text2 id is: ', text2_id)
# words() gives each text as a sequence of tokens (words and punctuation marks).
text1_list = gutenberg.words(text1_id)
text2_list = webtext.words(text2_id)
print()

#Tokenizing
###########################################

# Text lengths (number of tokens)
print('Text1 {} lenght: {} tokens'.format(text1_id, len(text1_list)))
print('Text2 {} lenght: {} tokens'.format(text2_id, len(text2_list)))
print()

# Printing a part of the sample texts
print('Text1_list : ', text1_list[0:10])
print('Text2_list : ', text2_list[0:10])
print()

# Accessing: look a token up by value, then read it back by position.
# The index is kept in a variable instead of being hard-coded, so the
# round trip stays correct if a different text is selected above.
flower_index = text1_list.index('flower')
print('Index of the token "flower" in text1_list: ', flower_index)
print('Token number {} in text1_list: '.format(flower_index), text1_list[flower_index])
print()

# Using the nltk raw string
# The same character window is shown for the raw text and for the string rebuilt
# by joining the tokens with spaces, so the two can be compared side by side.
preview_start, preview_end = 291, 400
text1_raw = gutenberg.raw(text1_id)
text1_string = " ".join(text1_list)
print('Text1_raw, characters [{}:{}] of the raw text :\n'.format(preview_start, preview_end),
      text1_raw[preview_start:preview_end])
print('Text1_string, characters [{}:{}] of the joined tokens :\n'.format(preview_start, preview_end),
      text1_string[preview_start:preview_end])
print()

# Resplitting: split the joined string on whitespace to get a plain list of tokens again.
text1_string_resplit = text1_string.split()

File ids of the gutenberg corpus:
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

File ids of the webtext corpus:
['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']

Text1 id is:  austen-emma.txt
Text2 id is:  grail.txt

Text1 austen-emma.txt lenght: 192427 tokens
Text2 grail.txt lenght: 16967 tokens

Text1_list :  ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
Text2_list :  ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop', 'clop']

Index of the token "flower" in text1_list:  110552
Token number 110552 in text1_list:  flower

Text1_raw, first 10

In [182]:
#Counting
###########################################

# Note: to count characters, use the raw text. A string built with join() from a
#       token list has extra whitespace around punctuation, e.g.
#           raw:      I asked her: "how are you?"
#           rejoined: I asked her : " how are you ? "
######################
######################
# Compute each size once, for the text selected in the first cell, and reuse it below.
num_chars = len(gutenberg.raw(text1_id))
num_words = len(gutenberg.words(text1_id))
num_sents = len(gutenberg.sents(text1_id))

print('Characters number in raw text {}: '.format(text1_id), num_chars)
print('Characters number in rejoined text string: ', len(text1_string))  # reminder: text1_string=" ".join(text1_list)
print()

print('Tokens number in raw text {}: '.format(text1_id), num_words)
print()

# sents() yields sentences, so this is a sentence count (not a token count).
print('Sentences number in raw text {}: '.format(text1_id), num_sents)
print('Tokens number in this rejoined text: ', len(text1_string_resplit))

# set() keeps one copy of each distinct token, so its size is the number of unique tokens.
print('Unique tokens number in text1 is: ', len(set(text1_list)))
print('Occurence number of the word "good" in text1 is: ', text1_list.count('good'))
print()

# FreqDist is a data object that holds the frequency of every token in the text.
fd = nltk.FreqDist(text1_list)
print('Type of the variable "fd" :', type(fd))
print('Frequency of the word "good" in text1 is:', fd['good'])
# Other accessors to try: fd.keys(), fd.values(), fd.items()

# fd.N() is the total number of tokens that were counted, not a per-word frequency.
print('Total number of tokens counted in fd is:', fd.N())

Characters number in raw text austen-emma.txt:  887071
Characters number in rejoined text string:  915041

Tokens number in raw text austen-emma.txt:  192427

Tokens number in raw text austen-emma.txt:  7752
Tokens number in this rejoined text:  192427
Unique tokens number in text1 is:  7811
Occurence number of the word "good" in text1 is:  340

Type of the variable "fd" : <class 'nltk.probability.FreqDist'>
Frequency of the word "good" in text1 is: 340
Frequency of the word "good" in text1 is: 192427


In [187]:
#Text objects / working with out of corpora texts
###########################################

# Creating an instance of the NLTK Text object.
# Despite its name, raw_text_sample is a list of tokens (the re-split text from the
# first cell), not a raw string: Text() is built here from that token list.
raw_text_sample = text1_string_resplit
text_object = Text(raw_text_sample)

#Concordance - show a word together with its surrounding context
###########################################
# The number of lines is kept in one place so the label and the call cannot disagree.
concordance_lines = 7
print('{} concordance of the word "precisely":'.format(concordance_lines))
text_object.concordance(word='precisely', width=60, lines=concordance_lines)

#Similarity - find words that appear in the same contexts
###########################################
print()
print('"precisely" similar contexs:')
text_object.similar('precisely')  # prints its result directly

#Common context - contexts shared by all the given words
###########################################
# The label is built from the queried words so that it always names the words
# actually passed to common_contexts().
context_words = ['head', 'heart']
print()
print('"{}" and "{}" common contexs (tokens in left and rightside):'.format(*context_words))
text_object.common_contexts(context_words)  # prints its result directly

7 concordance of the word "precisely":
Displaying 7 of 7 matches:
osity . The yeomanry are precisely the order of people with
old you yesterday he was precisely the height of Mr . Perry
mbled there , consisting precisely of those whose society w
r stay at home ." It was precisely what Emma would have wis
s time with us . This is precisely what I wanted . Well , p
rank the whole spring -- precisely the season of the year w
an , and his manners are precisely what I like and approve 

"precisely" similar contexs:
just to in at exactly perhaps by for more now however so that not all
on this over gone always

"head" and "hart" common contexs (tokens in left and rightside):
his_at his_and her_was my_and my_i own_and
