In [1]:
# Explore NLTK's built-in text corpora (Gutenberg and webtext) and their reader methods.
# You can also load and use your own external texts. See the part "Load your own text".
# More info at https://www.nltk.org/book/ch02.html

# A __future__ import has to be the first statement: it happens to work mid-cell
# inside a notebook, but it is a SyntaxError once this cell is exported to a .py script.
from __future__ import print_function

import nltk
# Uncomment the download dialogue in case you don't have the corpora
#nltk.download()

#from nltk.book import * #This says "from NLTK's book module, load all items."
# nltk.book is just a small module for educational purposes in chapter 1 of the NLTK tutorial.
# In this code we replicate the classes that are used in the nltk.book module.
from nltk.corpus import gutenberg, webtext

from nltk.text import Text
from nltk.probability import FreqDist
from nltk.util import bigrams
from nltk import word_tokenize


#Loading
###########################################

# Ask each corpus reader for its file ids once and reuse the lists below.
gutenberg_fileids = gutenberg.fileids()
webtext_fileids = webtext.fileids()

print('File ids of the gutenberg corpus:\n{}\n'.format(gutenberg_fileids))
print('File ids of the webtext corpus:\n{}\n'.format(webtext_fileids))

#Selecting two sample texts: the first Gutenberg file and the second webtext file
text1_id = gutenberg_fileids[0]
text2_id = webtext_fileids[1]
print('Text1 id is: ', text1_id)
print('Text2 id is: ', text2_id)
text1_words = gutenberg.words(text1_id)
text2_words = webtext.words(text2_id)
print()

#Printing a part of the sample texts
print('Text1_words : ', text1_words[0:10])
print('Text2_words : ', text2_words[0:10])
print()

#Accessing
# Look the position up once and reuse it, so the reverse lookup below always
# matches the index that was just found instead of a hard-coded number.
flower_index = text1_words.index('flower')
print('Index of the token "flower" in text1_words: ', flower_index)
print('Token number {} in text1_words: '.format(flower_index), text1_words[flower_index])
print()

#Using nltk raw string
#Notice the difference between the raw text and the rejoined text1_words:
#the latter has extra whitespace and no line breaks.
text1_raw = gutenberg.raw(text1_id)
text1_string = " ".join(text1_words)
print('Text1_raw, first 100 characters:\n', text1_raw[0:100])
print()
print('Text1_string, made with join() method has extra whitespaces:\n', text1_string[0:100])
print()

File ids of the gutenberg corpus:
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

File ids of the webtext corpus:
['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']

Text1 id is:  austen-emma.txt
Text2 id is:  grail.txt

Text1_words :  ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', 'VOLUME', 'I', 'CHAPTER']
Text2_words :  ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop', 'clop']

Index of the token "flower" in text1_words:  110552
Token number 110552 in text1_words:  flower

Text1_raw, first 100 characters:
 [Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhous

In [2]:
#Tokenizing and Counting
###########################################

#Note: For counting the characters you must use e.g. the raw text. If your text string is made by the join() method
#      from a list, your text will have extra whitespace. e.g.   I asked her: "how are you?"
#                                                         vs     I asked her : " how are you ? "
######################
######################

#Text lengths
# text1_raw and text1_words were already loaded for text1_id, so reuse them
# instead of reading the corpus file again.
characters_number = len(text1_raw)
words_number = len(text1_words)
sentences_number = len(gutenberg.sents(text1_id))

print('Characters number of text1_raw {}: {}'.format(text1_id, characters_number))
print('Words number of text1_raw {}: {} '.format(text1_id, words_number))
print('Sentences number of text1_raw {}: {} '.format(text1_id, sentences_number))
print()
print('Characters number of text1_string {}: {}'.format(text1_id, len(text1_string)))
print()


#Notice that set() returns a set (not a list) holding each distinct token once
print('Unique tokens number in text1_words is: ', len(set(text1_words)))
print('Occurence number of the word "good" in text1_words is: ', text1_words.count('good'))
print()
print('Type of the variable "text1_raw" :', type(text1_raw))
#Tokenizing with the python split() method (splits on whitespace only)
print('Tokens number of text1_raw {} using split(): {} '.format(text1_id, len(text1_raw.split())))
#Tokenizing with nltk.word_tokenize
print('Tokens number of text1_raw {} using nltk.word_tokenize: {} '.format(text1_id, len(word_tokenize(text1_raw))))
print()

# FreqDist builds a data object that contains information about word frequency;
# indexing it by a token gives that token's count.
fd = FreqDist(text1_words)
print('Type of the variable "fd" :', type(fd))
print('Frequency of the word "good" in text1 is:', fd['good'])
# Other accessors to try: fd.keys(), fd.values(), fd.items()

Characters number of text1_raw austen-emma.txt: 887071
Words number of text1_raw austen-emma.txt: 192427 
Sentences number of text1_raw austen-emma.txt: 7752 

Characters number of text1_string austen-emma.txt: 915041

Unique tokens number in text1_words is:  7811
Occurence number of the word "good" in text1_words is:  340

Type of the variable "text1_raw" : <class 'str'>
Tokens number of text1_raw austen-emma.txt using split(): 158167 
Tokens number of text1_raw austen-emma.txt using nltk.word_tokenize: 191673 

Type of the variable "fd" : <class 'nltk.probability.FreqDist'>
Frequency of the word "good" in text1 is: 340


In [3]:
#Text objects / Concordance, similarity, common_contexts
###########################################

#Creating text list
# text1_words is a lazy corpus view; word_tokenize() on the raw string gives a plain list.
print('Type of the variable "text1_words" :', type(text1_words))
text1_list = word_tokenize(text1_raw)
print('Type of the variable "text1_list" :', type(text1_list))
print()

#Creating an instance of the NLTK Text object
text_object = Text(text1_list)

# Words queried below; the printed labels are built from these so that a label
# can never name a different word than the one actually looked up.
target_word = 'precisely'
context_words = ['head', 'heart']

#Concordance
###########################################

#7 concordances of the word "precisely":
print('7 concordance of the word "{}":'.format(target_word))
text_object.concordance(word=target_word, width=60, lines=7)

#Similarity - find words that are in common context
###########################################
print()
print('"{}" similar contexs:'.format(target_word))
text_object.similar(target_word)

#Common context
###########################################
print()
print('"{}" and "{}" common contexs (tokens in left and rightside):'.format(*context_words))
text_object.common_contexts(context_words)

Type of the variable "text1_words" : <class 'nltk.corpus.reader.util.StreamBackedCorpusView'>
Type of the variable "text1_list" : <class 'list'>

7 concordance of the word "precisely":
Displaying 7 of 7 matches:
osity . The yeomanry are precisely the order of people with
old you yesterday he was precisely the height of Mr. Perry 
mbled there , consisting precisely of those whose society w
stay at home . '' It was precisely what Emma would have wis
s time with us . This is precisely what I wanted . Well , p
rank the whole spring -- precisely the season of the year w
an , and his manners are precisely what I like and approve 

"precisely" similar contexs:
just to in at exactly perhaps by for more now however so that not all
on this over gone always

"head" and "hart" common contexs (tokens in left and rightside):
his_at his_and her_was my_and my_i own_and
