In [6]:
import nltk
import nltk.corpus
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Shakespeare

In [7]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [8]:
caesar=nltk.corpus.gutenberg.words('shakespeare-caesar.txt')

In [11]:
filedata=caesar.read(lines)

AttributeError: 'StreamBackedCorpusView' object has no attribute 'read'

In [28]:
string='''
The best and most beautiful things in the world 
cannot be seen or even touched, they must be felt with the heart
'''

In [29]:
quotes_tokens=nltk.word_tokenize(string)
quotes_tokens

['The',
 'best',
 'and',
 'most',
 'beautiful',
 'things',
 'in',
 'the',
 'world',
 'can',
 'not',
 'be',
 'seen',
 'or',
 'even',
 'touched',
 ',',
 'they',
 'must',
 'be',
 'felt',
 'with',
 'the',
 'heart']

In [30]:
from nltk.util import bigrams,ngrams,trigrams

In [31]:
quotes_bigrams=list(nltk.bigrams(quotes_tokens))

In [9]:
quotes_bigrams

[('The', 'best'),
 ('best', 'and'),
 ('and', 'most'),
 ('most', 'beautiful'),
 ('beautiful', 'things'),
 ('things', 'in'),
 ('in', 'the'),
 ('the', 'world'),
 ('world', 'can'),
 ('can', 'not'),
 ('not', 'be'),
 ('be', 'seen'),
 ('seen', 'or'),
 ('or', 'even'),
 ('even', 'touched'),
 ('touched', ','),
 (',', 'they'),
 ('they', 'must'),
 ('must', 'be'),
 ('be', 'felt'),
 ('felt', 'with'),
 ('with', 'the'),
 ('the', 'heart')]

In [10]:
quotes_trigrams=list(nltk.trigrams(quotes_tokens))

In [11]:
quotes_trigrams

[('The', 'best', 'and'),
 ('best', 'and', 'most'),
 ('and', 'most', 'beautiful'),
 ('most', 'beautiful', 'things'),
 ('beautiful', 'things', 'in'),
 ('things', 'in', 'the'),
 ('in', 'the', 'world'),
 ('the', 'world', 'can'),
 ('world', 'can', 'not'),
 ('can', 'not', 'be'),
 ('not', 'be', 'seen'),
 ('be', 'seen', 'or'),
 ('seen', 'or', 'even'),
 ('or', 'even', 'touched'),
 ('even', 'touched', ','),
 ('touched', ',', 'they'),
 (',', 'they', 'must'),
 ('they', 'must', 'be'),
 ('must', 'be', 'felt'),
 ('be', 'felt', 'with'),
 ('felt', 'with', 'the'),
 ('with', 'the', 'heart')]

# PorterStemmer

In [12]:
from nltk.stem import PorterStemmer
pst=PorterStemmer()

In [13]:
pst.stem("having")

'have'

In [14]:
words_to_stem="give given giving gave".split()

In [15]:
for word in words_to_stem:
    print(word,":",pst.stem(word))

give : give
given : given
giving : give
gave : gave


# LancasterStemmer

In [16]:
from nltk.stem import LancasterStemmer
lst=LancasterStemmer()

In [17]:
for word in words_to_stem:
    print(word,":",lst.stem(word))

give : giv
given : giv
giving : giv
gave : gav


# SnowballStemmer

In [18]:
from nltk.stem import SnowballStemmer

In [19]:
sbs=SnowballStemmer("english")

In [20]:
for word in words_to_stem:
    print(word,":",sbs.stem(word))

give : give
given : given
giving : give
gave : gave


# Lemmatization

In [32]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
word_lem=WordNetLemmatizer()

In [33]:
for words in words_to_stem:
    print(words,":",word_lem.lemmatize(word))

give : gave
given : gave
giving : gave
gave : gave


# Parts of Speech

In [41]:
sent="Timothy is a natural when it comes to drawing."
sent2="Too bad I won't leave long enough to see her again."
sent_token=nltk.word_tokenize(sent)
sent2_token=nltk.word_tokenize(sent2)

In [None]:
sent_

In [42]:
for token in sent_token:
    print(nltk.pos_tag([token]))

[('Timothy', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('natural', 'JJ')]
[('when', 'WRB')]
[('it', 'PRP')]
[('comes', 'VBZ')]
[('to', 'TO')]
[('drawing', 'VBG')]
[('.', '.')]


In [43]:
for token in sent2_token:
    print(nltk.pos_tag([token]))

[('Too', 'NN')]
[('bad', 'JJ')]
[('I', 'PRP')]
[('wo', 'MD')]
[("n't", 'RB')]
[('leave', 'VB')]
[('long', 'RB')]
[('enough', 'RB')]
[('to', 'TO')]
[('see', 'VB')]
[('her', 'PRP$')]
[('again', 'RB')]
[('.', '.')]


# Named Entity Recognition (NER)

In [44]:
from nltk import ne_chunk

In [53]:
NE_sent="The US President stays in the White House in Baltimore"

In [54]:
ne_token=nltk.word_tokenize(NE_sent)
ne_pos=nltk.pos_tag(ne_token)

In [55]:
ne_pos

[('The', 'DT'),
 ('US', 'NNP'),
 ('President', 'NNP'),
 ('stays', 'VBZ'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('White', 'NNP'),
 ('House', 'NNP'),
 ('in', 'IN'),
 ('Baltimore', 'NNP')]

In [56]:
ne_ner=ne_chunk(ne_pos)
print(ne_ner)

(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  (FACILITY White/NNP House/NNP)
  in/IN
  (GPE Baltimore/NNP))


In [57]:
#test 2

In [58]:
NE_sent="Mert went to college in Turkey"
ne_token=nltk.word_tokenize(NE_sent)
ne_pos=nltk.pos_tag(ne_token)
ne_ner=ne_chunk(ne_pos)
print(ne_ner)

(S
  (PERSON Mert/NNP)
  went/VBD
  to/TO
  college/NN
  in/IN
  (GPE Turkey/NNP))
