# NLTK Demo

## Tokenizing

### Sentence tokenizing

In [None]:
!pip install nltk
!pip install matplotlib

In [None]:
import nltk

# Fetch every NLTK data package requested by this notebook.
for resource_name in (
    "stopwords",
    "averaged_perceptron_tagger",
    "maxent_ne_chunker",
    "words",
):
    nltk.download(resource_name)
# Kept as the final expression so the cell still displays its return value.
nltk.download("book")

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize


# Sample passage shared by the sentence- and word-tokenizing cells.
example_string = """
Muad'Dib learned rapidly because his first training was in how to learn.
And the first lesson of all was the basic trust that he could learn.
It's shocking to find how many people do not believe they can learn,
and how many more believe learning to be difficult."""
# Split the passage into sentences and display the result.
sent_tokenize(example_string)

### Word tokenizing

In [None]:
# Word-level tokens of the sample passage; `words` is reused by the
# stop word and POS tagging sections further down.
words = word_tokenize(example_string)
words

In [None]:
# German version of the sample passage, tokenized with the language set to German.
german_string = """
Muad'Dib lernte schnell, denn seine erste Ausbildung galt dem Lernen.
Und die erste Lektion überhaupt war das Urvertrauen, das er lernen konnte.
Es ist schockierend zu sehen, wie viele Menschen nicht glauben, dass sie lernen können,
Und wie viele glauben noch, dass Lernen schwierig sei?
"""
word_tokenize(german_string, language="german")

## Stopwords

In [None]:
from nltk.corpus import stopwords

# Restrict the stop word list to English, the language of `words`. Calling
# stopwords.words() without a language returns the stop words of every
# language in the corpus, which also filters out English tokens that merely
# happen to be stop words in another language.
# A set keeps each membership test in the filters constant-time.
stop_words = set(stopwords.words("english"))
# Compare in lowercase because the stop word list is lowercase.
filtered_list = [word for word in words if word.lower() not in stop_words]
filtered_list

## Stemming

In [None]:
from nltk.stem import (
    PorterStemmer,
    LancasterStemmer
)


# Text whose words share the root "discover", used to compare the stemmers.
string_for_stemming = """
The crew of the USS Discovery discovered many discoveries.
Discovering is what explorers do."""

porter = PorterStemmer()
lancaster = LancasterStemmer()

# Stem every token of the text once with each stemmer.
porter_stemmed_words = list(map(porter.stem, word_tokenize(string_for_stemming)))
lancaster_stemmed_words = list(map(lancaster.stem, word_tokenize(string_for_stemming)))

In [None]:
# Display the stems produced by the Porter stemmer.
porter_stemmed_words

In [None]:
# Display the stems produced by the Lancaster stemmer.
lancaster_stemmed_words

## POS Tagging

In [None]:
import nltk


# Tag the tokens from the word-tokenizing section, then display the result.
pos_tags = nltk.pos_tag(words)
pos_tags

In [None]:
# Reference list of POS tags: NLTK's built-in help for the "upenn" tag set.
nltk.help.upenn_tagset()

## Lemmatizing

In [None]:
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()

# Lemmatize the stemming example token by token, without passing a part of speech.
list(map(lemmatizer.lemmatize, word_tokenize(string_for_stemming)))

In [None]:
# No part of speech is supplied, so the lemmatizer falls back to its default.
lemmatizer.lemmatize("worst")

In [None]:
# Same word with pos="a" (presumably WordNet's adjective tag), to compare
# with the call that passes no part of speech.
lemmatizer.lemmatize("worst", pos="a")

## Chunking

In [None]:
from nltk.draw.util import CanvasFrame
from nltk.draw import TreeWidget

# Sample sentence to chunk.
lotr_quote = "It's a dangerous business, Frodo, going out your door."
# POS-tag the tokens; the chunk grammar below matches on these tags.
# NOTE(review): this rebinds `pos_tags`, which the POS Tagging section also assigns.
pos_tags = nltk.pos_tag(word_tokenize(lotr_quote))
# NP chunk rule: optional determiner, any number of adjectives, one noun
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunk_parser = nltk.RegexpParser(grammar)
tree = chunk_parser.parse(pos_tags)

# Drawing the tree requires Tkinter, so the call is left disabled:
# tree.draw()

In [None]:
from pathlib import Path

from IPython.display import Image, display

# tree.png is a pre-rendered Tkinter drawing that this notebook does not
# create itself. Show it when it is available; otherwise fall back to NLTK's
# plain-text rendering of `tree` so the cell still runs on a fresh checkout.
tree_image_path = Path("tree.png")
if tree_image_path.exists():
    display(Image(filename=str(tree_image_path)))  # image drawn by Tkinter
else:
    tree.pretty_print()

## Named Entity Recognition

In [None]:
# Passage used as the input for named-entity extraction in this section.
quote = """
Men like Schiaparelli watched the red planet—it is odd, by-the-bye, that
for countless centuries Mars has been the star of war—but failed to
interpret the fluctuating appearances of the markings they mapped so well.
All that time the Martians must have been getting ready.

During the opposition of 1894 a great light was seen on the illuminated
part of the disk, first at the Lick Observatory, then by Perrotin of Nice,
and then by other observers. English readers heard of it first in the
issue of Nature dated August 2."""


def extract_ne(quote):
    """Return the set of named entities found in ``quote``.

    The text is word-tokenized as English, POS-tagged and passed through
    ``nltk.ne_chunk`` in binary mode. Every element of the result that is
    labelled ``"NE"`` is collapsed into one space-joined string of its tokens.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(quote, language='english'))
    chunked = nltk.ne_chunk(tagged_tokens, binary=True)
    entities = set()
    for node in chunked:
        # Elements without a label() method are skipped by the hasattr check.
        if hasattr(node, "label") and node.label() == "NE":
            entities.add(" ".join(leaf[0] for leaf in node))
    return entities


# Display the set of named entities extracted from the passage.
extract_ne(quote)

## Concordance

In [None]:
# Import only the texts this notebook uses instead of `import *`, so it is
# clear where text4 and text8 come from and no unrelated names leak into the
# notebook namespace.
from nltk.book import text4, text8


# Show each occurrence of "man" in text8 together with its surrounding context.
text8.concordance("man")

In [None]:
# Same concordance lookup on text8, this time for "woman".
text8.concordance("woman")

## Dispersion plots

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Dispersion plot of the five listed words across text4.
text4.dispersion_plot(["America", "democracy", "freedom", "duties", "citizens"])

## Frequency distribution

In [None]:
from nltk import FreqDist

# Keep purely alphabetic tokens of text8 whose lowercase form is not a stop
# word, then count how often each remaining token occurs.
meaningful_words = [
    token
    for token in text8
    if token.isalpha()
    if token.lower() not in stop_words
]
fq = FreqDist(meaningful_words)
fq.most_common(20)

In [None]:
# Plot 20 samples of the frequency distribution as plain (non-cumulative) counts.
fq.plot(20, cumulative=False)

## Collocations

In [None]:
# List the collocations NLTK finds in the raw text8 tokens.
text8.collocations()

In [None]:
# Lemmatize every token of text8 first, then look for collocations in the
# resulting text.
lemmatized_words = list(map(lemmatizer.lemmatize, text8))
new_text = nltk.Text(lemmatized_words)
new_text.collocations()