# NLTK Cheatsheet

## Import Libraries

In [1]:

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

## Tokenization

In [2]:

text = "NLTK is a leading platform for building Python programs to work with human language data."
# Sentence tokenization
sent_tokens = sent_tokenize(text)
print("Sentence Tokenization:", sent_tokens)

# Word tokenization
word_tokens = word_tokenize(text)
print("Word Tokenization:", word_tokens)


Sentence Tokenization: ['NLTK is a leading platform for building Python programs to work with human language data.']
Word Tokenization: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'programs', 'to', 'work', 'with', 'human', 'language', 'data', '.']


## Stopwords Removal

In [3]:

# Stopwords removal
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_tokens if word.lower() not in stop_words]
print("Filtered Words:", filtered_words)


Filtered Words: ['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human', 'language', 'data', '.']


## Frequency Distribution

In [4]:

# Frequency distribution
fdist = FreqDist(word_tokens)
print("Frequency Distribution:", fdist)
print("Most Common Words:", fdist.most_common(5))


Frequency Distribution: <FreqDist with 16 samples and 16 outcomes>
Most Common Words: [('NLTK', 1), ('is', 1), ('a', 1), ('leading', 1), ('platform', 1)]


## Stemming

In [5]:

# Stemming
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in word_tokens]
print("Stemmed Words:", stemmed_words)


Stemmed Words: ['nltk', 'is', 'a', 'lead', 'platform', 'for', 'build', 'python', 'program', 'to', 'work', 'with', 'human', 'languag', 'data', '.']


## Lemmatization

In [6]:

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
print("Lemmatized Words:", lemmatized_words)


Lemmatized Words: ['NLTK', 'is', 'a', 'leading', 'platform', 'for', 'building', 'Python', 'program', 'to', 'work', 'with', 'human', 'language', 'data', '.']


## Part of Speech Tagging

In [7]:

# Part of speech tagging
pos_tags = pos_tag(word_tokens)
print("Part of Speech Tags:", pos_tags)


Part of Speech Tags: [('NLTK', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('leading', 'VBG'), ('platform', 'NN'), ('for', 'IN'), ('building', 'VBG'), ('Python', 'NNP'), ('programs', 'NNS'), ('to', 'TO'), ('work', 'VB'), ('with', 'IN'), ('human', 'JJ'), ('language', 'NN'), ('data', 'NNS'), ('.', '.')]


## Named Entity Recognition

In [8]:

# Named entity recognition
named_entities = ne_chunk(pos_tags)
print("Named Entities:", named_entities)


Named Entities: (S
  (ORGANIZATION NLTK/NNP)
  is/VBZ
  a/DT
  leading/VBG
  platform/NN
  for/IN
  building/VBG
  (PERSON Python/NNP)
  programs/NNS
  to/TO
  work/VB
  with/IN
  human/JJ
  language/NN
  data/NNS
  ./.)


## Synonyms and Antonyms

In [9]:

# Synonyms and antonyms
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())

print("Synonyms of 'good':", set(synonyms))
print("Antonyms of 'good':", set(antonyms))


Synonyms of 'good': {'beneficial', 'sound', 'respectable', 'honest', 'in_effect', 'undecomposed', 'skillful', 'commodity', 'adept', 'trade_good', 'well', 'soundly', 'salutary', 'upright', 'secure', 'practiced', 'in_force', 'full', 'unspoilt', 'serious', 'skilful', 'effective', 'expert', 'goodness', 'honorable', 'safe', 'just', 'right', 'ripe', 'proficient', 'good', 'near', 'unspoiled', 'thoroughly', 'estimable', 'dear', 'dependable'}
Antonyms of 'good': {'evilness', 'evil', 'badness', 'bad', 'ill'}


## WordNet

In [10]:

# WordNet
synsets = wordnet.synsets("program")
print("Synsets of 'program':", synsets)
print("Definition of the first synset:", synsets[0].definition())
print("Examples of the first synset:", synsets[0].examples())


Synsets of 'program': [Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]
Definition of the first synset: a series of steps to be carried out or goals to be accomplished
Examples of the first synset: ['they drew up a six-step plan', 'they discussed plans for a new bond issue']
