# NLP basics tutorial extended

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

from wordcloud import WordCloud

import spacy
from spacy import displacy
import en_core_web_sm

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the small English spaCy pipeline; it is used for named entity recognition below
nlp = en_core_web_sm.load()

In [None]:
# Loading the raw text to analyse from disk
# The context manager guarantees the file handle is closed once the content is read
# NOTE(review): no encoding is passed, so the platform default applies — confirm the file's encoding
with open("./data/news.txt") as news_file:
    text = news_file.read()

In [None]:
# Sentence tokenization
# Split the text into sentences and preview the first 10
sent_tokenize(text)[:10]

In [None]:
# Word tokenization
# Split the text into individual tokens; the resulting list is reused by the cells below
words = word_tokenize(text)

In [None]:
# Preview the first 10 tokens
words[:10]

In [None]:
# Word distribution
# Count how many times each token appears in the text
frec_dist = FreqDist(words)

In [None]:
# Printing the 10 most common tokens together with their counts
frec_dist.most_common(10)

In [None]:
# Stop words
# Stop words are a set of very commonly used words in a language (any language, not just English).
# Removing them matters in many applications because it lets us focus on the more meaningful words instead.

# Stored as a set so the membership tests in the filtering step are fast
stop_words = set(stopwords.words("english"))

In [None]:
# Removing stop words
# Keep every token whose lowercase form is not a stop word; order and original casing are preserved
filtered_words = [token for token in words if token.lower() not in stop_words]

In [None]:
# Word distribution for the text without stop words
frec_dist_filtered = FreqDist(filtered_words)

In [None]:
# Printing the 10 most common tokens once stop words have been removed
frec_dist_filtered.most_common(10)

In [None]:
# Plotting a horizontal bar chart with the frequencies of the n most common words
n = 10

# Query the distribution once and reuse the (word, count) pairs for both axes
top_words = frec_dist_filtered.most_common(n)
labels = [word for word, _ in top_words]
counts = [count for _, count in top_words]

plt.figure(figsize=(15, 8))
plt.barh(labels, counts)
# Invert the y axis so the first (most frequent) entry is drawn at the top
plt.gca().invert_yaxis()

In [None]:
# Creating the word cloud from the raw text
# Word size is directly related to frequency
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(text)

# Show the cloud as an image with the axes hidden
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Creating the word cloud from the tokens left after removing stop words
# Word size is directly related to frequency
wordcloud = WordCloud(
    max_font_size=50,
    max_words=100,
    background_color="white",
).generate(" ".join(filtered_words))

# Show the cloud as an image with the axes hidden
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Part of Speech (PoS) tagging
# Tag every token that survived stop-word removal with its part of speech
# The meaning of each tag can be found here: https://www.guru99.com/pos-tagging-chunking-nltk.html
pos_tags = nltk.pos_tag(filtered_words)

In [None]:
# Sorting the tagged tokens in place by their tag (second element of each pair)
pos_tags.sort(key=lambda pair: pair[1])
# Display the sorted list
pos_tags

In [None]:
# Named Entity Recognition (NER)
# Run the spaCy pipeline over the raw text, then render the result with displaCy's entity style
document = nlp(text)
displacy.render(document, style = "ent")