# Basic NLP tutorial (Spanish)

In [None]:
# Download the Spanish spaCy pipeline (es_core_news_md) that this notebook imports and loads.
!python -m spacy download es_core_news_md

In [None]:
import os

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tag import StanfordPOSTagger

from wordcloud import WordCloud

import spacy
from spacy import displacy
import es_core_news_md

from pymongo import MongoClient

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Relative paths to the Stanford POS tagger jar and its Spanish model;
# both are handed to StanfordPOSTagger in the Part of Speech cell.
jar = "./pos-tagger/stanford-postagger/stanford-postagger-4.2.0.jar"
model = "./pos-tagger/stanford-postagger/models/spanish-ud.tagger"
# NOTE(review): presumably read by NLTK to locate the Java binary that runs
# the tagger jar -- confirm the path matches the local Java installation.
os.environ["JAVAHOME"] = "/usr/bin/java"

In [None]:
# Load the Spanish spaCy pipeline; it is used for Named Entity Recognition
# in the last cell of the notebook.
nlp = es_core_news_md.load()

In [None]:
# Open the MongoDB connection and select the collection the articles are read from
client = MongoClient(host="146.148.63.14", port=27017)
db = client.get_database("news")
collection = db.get_collection("elespectador")

In [None]:
# Gather the title, summary and body of every article into one flat list of strings.
fields = ("title", "summary", "full_text")
projection = {"title": 1, "summary": 1, "full_text": 1, "_id": 0}

text = []
# Iterate the cursor directly instead of materialising every document in a list first.
for news in collection.find({}, projection):
    for field in fields:
        value = news.get(field)
        # A document may lack a field or hold a null/non-text value in it;
        # skipping those avoids a KeyError here and a TypeError when the
        # fragments are later joined into one string.
        if isinstance(value, str):
            text.append(value)

In [None]:
# Merge all collected fragments into a single space-separated string.
# The guard keeps this cell safe to re-run: once `text` is already a string,
# joining it again would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)

In [None]:
# Display the full concatenated corpus.
text

In [None]:
# Word tokenization
# word_tokenize defaults to the English Punkt model for its sentence-splitting
# step; the corpus is Spanish, so the Spanish model is requested explicitly.
words = word_tokenize(text, language="spanish")

In [None]:
# Preview the first 10 tokens.
words[:10]

In [None]:
# Frequency distribution over every token (no stop-word filtering yet)
frec_dist = FreqDist(words)

In [None]:
# Show the 10 most frequent tokens of the unfiltered distribution
frec_dist.most_common(10)

In [None]:
# Stop words
# Stop words are a set of very commonly used words in a language (any language,
# not just English). Removing them lets the analysis focus on the remaining,
# more informative words.

# NLTK's Spanish stop-word list, stored as a set for the membership test used when filtering.
stop_words = set(stopwords.words("spanish"))

In [None]:
# Display the Spanish stop-word set.
stop_words

In [None]:
# Removing stop words
# Tokens are lowercased only for the membership test; the original casing is
# kept in the resulting list.
filtered_words = [w for w in words if w.lower() not in stop_words]

In [None]:
# Frequency distribution over the tokens left after stop-word removal
frec_dist_filtered = FreqDist(filtered_words)

In [None]:
# Show the 10 most frequent tokens once stop words have been removed
frec_dist_filtered.most_common(10)

In [None]:
# Plotting a horizontal bar chart of the most frequent tokens
n = 10

# Query the distribution once and reuse the result for both labels and counts.
top_words = frec_dist_filtered.most_common(n)
labels = [word for word, _ in top_words]
counts = [count for _, count in top_words]

plt.figure(figsize=(15, 8))
plt.barh(labels, counts)
# Flip the y axis so the most frequent token ends up at the top of the chart.
plt.gca().invert_yaxis()

In [None]:
# Build a word cloud from the raw corpus text.
# The size of each word reflects how frequently it appears.
cloud_builder = WordCloud(
    background_color="white",
    max_words=100,
    max_font_size=50,
)
wordcloud = cloud_builder.generate(text)

# Draw the cloud as an image, without axis ticks or frame.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Build a word cloud from the tokens that survived stop-word removal.
# The size of each word reflects how frequently it appears.
filtered_corpus = " ".join(filtered_words)
cloud_builder = WordCloud(
    background_color="white",
    max_words=100,
    max_font_size=50,
)
wordcloud = cloud_builder.generate(filtered_corpus)

# Draw the cloud as an image, without axis ticks or frame.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Part of Speech (PoS) tagging
# Uses the Stanford tagger with its Spanish model: https://nlp.stanford.edu/software/
pos_tagger = StanfordPOSTagger(
    model_filename=model,
    path_to_jar=jar,
    encoding="utf8",
)
# Tag the stop-word-filtered tokens.
pos_tags = pos_tagger.tag(filtered_words)

In [None]:
# Order the tagged entries by their second element (the tag) so that entries
# sharing a tag are listed together, then display the result.
pos_tags = sorted(pos_tags, key=lambda pair: pair[1])
pos_tags

In [None]:
# Number of tagged tokens.
len(pos_tags)

In [None]:
# Show the last 100 entries of pos_tags.
pos_tags[-100:]

In [None]:
# Named Entity Recognition (NER)
# Run the spaCy pipeline over the whole corpus in a single call.
# NOTE(review): spaCy rejects texts longer than nlp.max_length -- confirm the
# corpus stays below that limit or process it in chunks.
document = nlp(text)
# Render the document with displaCy's entity ("ent") visualizer.
displacy.render(document, style = "ent")