In [15]:
# Install the third-party packages used by this notebook and the small Spanish spaCy model.
!pip install wordcloud
!pip install spacy
!python -m spacy download es_core_news_sm

# Hide the verbose installer output and print a short summary instead.
from IPython.display import clear_output
clear_output()
print('WordCloud installed.')
print('Spacy es_core_news_model installed.\nRestart the runtime!')

WordCloud installed.
Spacy es_core_news_model installed.
Restart the runtime!


Go to Runtime → Restart runtime... before running the next cells, so that the freshly installed packages are loaded.

In [1]:
import time
import json
import requests


# Root of the Cubadebate REST API (wp-json/wp/v2) and the comments collection under it.
CUBADEBATE_API = "http://www.cubadebate.cu/wp-json/wp/v2/"
COMMENTS_ENDPOINT = "{}comments/".format(CUBADEBATE_API)

# A single HTTP session shared by every request issued from this notebook.
session = requests.Session()

def get_comments_json(page=1, timeout=30):
    """Fetch one page of comments from ``COMMENTS_ENDPOINT``.

    Sleeps one second before each request to throttle calls to the API.

    Args:
        page: Page number, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before the request is
            aborted with a ``requests`` timeout exception.

    Returns:
        The decoded JSON body when the response status is 200, otherwise None.
    """
    params = {"page": str(page)}

    # Be polite to the API: wait a second before every request.
    time.sleep(1)
    # Use the shared session directly. Wrapping the call in ``with session:``
    # closes the session (and its pooled connections) on exit, i.e. after
    # every single call and possibly while other worker threads still use it.
    response = session.get(COMMENTS_ENDPOINT,
                           params=params,
                           timeout=timeout)
    if response.status_code == 200:
        return response.json()
    # Any other status: signal "no data for this page" explicitly.
    return None
        
# Smoke test: fetch the default (first) page and show how many comments it holds.
comments_list = get_comments_json()
print(len(comments_list))
#comments_list

10


In [2]:
from concurrent.futures import ThreadPoolExecutor

# Number of comment pages to download.
NUM_PAGES = 100

# Download pages 1..NUM_PAGES concurrently; one entry per page is stored in comments_list.
with ThreadPoolExecutor() as executor:
    results = executor.map(get_comments_json, range(1, NUM_PAGES + 1))
    comments_list = [page_comments for page_comments in results]
    
# Collect the rendered HTML body of every comment on every downloaded page.
# get_comments_json() returns None for a page whose request did not answer 200,
# and None is not iterable, so failed pages are skipped before their comments
# are visited.
documents = [comment.get('content').get('rendered')
             for comments in comments_list if comments is not None
             for comment in comments if comment is not None]

print(len(documents))
#documents

1000


In [3]:
import re
import spacy

# Load the small Spanish spaCy pipeline; clean() uses it to tokenize and lemmatize the comments.
nlp = spacy.load('es_core_news_sm')

def get_text(markup):
    """Return the plain text of an HTML fragment.

    Tags are stripped, HTML character references (``&amp;``, ``&#8230;``,
    ``&nbsp;`` ...) are decoded, and every run of whitespace is collapsed
    into a single space.

    Args:
        markup: String containing HTML markup.

    Returns:
        The text without tags and without leading/trailing whitespace.
    """
    import html  # local import so the function does not depend on another cell

    text = re.sub(r'<.*?>', '', markup)
    # Decode entities only after the tags are gone, so an escaped "&lt;b&gt;"
    # stays literal text instead of being stripped as a tag. Left encoded,
    # references such as "&#8230;" would later be tokenized into junk terms.
    text = html.unescape(text)
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean(doc):
    """Normalize one HTML comment into a string of lowercase lemmas.

    The markup is reduced to plain text with ``get_text`` and tokenized with
    the module-level ``nlp`` pipeline. Stop words, punctuation, blank tokens
    and tokens of one or two characters are dropped.

    Args:
        doc: Comment body as an HTML string.

    Returns:
        The stripped, lowercased lemmas of the kept tokens, joined by single
        spaces.
    """

    def is_token_allowed(token):
        # ``Token.string`` was removed in spaCy v3; ``Token.text`` exists in
        # every version and is equivalent here because it is stripped anyway.
        if not token.text.strip():
            return False
        return not (token.is_stop or token.is_punct)

    def preprocess_token(token):
        return token.lemma_.strip().lower()

    text = get_text(doc)
    # len(word) is the token's length in characters.
    tokens = [preprocess_token(word) for word in nlp(text)
              if is_token_allowed(word) and len(word) > 2]
    return " ".join(tokens)
    

# clean() returns a space-joined string of terms; split() turns it into a list of terms per comment.
documents_normalized = [clean(doc).split() for doc in documents]
#documents_normalized
print('Documents normalized')


Documents normalized


In [4]:
import math
from collections import Counter


def comment_tf_dict(document):
    """Compute the relative frequency of every term in one tokenized comment.

    Args:
        document: Sequence of terms belonging to a single comment.

    Returns:
        Dict mapping each distinct term to ``count / len(document)``, with
        keys inserted from the most to the least frequent term. An empty
        comment gives an empty dict.
    """
    return {
        term: occurrences / len(document)
        for term, occurrences in Counter(document).most_common()
    }

def comments_tf_dict(documents):
    """Build the term-frequency dict of every comment.

    Args:
        documents: Iterable of tokenized comments.

    Returns:
        List holding one ``comment_tf_dict`` result per comment, in input order.
    """
    tf_dicts = []
    for tokens in documents:
        tf_dicts.append(comment_tf_dict(tokens))
    return tf_dicts

def count_dict(documents):
    """Count in how many comments each term appears (document frequency).

    Args:
        documents: Iterable of tokenized comments.

    Returns:
        Plain dict mapping each term to the number of comments containing it;
        repeated occurrences inside one comment count once.
    """
    doc_freq = Counter()
    for tokens in documents:
        # A set keeps each term once per comment.
        doc_freq.update(set(tokens))
    return dict(doc_freq)

def idf_dict(documents):
    """Compute the inverse document frequency of every term in the corpus.

    Args:
        documents: Sized collection of tokenized comments.

    Returns:
        Dict mapping each term to ``log(N / df)``, where ``N`` is
        ``len(documents)`` and ``df`` is the term's count from ``count_dict``.
    """
    doc_freq = count_dict(documents)
    return {term: math.log(len(documents) / df)
            for term, df in doc_freq.items()}

def comments_tfidf_dict(documents):
    """Compute the TF-IDF weight of every term in every comment.

    Args:
        documents: Sized collection of tokenized comments.

    Returns:
        List with one dict per comment, in input order, mapping each distinct
        term of that comment to its term frequency multiplied by the
        corpus-wide inverse document frequency.
    """
    corpus_idf = idf_dict(documents)

    def weigh(tokens):
        # Term frequency is local to the comment; IDF is shared by the corpus.
        term_freq = comment_tf_dict(tokens)
        return {term: term_freq[term] * corpus_idf[term]
                for term in set(tokens)}

    return [weigh(tokens) for tokens in documents]


# Compute the TF-IDF weights of the normalized comments (one dict per comment).
tfidf_list = comments_tfidf_dict(documents_normalized)
#tfidf_list
print('TF-IDF comments list')

TF-IDF comments list


In [5]:
from collections import OrderedDict

# Add up each term's TF-IDF weight over all comments.
ordered_tfidf = dict()

for tfidf in tfidf_list:
    for word, value in tfidf.items():
        ordered_tfidf[word] = ordered_tfidf.get(word, 0) + value

# Rank the terms from the highest to the lowest accumulated weight.
ordered_tfidf = OrderedDict(
    sorted(ordered_tfidf.items(), key=lambda item: item[1], reverse=True))

# Print the ranked vocabulary on a single line.
for word in ordered_tfidf:
    print(word, end=' ')

cuba gracias caso dios cubano poblar país personar casar totalmente salud gracia ley medir salir información ingresar favor médico positivo entender deber mundo poner doctor tomar seguir confirmar bendecir covid-19 paciente unir contagiar china cuidar contacto fidel pandemia durán morir gobernar pasar valiente año ayudar ver municipio excelente merecer pesar respetar virus orgulloso historia necesitar recuperar estar esperar informar viva situación fuerza provincia cuidense joven felicidades vivir solidaridad tener vida donación jajaja aplauso momento paso cumplir pensar probar gente alguien apoyar yusi hombre poder leer tiempo evacuar nasobuco casa familia alto precioso personal enfermedad negativo población callar hacer grande pueblo falto hablar caer necesario habana presidente humanidad comentario querer cola cuarentena formar niño rápido isla duro sancionar jugar dudar gestar llegar dar preguntar tarea riesgo contactar aplicar prueba publicar síntoma labor saludos volver edad foto

In [6]:
from IPython.display import clear_output

# File name for the mask image, exported as an environment variable so that
# both the shell command below and os.getenv() in the next cell can read it.
%env IMG_CAPITOLIO=capitolio.jpg


# download mask images
#!wget http://media.cubadebate.cu/wp-content/gallery/la-habana-nocturna/app_la-habana_05.jpg -O la_hababa.jpg
# Fetch the Capitolio picture from Wikimedia Commons and save it as $IMG_CAPITOLIO.
!wget https://upload.wikimedia.org/wikipedia/commons/8/8f/Capitolio_full.jpg -O $IMG_CAPITOLIO

# Replace the wget log with a short confirmation message.
clear_output()
print('Image downloaded.')

Image downloaded.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os
from PIL import Image
from wordcloud import WordCloud

# Capitolio Habana
# Mask image file name: read from the IMG_CAPITOLIO environment variable,
# falling back to "capitolio.jpg" when it is unset or empty.
IMG_CAPITOLIO = os.getenv("IMG_CAPITOLIO") or "capitolio.jpg"
# Output file for the rendered word cloud.
IMG_WORDCLOUD = 'wordcloud_cubadebate.png'
# Spacy Stop_Words
# NOTE(review): `spacy` is not imported in this cell; this relies on the import made by an earlier cell.
STOP_WORDS = spacy.lang.es.stop_words.STOP_WORDS

# get data directory (using getcwd() is needed to support running example in generated IPython notebook)
_dir = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()


# read the mask image
# (the picture fetched from Wikimedia Commons by the download cell) into a numpy array
_mask = np.array(Image.open(os.path.join(_dir, IMG_CAPITOLIO)))

# Generate Word Cloud
# The cloud is built from the accumulated TF-IDF weights in `ordered_tfidf`, not from raw text.
# NOTE(review): the wordcloud documentation appears to state that `stopwords` is
# ignored by generate_from_frequencies() and that `width`/`height` are ignored
# when a `mask` is given — confirm, and drop those arguments if so.
wordcloud = WordCloud(
    max_words=1000,
#     max_font_size=50,
    height=1440,
    width=2160,
    background_color = 'white',
    mask=_mask,
    contour_width=1,
    contour_color='steelblue',
    stopwords = STOP_WORDS).generate_from_frequencies(ordered_tfidf)
# Save to file
wordcloud.to_file(IMG_WORDCLOUD)
# Show the cloud inline on a large, axis-free figure.
fig = plt.figure(
    figsize = (22, 15),
    )
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [8]:
import base64

# Read the rendered word cloud and encode it as base64 text.
with open(IMG_WORDCLOUD, 'rb') as file_img:
    data = base64.b64encode(file_img.read()).decode('utf-8')

# HTML fragment embedding the picture as a data URI.
img_str = (
    '\n'
    '<image width="100%" height="100%" \n'
    'src="data:image/png;base64,{}" />\n'
).format(data)

# Write the fragment to a standalone file.
with open('wordcloud_cubadebate.html', 'w') as _html:
    _html.write(img_str)

print('Exported image to html.')

Exported image to html.


In [9]:
# List the working directory to confirm that the image and HTML files were written.
!ls

capitolio.jpg  sample_data  wordcloud_cubadebate.html  wordcloud_cubadebate.png
