In [None]:
# Instalo bibliotecas
!pip install bs4 --quiet

In [None]:
# Importo
import pandas as pd
import pickle 
import re 
import requests
import time
import sys
import warnings
import pprint
import concurrent.futures
import snowballstemmer

from google.colab import files
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

# Pretty-printer for readable output: 4-space indentation, compact sequences.
# (A first PrettyPrinter that was immediately overwritten has been removed.)
pp = pprint.PrettyPrinter(indent=4, compact=True)

# Do not show warnings for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
# Constant initialisation

# Sections (topics) to scrape, and the listing pages to walk for each one (1 to 999).
topics = [
    'economia',
    'el-mundo',
    'sociedad',
]
pages = range(1, 1000)

# HTTP headers sent with every request.
request_headers = {}
request_headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
request_headers['accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'

In [None]:
# Scraper that returns the news published under a given topic (site section).

# Matches strings made up solely of newline characters (the empty string included).
RegExp = re.compile('\\n*')


def _fetch_soup(url, timeout=30):
  """
    Download `url` and parse the response body as HTML.

    A timeout is always set so a stalled connection cannot block the worker
    forever. Network-level failures are reported on stderr and turned into
    None, so that one bad request does not discard everything scraped so far.

    Parameters
    ----------
    url: str
      address to download
    timeout: float
      seconds to wait for the server before giving up

    Returns
    -------
    BeautifulSoup or None
      parsed document, or None when the request could not be completed
  """
  try:
    response = requests.get(url, headers=request_headers, timeout=timeout)
  except requests.RequestException as error:
    print(f'Request failed, skipping {url}: {error}', file=sys.stderr)
    return None
  return BeautifulSoup(response.text, 'html.parser')


def _parse_article(soup_detail):
  """
    Extract the body text and the date from a parsed article page.

    Returns
    -------
    tuple or None
      (text, date) as plain Python strings (date may be None), or None when
      the page lacks the expected containers or has no usable text
  """
  paragraphs_div = soup_detail.find('div', class_='article-main-content article-text ')
  if paragraphs_div is None:
    return None
  info_div = soup_detail.find('div', class_='article-info')
  if info_div is None:
    return None

  # `.string` yields a NavigableString that keeps a reference to the whole
  # parse tree; converting to `str` lets the tree be freed and keeps the
  # resulting DataFrame cheap to pickle.
  date_span = info_div.find('span')
  date = None
  if date_span is not None and date_span.string is not None:
    date = str(date_span.string)

  # NOTE(review): `.string` is None for a <p> with more than one child node
  # (e.g. text plus a link), so such paragraphs are skipped — confirm intended.
  news = ''.join(
    str(paragraph.string)
    for paragraph in paragraphs_div.find_all('p')
    if paragraph.string is not None
  )
  if news == '' or RegExp.fullmatch(news):
    return None
  return news, date


def retrieve_topic_news(topic):
  """
    Scrape every article listed under one topic.

    Walks every listing page in the module-level `pages` range, follows each
    teaser link and collects the article text and date. Sleeps 3 seconds after
    each listing page.

    Parameters
    ----------
    topic: str
      section slug used in the listing URL

    Returns
    -------
    pd.DataFrame
      one row per article with columns url, paragraph, date and topic
  """
  from urllib.parse import urljoin  # local import keeps this cell self-contained

  base_url = 'https://www.pagina12.com.ar/'
  topic_news = []
  for page in pages:
    soup = _fetch_soup(f'https://www.pagina12.com.ar/secciones/{topic}?page={page}')
    if soup is None:
      articles = []
    else:
      articles = soup.find_all('article', class_='article-item article-item--teaser ')
    for article in articles:
      anchor = article.find('a', class_='p12-separator--left--primary')
      if anchor is None or not anchor.get('href'):
        continue
      # urljoin copes with relative, root-relative and absolute hrefs alike.
      article_url = urljoin(base_url, anchor['href'])
      soup_detail = _fetch_soup(article_url)
      if soup_detail is None:
        continue
      parsed = _parse_article(soup_detail)
      if parsed is None:
        continue
      news, date = parsed
      topic_news.append({'url': article_url, 'paragraph': news, 'date': date, 'topic': topic})
    time.sleep(3)
  return pd.DataFrame(topic_news)

In [None]:
# Scrape all topics concurrently, one call to retrieve_topic_news per topic.
with concurrent.futures.ThreadPoolExecutor() as executor:
  # executor.map yields the per-topic DataFrames in the same order as `topics`.
  topics_results = list(executor.map(retrieve_topic_news, topics))

# Concatenate once instead of growing a frame inside a loop, and renumber the
# rows so index labels are unique across topics.
topics_news = pd.concat(topics_results, ignore_index=True)

In [None]:
# Raise the interpreter's recursion limit before pickling.
# NOTE(review): presumably needed because the scraped frame holds BeautifulSoup
# string objects that reference their parse tree — confirm.
sys.setrecursionlimit(10000)

# Serialize the scraped DataFrame to TP1.pickle with the highest pickle protocol.
with open('TP1.pickle', 'wb') as handle:
    pickle.dump(topics_news, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Download the pickle file from the runtime through google.colab's `files` helper.
files.download('TP1.pickle')

In [None]:
# Sanity check: reload the pickle file and print its contents.
# NOTE: pickle.load can execute arbitrary code; only load files created by this notebook.
with open('TP1.pickle', 'rb') as handle:
    print(pickle.load(handle))

In [None]:
# Stop words: read the two stop-word files (no header row) and stack them into
# a single DataFrame.
# NOTE(review): file names suggest Spanish lists with and without accents — confirm.
stopwords_es = pd.read_csv('/content/stopwords_es.txt', header = None)
stopwords_es_sin_acentos = pd.read_csv('/content/stopwords_es_sin_acentos.txt', header = None)

# `stopwords` is a DataFrame, not a plain collection of strings.
stopwords = pd.concat([stopwords_es, stopwords_es_sin_acentos])

In [None]:
def remove_stop_words(text):
  """
    Remove stop words from a list of tokens (case-insensitive).

    The module-level `stopwords` object is a pandas DataFrame, and the `in`
    operator on a DataFrame tests its column labels rather than its cell
    values. The stop words are therefore flattened into a set of lower-cased
    strings before filtering.

    Parameters
    ----------
    text: list
      list of words (tokens) to filter

    Returns
    -------
    list
      the tokens that are not stop words, in their original order
  """
  if hasattr(stopwords, 'to_numpy'):
    # DataFrame / Series: take every cell value.
    raw_words = stopwords.to_numpy().ravel()
  else:
    # Already a plain collection of words (set, list, ...).
    raw_words = stopwords
  # Non-string cells (e.g. NaN from blank lines) are ignored.
  stop_set = {word.strip().lower() for word in raw_words if isinstance(word, str)}
  return [token for token in text if token.lower() not in stop_set]

In [None]:
def tokenize(text):
  """
  Build a tokenizer function from a regular expression.

  :param text: Regular expression that defines what a token is; None selects
               the built-in pattern (a letter followed by one or more letters
               or digits).
  :return: A function that receives a document and returns the list of all
           matches of the pattern in it.
  """
  pattern_source = text
  if pattern_source is None:
    pattern_source = r"[a-zA-ZâáàãõáêéíóôõúüÁÉÍÓÚñÑçÇ][0-9a-zA-ZâáàãõáêéíóôõúüÁÉÍÓÚñÑçÇ]+"
  compiled = re.compile(pattern_source)

  def find_tokens(doc):
    return compiled.findall(doc)

  return find_tokens

In [None]:
# Module-level Snowball stemmer for Spanish, used by stem_words.
stemmer = snowballstemmer.stemmer("spanish")

def stem_words(tokens):
    """
    Apply the module-level stemmer to every token.
    :param tokens: A sequence of tokens.
    :return: A list with the stemmed form of each token, in the same order.
    """
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

In [None]:
def clean_short_words(text):
  """
    Drop every item whose length is 1 or less.

    Parameters
    ----------
    text: iterable
      items (tokens) to filter; each one must support len()

    Returns
    -------
    list
      the items longer than one character, in their original order
  """
  kept = []
  for word in text:
    if len(word) <= 1:
      continue
    kept.append(word)
  return kept

In [None]:
def preprocess_text(text):
  """
    Pre-processing pipeline for one document.

    Tokenizes the document, removes stop words, drops one-character tokens and
    stems what is left.

    Parameters
    ----------
    text: str
      document to analyse

    Returns
    -------
    list
      stemmed tokens of the document
  """
  # tokenize() is a factory: it takes a token pattern (None selects its default
  # pattern) and returns the function that actually splits a document.
  tokenizer = tokenize(None)
  tokenized = tokenizer(text)
  without_stops = remove_stop_words(tokenized)
  without_short_words = clean_short_words(without_stops)
  stemmed_words = stem_words(without_short_words)
  return stemmed_words

In [None]:
# Minimum number of documents a token must appear in to be kept (int: absolute count).
MIN_DF = 3
# Maximum share of documents a token may appear in to be kept (float: proportion).
MAX_DF = 0.8
# Smallest and largest number of consecutive tokens combined into one n-gram.
MIN_NGRAMS = 1
MAX_NGRAMS = 2

# Configure the count vectorizer with the custom pre-processing pipeline.
vectorizer = CountVectorizer(
    tokenizer=preprocess_text,
    lowercase=True,
    strip_accents='unicode',
    decode_error='ignore',
    ngram_range=(MIN_NGRAMS, MAX_NGRAMS),
    min_df=MIN_DF,
    max_df=MAX_DF,
)

# Fitting is still pending; the placeholder call is kept for reference:
#vectorizer.fit_transform(DATASETFALOPA)