In [None]:
# Instalo bibliotecas
!pip install bs4 --quiet

In [None]:
# Importo
import concurrent.futures
import gc
import pprint
import re
import sys
import time
import warnings
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# joblib used to be vendored under sklearn.externals, which recent scikit-learn
# releases no longer ship; prefer the standalone package, fall back on old installs.
try:
  import joblib
except ImportError:
  from sklearn.externals import joblib

# Pretty printer used to inspect scraped records (4-space indent, compact lists)
pp = pprint.PrettyPrinter(indent=4, compact=True)

# Free memory that may be left over in the kernel
gc.collect()

# Do not show warnings for the rest of the session
warnings.filterwarnings('ignore')

# Raise the interpreter recursion limit.
# NOTE(review): presumably needed to serialise deeply nested objects later on;
# confirm it is still required, since a very high limit can crash the kernel.
sys.setrecursionlimit(30000)

In [None]:
# Constants shared by the scraping cells

# HTTP headers sent with every request (desktop Chrome user agent)
request_headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/93.0.4577.82 Safari/537.36'
    ),
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
}

# Sections of the site to scrape
topics = ['economia', 'el-mundo', 'sociedad']

# Listing pages visited per section: every 5th page from 1 up to (not including) 750
pages = range(1, 750, 5)

In [None]:
# Module-level helpers used by retrieve_topic_news (defined below)

# Root URL of the site being scraped
base_page = 'https://www.pagina12.com.ar'

# Matches text made only of newline characters (the empty string included)
RegExp = re.compile(r'\n*')

# Seconds to wait for the server before giving up on a single request
_REQUEST_TIMEOUT = 30


def _fetch_soup(session, url):
  """Download `url` and parse it as HTML; return None when the request fails."""
  try:
    response = session.get(url, headers=request_headers, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
  except requests.RequestException as error:
    # Best effort: report and skip the failing page instead of aborting the topic
    print(f'Skipping {url}: {error}', file=sys.stderr)
    return None
  return BeautifulSoup(response.text, 'html.parser')


def _parse_article(session, article, topic):
  """Fetch the article linked from a listing teaser; return its record or None."""
  anchor = article.find('a', class_='p12-separator--left--primary')
  href = anchor.get('href') if anchor is not None else None
  if not href:
    return None

  # urljoin copes with root-relative ("/123-slug"), relative and absolute hrefs
  # without producing a double slash after the host.
  url = urljoin(f'{base_page}/', href)
  detail = _fetch_soup(session, url)
  if detail is None:
    return None

  body = detail.find('div', class_='article-main-content article-text ')
  info = detail.find('div', class_='article-info')
  if body is None or info is None:
    return None

  # get_text() also keeps paragraphs that contain inline tags (links, bold...),
  # for which `.string` is None.
  news = ''.join(paragraph.get_text() for paragraph in body.find_all('p'))
  if RegExp.fullmatch(news):
    # Empty or newline-only text: nothing worth storing
    return None

  date_span = info.find('span')
  date_text = date_span.string if date_span is not None else None
  # str() turns the NavigableString into a plain string, so the record does not
  # keep a reference to the whole parse tree of the article page.
  date = None if date_text is None else str(date_text)

  return {'url': url, 'paragraph': news, 'date': date, 'topic': topic}


def retrieve_topic_news(topic):
  """Scrape the articles of one section of the site.

  Walks the listing pages given by the module-level `pages` for
  `{base_page}/secciones/{topic}`, follows every teaser link and collects the
  article text. Pages or articles that cannot be downloaded are skipped (a
  message is written to stderr). Sleeps 3 seconds after each listing page.

  Args:
    topic: section slug used in the listing URL (e.g. one of `topics`).

  Returns:
    pandas.DataFrame with one row per article and the columns `url`,
    `paragraph`, `date` and `topic`; an empty DataFrame if nothing was found.
  """
  topic_news = []
  # One session per call: connections are reused across requests and the
  # session object is not shared with other callers.
  with requests.Session() as session:
    for page in pages:
      listing = _fetch_soup(session, f'{base_page}/secciones/{topic}?page={page}')
      if listing is not None:
        for article in listing.find_all('article', class_='article-item article-item--teaser '):
          record = _parse_article(session, article, topic)
          if record is not None:
            topic_news.append(record)
      # Pause between listing pages to avoid hammering the server
      time.sleep(3)
  return pd.DataFrame(topic_news)

In [None]:
# Scrape the topics concurrently; results come back in the order of `topics`
with concurrent.futures.ThreadPoolExecutor() as executor:
  # Materialise the results while the pool is still open
  topics_results = list(executor.map(retrieve_topic_news, topics))

# Merge everything with a single concat instead of growing a frame step by step
topics_news = pd.concat(topics_results) if topics_results else pd.DataFrame()

In [None]:
# Drop the per-topic results now that they have been merged into topics_news
del topics_results
# Ask the garbage collector to reclaim the released memory
gc.collect()

In [None]:
# Export the data gathered by the web scraping to 'news.sav' using joblib
joblib.dump(topics_news, 'news.sav')  