# Crawler del sitio elperiodista.cl

In [8]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from sklearn.utils import shuffle

In [9]:
def get_soup(page, timeout=30):
    """Download one listing page of elperiodista.cl and parse its HTML.

    Args:
        page: Page number interpolated into the ``/page/{page}/`` URL.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, so a stalled connection cannot hang the crawl.

    Returns:
        BeautifulSoup: The parsed listing page.
    """
    url = f"https://www.elperiodista.cl/page/{page}/"
    response = requests.get(url, timeout=timeout)
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed, warns about it, and may parse differently per machine.
    return BeautifulSoup(response.text, "html.parser")

In [10]:
def get_claims_urls(s):
    """Collect the article links shown on a parsed listing page.

    Args:
        s: Parsed listing page; must offer ``find`` the way a BeautifulSoup
            object does.

    Returns:
        list: The ``href`` of every ``<a class="post-url post-title">`` found
        inside ``<div class="content-column">``, in document order. Anchors
        without an ``href`` are skipped, and an empty list is returned when
        the page has no content column.
    """
    content_column = s.find("div", {"class": "content-column"})
    if content_column is None:
        # Nothing to crawl on this page (e.g. an error page was served).
        return []

    anchors = content_column.find_all("a", {"class": "post-url post-title"})
    # Drop anchors lacking an href so callers never try to request ``None``.
    return [href for href in (a.get("href") for a in anchors) if href]
def get_reqs(s, timeout=30):
    """Download every article linked from a parsed listing page.

    Args:
        s: Parsed listing page, as accepted by ``get_claims_urls``.
        timeout: Seconds to wait on each article request before ``requests``
            gives up and raises, so one stalled article cannot hang the crawl.

    Returns:
        list: One ``requests`` response per article URL, in page order.
    """
    return [requests.get(url, timeout=timeout) for url in get_claims_urls(s)]

In [11]:
def crawl_req(req, verbose = False):
    """Extract the fields of one news article from its HTTP response.

    Args:
        req: Response of an article page; only ``req.text`` is read.
        verbose: When truthy, print every extracted field.

    Returns:
        dict: Keys ``headline``, ``lead``, ``news_type``, ``datetime`` and
        ``body``. Every value is stripped and has its line breaks removed.

    Raises:
        AttributeError: If one of the expected elements (or the ``datetime``
            attribute of the ``<time>`` tag) is missing from the page.
        IndexError: If the page has fewer than two breadcrumb items.
    """
    def _clean(text):
        # Trim the edges, then drop whatever line breaks remain inside.
        return text.strip().replace("\n", "")

    # Explicit parser keeps the parse identical regardless of which optional
    # parsers are installed alongside bs4.
    soup = BeautifulSoup(req.text, "html.parser")

    headline = _clean(soup.find("span", {"class": "post-title"}).text)
    lead = _clean(soup.find("h2", {"class": "post-subtitle"}).text)
    # The second breadcrumb entry is what gets stored as the news type.
    breadcrumbs = soup.find_all("li", {"class": "bf-breadcrumb-item"})
    news_type = _clean(breadcrumbs[1].find("a").text)
    # Named ``published`` locally to avoid shadowing the stdlib ``datetime`` name.
    published = _clean(soup.find("time", {"class": "post-published updated"}).get("datetime"))
    body_text = soup.find("div", {"class": "entry-content"}).text
    body = _clean(body_text.replace("Continuar leyendo", ""))

    if verbose:
        print("Headline:",headline)
        print("Lead:",lead)
        print("News type:", news_type)
        print("Datetime:",published)
        print("Body",body)

    return {
        "headline" : headline,
        "lead": lead,
        "news_type": news_type,
        "datetime": published,
        "body": body
    }
    

In [15]:
import time
def crawl_reclamos(start_page, end_page, filename = 'noticias.csv', step=1):
    """Crawl a range of listing pages and save every parsed article to CSV.

    The crawl is best effort: a listing page that cannot be fetched or
    scanned, or an article that cannot be parsed, is skipped instead of
    aborting the run. A one-line summary of the skips is printed at the end.

    Args:
        start_page: First listing page number to crawl.
        end_page: Last listing page number to crawl (inclusive).
        filename: Path of the CSV file the result is written to.
        step: Increment between consecutive page numbers.

    Returns:
        pandas.DataFrame: One row per successfully parsed article.
    """
    records = []
    skipped_pages = 0
    skipped_articles = 0

    for page in range(start_page, end_page + 1, step):
        try:
            soup = get_soup(page)
            # Short pause between the listing request and the article requests.
            time.sleep(0.01)
            reqs = get_reqs(soup)
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C still stops the crawl.
            skipped_pages += 1
            continue

        for req in reqs:
            try:
                records.append(crawl_req(req, False))
            except Exception:
                # Article layout did not match what crawl_req expects.
                skipped_articles += 1

    if skipped_pages or skipped_articles:
        print(f"Skipped {skipped_pages} page(s) and {skipped_articles} article(s) "
              "that could not be fetched or parsed.")

    df = pd.DataFrame(records)
    df.to_csv(filename)
    return df

In [17]:
%%time
# Crawl listing pages 2 through 100 (inclusive) using the default output file;
# %%time reports how long the whole cell takes.
df = crawl_reclamos(2,100)

CPU times: user 1min 14s, sys: 1.25 s, total: 1min 15s
Wall time: 17min 52s


# Threading scraper


In [14]:
import multiprocessing
import threading
# NOTE: cpu_count() reports the number of CPUs in the system; it is not a hard
# limit on how many threads can be started.
print("Number of maximum threads:", multiprocessing.cpu_count())

# Lock shared by worker threads; presumably meant to serialize appends to FILENAME.
MUTEX = threading.Lock()
# CSV file that append_crawling_req appends rows to.
FILENAME = "threading_noticias.csv"

def append_crawling_req(req):
    """Parse one article response and append it as a row to ``FILENAME``.

    Args:
        req: Response of an article page, as accepted by ``crawl_req``.

    The row is written without a header and without an index column, so the
    file holds only the article fields in the order ``crawl_req`` returns them.
    """
    record = crawl_req(req)
    # crawl_req returns a plain dict, which has no to_csv(); wrap it in a
    # one-row DataFrame first.
    row = pd.DataFrame([record])
    # Hold the lock while appending so rows written by different threads
    # cannot interleave inside the file.
    with MUTEX:
        row.to_csv(FILENAME, mode='a', header=False, index=False)

def crawl_with_threading(start_page, end_page, filename = 'noticias.csv', step=1):
    """Placeholder for a multi-threaded crawler; not implemented yet.

    The body is empty, so calling this function does nothing and returns
    ``None``; all four parameters are currently ignored.

    TODO: implement the threaded crawl.
    """
    pass

Number of maximum threads: 8
