# Obtaining all the articles and links

In [30]:
import requests
import pandas as pd
from datetime import date
from bs4 import BeautifulSoup

# Scraping configuration, keyed by newspaper id.
#   url   -> front page that is downloaded first.
#   attrs -> CSS class names used to locate elements in the downloaded HTML
#            (they are passed to BeautifulSoup as ``{'class': ...}`` filters).
WEBSITE = {
    "diario_es": {
        "url": "https://diarioelsalvador.com/",
        "attrs": {
            "class_sections": "jeg_menu",
            "class_articles": "jeg_posts",
            "class_article_title": "jeg_post_title",
            "class_subtitle_article": "jeg_post_subtitle",
            "class_date_article": "jeg_meta_date",
            "class_author_article": "jeg_meta_author",
            "class_body_article": "content-inner",
        },
    },
}

def main(newspaper):
    """Scrape every section of a configured newspaper and save the articles.

    Downloads the front page of the site, extracts its sections, collects
    the articles of each section and writes them to a CSV file. Failures
    (non-200 status, network errors) are printed instead of raised.

    Args:
        newspaper: the id of the newspaper to scrap; must be a key of
            ``WEBSITE``.

    Raises:
        KeyError: if ``newspaper`` is not configured in ``WEBSITE``.
    """
    try:
        # A timeout keeps the run from hanging forever on a stalled server.
        front_page = requests.get(WEBSITE[newspaper]['url'], timeout=30)

        if front_page.status_code != 200:
            raise ValueError(f'Error: Status {front_page.status_code}')

        soup_front_page = BeautifulSoup(front_page.text, 'lxml')
        sections = _get_sections(soup_front_page, newspaper)
        articles = _get_articles(sections, newspaper)
        _save_articles(articles, newspaper)
        print(f'The Scrapping of {newspaper} is finished.')

    except (ValueError, requests.RequestException) as error:
        # Connection problems and timeouts are reported the same way as a
        # bad HTTP status rather than ending in a traceback.
        print(error)
    
def _get_sections(soup_newspaper, newspaper):
    """Return the title and url of each section linked from the main menu.

    Args:
        soup_newspaper: BeautifulSoup object of the main page of the
            website.

        newspaper: the id of the newspaper to scrap.

    Returns:
        list_sections: list of ``[title, url]`` pairs, one per menu entry
            that carries a link. Empty when the menu is not found.
    """
    print('Getting the sections of the website')
    attrs = {'class': WEBSITE[newspaper]['attrs']['class_sections']}

    menu = soup_newspaper.find('ul', attrs=attrs)
    if menu is None:
        # The page has no menu with the configured class (layout change,
        # error page, ...): there is nothing to scrape.
        return []

    list_sections = []
    for menu_item in menu.find_all('li'):
        link = menu_item.find('a')
        if link is None:
            # Menu entries without an anchor cannot point to a section.
            continue

        url_section = link.get('href')
        if not url_section:
            # An anchor without a target cannot be downloaded later on.
            continue

        list_sections.append([link.get_text(), url_section])

    return list_sections


def _get_articles(sections, newspaper):
    """Collect the content of every article listed on each section page.

    A section that cannot be downloaded (network error or non-200 status)
    is reported with ``print`` and skipped; the remaining sections are
    still processed.

    Args:
        sections: list of the title and url of each section in the
            main page of the website.

        newspaper: the id of the newspaper to scrap.

    Returns:
        all_articles: list with one content row per scraped article, in
            the format produced by ``_get_content_article``.
    """
    print('Getting the articles of each section')
    attrs = {'class': WEBSITE[newspaper]['attrs']['class_articles']}

    all_articles = []
    for section in sections:
        section_title, section_url = section[0], section[1]
        try:
            # A timeout keeps one stalled section from blocking the run.
            section_page = requests.get(section_url, timeout=30)

            if section_page.status_code != 200:
                raise ValueError(
                    f'Error: Status {section_page.status_code} '
                    f'for section {section_title}'
                )

            soup_section_page = BeautifulSoup(section_page.text, 'lxml')
            list_articles = soup_section_page.find('div', attrs=attrs)
            if list_articles is not None:
                # Pass the newspaper id down explicitly so the helper does
                # not depend on a module-level variable being defined.
                all_articles += _get_list_articles(
                    list_articles, section_title, newspaper
                )

        except (ValueError, requests.RequestException) as error:
            print(error)

    return all_articles


def _get_list_articles(list_articles, section_title, newspaper='diario_es'):
    """Return the scraped content of every article found in a listing.

    Args:
        list_articles: container tag (or iterable of tags) holding one
            element per article.
        section_title: title of the section of the article.
        newspaper: the id of the newspaper to scrap; selects the CSS class
            of the article title in ``WEBSITE``.

    Returns:
        articles: list of content rows, one per article that could be
            located and downloaded.
    """
    print(f'\tGetting the list of articles of section {section_title}')
    attrs = {'class': WEBSITE[newspaper]['attrs']['class_article_title']}

    articles = []
    for article in list_articles:
        if isinstance(article, str):
            # Iterating a tag also yields its text nodes (whitespace
            # between elements); they are str subclasses and hold no link.
            continue

        heading = article.find('h3', attrs=attrs)
        link = heading.find('a') if heading is not None else None
        url_article = link.get('href') if link is not None else None
        if not url_article:
            # Entry without a titled link (ad, separator, ...): skip it.
            continue

        content_article = _get_content_article(url_article, section_title)
        if content_article is not None:
            # None means the article page could not be retrieved.
            articles.append(content_article)

    return articles

def _get_content_article(url_article, section_title):
    """Download one article page and extract its content.

    Args:
        url_article: The url of the article to scrap.
        section_title: Name of the section to add to the list

    Returns:
        content_article: List with, in order: title, subtitle, date,
            author, body, ``section_title`` and ``url_article``. Each of
            the first five entries is a one-element list holding the text
            of the element, or ``[[]]`` when the element is missing.
            ``None`` is returned when the page cannot be retrieved; the
            error is printed.
    """
    # (tag name, CSS class) of each field, in output order.
    fields = (
        ('h1', 'jeg_post_title'),
        ('h2', 'jeg_post_subtitle'),
        ('div', 'jeg_meta_date'),
        ('div', 'jeg_meta_author'),
        ('div', 'content-inner'),
    )

    try:
        # A timeout keeps a single stalled article from blocking the run.
        article = requests.get(url_article, timeout=30)

        if article.status_code != 200:
            raise ValueError(f'Error: Status {article.status_code}')

        soup_article = BeautifulSoup(article.text, 'lxml')

        content_article = []
        for tag_name, css_class in fields:
            element = soup_article.find(tag_name, attrs={'class': css_class})
            # Keep the established cell shape: [text] or [[]] when absent.
            content_article.append(
                [element.get_text() if element is not None else []]
            )

        content_article.append(section_title)
        content_article.append(url_article)
        return content_article

    except (ValueError, requests.RequestException) as error:
        # One unreachable article must not abort the whole scrape.
        print(error)
        return None

def _save_articles(articles, newspaper):
    """Write the scraped articles to a CSV file.

    The file is created in the current working directory and is named
    ``<newspaper>_<dd-mm-YYYY>.csv`` using today's date.

    Args:
        articles: List of the information scrapped from the website, one
            seven-element row per article. ``None`` entries (articles that
            could not be downloaded) are ignored.
        newspaper: name of the website scrapped.
    """
    print('Saving the information.')
    today = date.today().strftime('%d-%m-%Y')
    filename = f'{newspaper}_{today}.csv'

    columns_articles = [
        'title_article',
        'subtitle_article',
        'date_article',
        'author_article',
        'body_article',
        'section_title',
        'url_article',
    ]

    # A None row would make the DataFrame constructor fail and lose every
    # article that was scraped successfully, so drop those first.
    rows = [row for row in articles if row is not None]

    df_articles = pd.DataFrame(rows, columns=columns_articles)
    df_articles.to_csv(filename)

In [31]:
# Script entry point: scrape the single site configured in WEBSITE.
if __name__ == "__main__":
    # NOTE: this assignment binds `newspaper` as a module-level name.
    newspaper = 'diario_es'
    main(newspaper)

Getting the sections of the website
Getting the articles of each section
	Getting the list of articles of section DePaís
	Getting the list of articles of section DePalabra
	Getting the list of articles of section DeDinero
	Getting the list of articles of section DeComercio
	Getting the list of articles of section DePlaneta
	Getting the list of articles of section DeInnovación
	Getting the list of articles of section DeCultura
	Getting the list of articles of section DeVida
	Getting the list of articles of section DeDiversión
	Getting the list of articles of section DeDeportes
Saving the information.
The Scrapping of diario_esis finished.
