Created on Monday 04 January 2021  

**Group 2 - Recherche de nouvelles sources**  
**Scraping de site**

@authors : Maël Lesavourey

# Libraries

In [None]:
!pip install beautifulsoup4
!pip install requests

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import datetime

# Scraping with BeautifulSoup

In [None]:
# Example article URL (a post on grh-multi.net). Presumably meant to be passed
# to all_infos(); the call itself is not visible in this section.
url: str = 'https://grh-multi.net/fr/2016/05/compte-rendu-de-levenement-big-data-gpec/'

In [None]:
def get_article_info(soup) -> dict:
    """Documentation
    This function extracts the article information from an html page.
    Parameter:
        soup: A BeautifulSoup object containing the html page.
    Out:
        infos: dictionary containing article information:
            art_content: text of the <div id="content"> element
            art_content_html: that element itself (a bs4 Tag, not a string)
            art_published_datetime: date parsed from the og:updated_time meta
            art_lang: value of the lang attribute of the <html> element
            art_title: text of the <title> element
            art_url: href of the canonical <link>
            art_img: src of the first <img> inside the content element,
                NaN when the content holds no image
            art_auth, art_tag: always NaN (not extracted from the page)
    Raises:
        AttributeError, TypeError or KeyError when one of the other expected
        elements or attributes is missing from the page (find() returns None).
    """
    infos: dict = {}
    # Look the content element up once; it feeds three of the fields below.
    content = soup.find('div', {'id': 'content'})
    infos['art_content'] = content.get_text()
    infos['art_content_html'] = content
    # NOTE(review): the key says "published" but the value is read from
    # og:updated_time -- confirm this is the intended meta tag.
    infos['art_published_datetime'] = datetime.datetime.strptime(
        soup.find("meta", {'property': 'og:updated_time'})['content'],
        '%Y-%m-%dT%H:%M:%S%z').date()
    infos['art_lang'] = soup.html.attrs['lang']
    # Tag.name is the tag's own name (always 'title'); the page title is the
    # text inside the tag.
    infos['art_title'] = soup.title.get_text(strip=True)
    infos['art_url'] = soup.find('link', {'rel': 'canonical'})['href']
    # An article without any image is valid: store NaN instead of crashing.
    image = content.find('img')
    infos['art_img'] = image['src'] if image is not None else np.nan
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    infos['art_auth'] = np.nan
    infos['art_tag'] = np.nan
    return infos

def get_src_info(soup) -> dict:
    """Documentation
    This function collects the source (website) information from an html page.
    Parameters:
        soup: A BeautifulSoup object containing the html page.
    Out:
        dictionary with three keys:
            src_name: content of the og:site_name meta tag
            src_type: the constant string 'xpath_source'
            src_url: href of the first <a> element whose rel is "home"
    """
    # Site name as advertised by the Open Graph metadata.
    site_meta = soup.find('meta', attrs={'property': 'og:site_name'})
    site_name = site_meta['content']

    # The rel="home" anchor points at the site's landing page.
    home_link = soup.find('a', attrs={'rel': 'home'})
    home_url = home_link['href']

    return {
        'src_name': site_name,
        'src_type': 'xpath_source',
        'src_url': home_url,
    }

def all_infos(url: str, timeout: float = 10.0) -> dict:
    """Documentation
    This function downloads an article page and creates a BeautifulSoup
    object from it. Then it calls get_article_info and get_src_info to get
    article and source information.
    Parameters:
        url: String which is the url of the article.
        timeout: Seconds to wait for the server before giving up
            (default 10). Without it requests.get can block indefinitely.
    Out:
        info: dictionary containing source and article information.
    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or when the server answers with a 4xx/5xx status.
    """
    # Download the page; fail early on HTTP errors instead of parsing an
    # error page and crashing later on a missing element.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Create the Soup object containing the html page
    soup = BeautifulSoup(response.text, 'html.parser')
    # Build and return the info dict: article fields first, then source fields
    info = get_article_info(soup)
    info.update(get_src_info(soup))
    return info