In [47]:
import scrapy
import requests
from bs4 import BeautifulSoup
import datetime
from textblob import TextBlob
import pandas as pd
import json
import numpy as np

In [2]:
# [art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth]

In [45]:
class BigScraper:
    cols = ['art_content', 'art_content_html', 'art_extract_datetime', 'art_lang', 'art_title', 'art_url', 'src_name', 'src_type', 'src_url', 'src_img', 'art_auth']
    
    def __init__(self):
        self.df = pd.DataFrame(columns = BigScraper.cols)
        
    def add_row(self, row_scrap):
        if type(row_scrap) == list:
            self.df.loc[len(self.df)] = row_scrap
        elif type(row_scrap) == dict:
            self.df = self.df.append(row_scrap, ignore_index = True)
        
    def scrap_changethework(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_html = soup.find('div', {'style': 'text-align: justify;'})
        art_content = art_html.get_text().replace('\n', ' ').strip()
        if soup.find('meta', {'property': 'article:modified_time'})['content'] == None:
            if soup.find('meta', {'property': 'article:modified_time'})['content'] == None:
                art_extract_datetime = datetime.datetime.today()
            else:
                art_extract_datetime = soup.find('meta', {'property': 'article:published_time'})['content']
        else:
            art_extract_datetime = soup.find('meta', {'property': 'article:modified_time'})['content']
        art_lang = TextBlob(art_content).detect_language()
        art_title = soup.find('meta', {'property': 'og:title'})['content']
        art_url = soup.find('meta', {'property': 'og:url'})['content']
        src_name = soup.find('meta', {'property': 'og:site_name'})['content']
        src_type = 'xpath_source'
        src_url = 'https://changethework.com/'
        src_img = soup.find('meta', {'property': 'og:image'})['content']
        art_auth = [el.get_text().strip() for el in soup.find_all(
            'span', class_='elementor-post-author')]
        art_tag = np.nan
        return [art_content, art_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    def scrap_fncrr(url):
        '''Documentation
        Parameters:
            url: url of the scraped page
        Out:
            row: dict of values
        '''
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        
        # content, content_html
        try:
            content = html_soup.find("div", {'class': "contenu_c"})
            content_html = content
            content = content.text
        except:
            content_html = np.nan
            content = np.nan
        
        # date
        if html_soup.find("time", {'class': "updated"}) != None:
            date = html_soup.find("time", {'class': "updated"})
        else:
            date = html_soup.find("time", {'class': 'entry-date published'})
        try:
            date = date['datetime']
        except:
            # if no date is specified, put scraping date
            date = datetime.datetime.today()
        
        # tag, title
        presentation = html_soup.find("div", {'class': "prensentation"})
        tag = np.nan  # tags are not always interesting
        title = presentation.find("h1")
        title = title.text
        # Remplissage du dataframe
        row = {'art_content': content,
                   'art_content_html': content_html,
                   'art_extract_datetime': date,
                   'art_lang': 'fr',
                   'art_title': title,
                   'art_url': url,
                   'src_name': 'fnccr',
                   'src_type': 'xpath_source',
                   'src_url': 'https://www.fnccr.asso.fr/',
                   'src_img': np.nan,  # No images
                   'art_auth': np.nan,  # No author specified
                   'art_tag': tag}
        return row
    
    def scrap_sabbar(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Récupération du contenu de la page web (les paragraphes, avec et sans les balises html)
        art_content_html = soup.find('div', class_="entry-content")
        art_content = art_content_html.get_text().replace("\n", " ").replace("\xa0", "").strip()

        # Extraction de la date de l'article
        art_extract_datetime = json.loads(soup.find('script', class_ = 'yoast-schema-graph yoast-schema-graph--main').get_text())['@graph'][1]['dateModified']

        # Langue de l'article
        art_lang = soup.find('meta', property = "og:locale").get('content')

        # Titre
        art_title = soup.find('meta', property = "og:title").get('content')

        # Url
        art_url = soup.find('link', rel = 'canonical').get('href')

        # Nom de la source
        src_name = "Sabbar"

        # Type de la source
        src_type = "xpath_source"

        # url source
        src_url = soup.find('meta', property = 'og:url').get('content')

        # Image(s)
        src_img = np.nan

        # Auteur de l'article
        art_auth = np.nan

        # Tag de l'auteur
        art_tag = np.nan

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    def assign_scraper(url):
        if 'https://changethework.com/' in url:
            return BigScraper.scrap_changethework(url)
        elif 'https://www.fnccr.asso.fr/article/' in url:
            return BigScraper.scrap_fncrr(url)
        elif 'http://sabbar.fr/' in url:
            return BigScraper.scrap_sabbar(url)
        return None

    def scrap(self, url):
        row = BigScraper.assign_scraper(url)
        self.add_row(row)
        return row

In [42]:
response = requests.get('http://sabbar.fr/management/le-management-strategique-et-le-management-operationnel/#:~:text=Le%20management%20op%C3%A9rationnel%20correspond%20aux,pour%20atteindre%20les%20objectifs%20fix%C3%A9s.')
soup = BeautifulSoup(response.text, 'html.parser')

'2014-02-02T09:38:43+00:00'

In [10]:
BG = BigScraper()
row = BG.scrap('https://changethework.com/chatbot-rh-recrutement/')

In [11]:
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]"


In [12]:
row = BG.scrap("https://www.fnccr.asso.fr/article/big-data-territorial-publication-de-letude-de-la-fnccr/")

In [13]:
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",
1,Si les regards se tournent souvent vers les gr...,[[Si les regards se tournent souvent vers les ...,2017-02-20T14:40:36+01:00,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data


In [48]:
row = BG.scrap('http://sabbar.fr/management/le-management-strategique-et-le-management-operationnel/#:~:text=Le%20management%20op%C3%A9rationnel%20correspond%20aux,pour%20atteindre%20les%20objectifs%20fix%C3%A9s.')

In [49]:
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",
1,Si les regards se tournent souvent vers les gr...,[[Si les regards se tournent souvent vers les ...,2017-02-20T14:40:36+01:00,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,no_data,no_data,no_data
2,Qu'est-ce que le management ? Le management ...,"[\n, [\n, [ ], \n], \n, [\n, [<span style=""fon...",2014-02-02T09:38:43+00:00,fr_FR,Le management stratégique et le management opé...,http://sabbar.fr/management/le-management-stra...,Sabbar,xpath_source,http://sabbar.fr/management/le-management-stra...,,,
