In [1]:
import scrapy
import requests
from bs4 import BeautifulSoup
import datetime
from textblob import TextBlob
import pandas as pd
import json
import numpy as np

In [2]:
# Target columns for every scraped article, in order:
# [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

In [16]:
class BigScraper:
    """Scrape articles from a fixed set of sites into a single DataFrame.

    Each ``scrap_<site>`` static method downloads one article page and returns
    its fields in the order given by ``cols`` (as a list, or as a dict keyed by
    column name for ``scrap_fncrr``). ``scrap`` selects the scraper matching a
    URL and stores the resulting row in ``self.df``.

    The site scrapers index directly into the tags they look up, so a page
    that lacks an expected element raises ``TypeError``/``AttributeError``
    rather than producing a partially filled row.
    """

    # Column order shared by every scraper's return value and by ``self.df``.
    cols = ['art_content', 'art_content_html', 'art_extract_datetime', 'art_lang', 'art_title', 'art_url', 'src_name', 'src_type', 'src_url', 'src_img', 'art_auth', 'art_tag']

    # Seconds to wait for a site to answer, so an unresponsive server cannot
    # block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Start with an empty DataFrame that has one column per ``cols`` entry."""
        self.df = pd.DataFrame(columns = BigScraper.cols)

    def add_row(self, row_scrap):
        """Append one scraped row to ``self.df``.

        Parameters:
            row_scrap: either a list of values ordered like ``self.df.columns``,
                or a dict keyed by column name (missing keys are stored as NaN,
                keys that are not columns are ignored). ``None`` and any other
                type are ignored, so an unsupported URL adds nothing.
        """
        if isinstance(row_scrap, dict):
            # DataFrame.append was removed in pandas 2.0; order the dict like
            # the columns and reuse the list path instead.
            row_scrap = [row_scrap.get(col, np.nan) for col in self.df.columns]
        if isinstance(row_scrap, list):
            self.df.loc[len(self.df)] = row_scrap

    @staticmethod
    def _fetch_soup(url):
        """Download ``url`` and return its HTML parsed with 'html.parser'."""
        response = requests.get(url, timeout=BigScraper.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _article_datetime(soup):
        """Return the article date: modified time, else published time, else now.

        The first two come from the ``article:modified_time`` and
        ``article:published_time`` meta tags (returned as their ``content``
        strings); the fallback is ``datetime.datetime.today()``.
        """
        for prop in ('article:modified_time', 'article:published_time'):
            meta = soup.find('meta', {'property': prop})
            if meta is not None and meta.get('content'):
                return meta['content']
        return datetime.datetime.today()

    @staticmethod
    def scrap_changethework(url):
        """Scrape one changethework.com article.

        Parameters:
            url: url of the scraped page
        Out:
            list of values ordered like ``BigScraper.cols``
        """
        soup = BigScraper._fetch_soup(url)
        art_html = soup.find('div', {'style': 'text-align: justify;'})
        art_content = art_html.get_text().strip()
        art_extract_datetime = BigScraper._article_datetime(soup)
        # NOTE(review): detect_language() is provided by TextBlob and is
        # presumably an online lookup -- confirm the installed version has it.
        art_lang = TextBlob(art_content).detect_language()
        art_title = soup.find('meta', {'property': 'og:title'})['content']
        art_url = soup.find('meta', {'property': 'og:url'})['content']
        src_name = soup.find('meta', {'property': 'og:site_name'})['content']
        src_type = 'xpath_source'
        src_url = 'https://changethework.com/'
        src_img = soup.find('meta', {'property': 'og:image'})['content']
        art_auth = [el.get_text().strip() for el in soup.find_all(
            'span', class_='elementor-post-author')]
        art_tag = np.nan
        return [art_content, art_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

    @staticmethod
    def scrap_fncrr(url):
        '''Scrape one fnccr.asso.fr article.

        Parameters:
            url: url of the scraped page
        Out:
            row: dict of values keyed by the names in ``BigScraper.cols``
        '''
        html_soup = BigScraper._fetch_soup(url)
        # content, content_html: both NaN when the content div is absent
        content_html = html_soup.find("div", {'class': "contenu_c"})
        if content_html is None:
            content_html = np.nan
            content = np.nan
        else:
            content = content_html.text
        # date: prefer the "updated" time tag, then the published one
        date_tag = html_soup.find("time", {'class': "updated"})
        if date_tag is None:
            date_tag = html_soup.find("time", {'class': 'entry-date published'})
        try:
            date = date_tag['datetime']
        except (TypeError, KeyError):
            # no time tag (None) or no datetime attribute: use the scraping date
            date = datetime.datetime.today()
        # tag, title
        # NOTE(review): the class name "prensentation" is kept exactly as
        # written; verify against the site's markup before correcting it.
        presentation = html_soup.find("div", {'class': "prensentation"})
        tag = np.nan  # tags are not always interesting
        title = presentation.find("h1").text
        # Build the row
        row = {'art_content': content,
               'art_content_html': content_html,
               'art_extract_datetime': date,
               'art_lang': 'fr',
               'art_title': title,
               'art_url': url,
               'src_name': 'fnccr',
               'src_type': 'xpath_source',
               'src_url': 'https://www.fnccr.asso.fr/',
               'src_img': np.nan,  # No images
               'art_auth': np.nan,  # No author specified
               'art_tag': tag}
        return row

    @staticmethod
    def scrap_sabbar(url):
        """Scrape one sabbar.fr article.

        Parameters:
            url: url of the scraped page
        Out:
            list of values ordered like ``BigScraper.cols``
        """
        soup = BigScraper._fetch_soup(url)
        # Page content (the paragraphs, with and without the html tags)
        art_content_html = soup.find('div', class_="entry-content")
        art_content = art_content_html.get_text().replace("\xa0", "").strip()
        # Article date, read from the embedded JSON schema graph
        art_extract_datetime = json.loads(soup.find('script', class_ = 'yoast-schema-graph yoast-schema-graph--main').get_text())['@graph'][1]['dateModified']
        # Article language (the og:locale value)
        art_lang = soup.find('meta', property = "og:locale").get('content')
        # Title
        art_title = soup.find('meta', property = "og:title").get('content')
        # Url
        art_url = soup.find('link', rel = 'canonical').get('href')
        # Source name
        src_name = "Sabbar"
        # Source type
        src_type = "xpath_source"
        # Source url
        # NOTE(review): this is the page's og:url, whereas the other scrapers
        # store the site root here -- confirm which one is wanted.
        src_url = soup.find('meta', property = 'og:url').get('content')
        # Image(s)
        src_img = np.nan
        # Article author
        art_auth = np.nan
        # Article tags
        art_tag = np.nan

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

    @staticmethod
    def scrap_theinnovation(url):
        """Scrape one theinnovation.eu article.

        Parameters:
            url: url of the scraped page
        Out:
            list of values ordered like ``BigScraper.cols``; ``art_tag`` is the
            list of keywords from the ``keywords`` meta tag (empty if none)
        """
        soup = BigScraper._fetch_soup(url)
        art_content_html = soup.find("div", {"class": "entry-content"})
        art_content = art_content_html.text.replace('\xa0', '')
        art_extract_datetime = BigScraper._article_datetime(soup)
        art_lang = soup.find("meta", {"property": "og:locale"})["content"]
        art_title = soup.find("meta", {"property": "og:title"})["content"]
        art_url = soup.find("meta", {"property": "og:url"})["content"]
        src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        src_type = "xpath_source"  # default value
        src_url = soup.find("a", {"rel": "home"})["href"]
        src_img = soup.find("meta", {"property": "og:image"})["content"]
        art_auth = soup.find("a", {"rel": "author"}).text
        # A <meta> tag has no text; its value lives in the content attribute.
        keywords_meta = soup.find("meta", {"name": "keywords"})
        keywords = keywords_meta.get("content", "") if keywords_meta is not None else ""
        art_tag = [kw.strip() for kw in keywords.split(',') if kw.strip()]
        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url,
                src_name, src_type, src_url, src_img, art_auth, art_tag]

    @staticmethod
    def scrap_lemondeinformatique(url):
        """Scrape one lemondeinformatique.fr article.

        Parameters:
            url: url of the scraped page
        Out:
            list of values ordered like ``BigScraper.cols``
        """
        soup = BigScraper._fetch_soup(url)
        art_content_html = soup.find('div', class_ = 'article-body')
        art_content = art_content_html.get_text().replace('\xa0', '').strip()
        date_meta = soup.find("meta", {"itemprop":"datePublished"})
        if date_meta is not None:
            art_extract_datetime = date_meta['content']
        else:
            art_extract_datetime = datetime.datetime.today()
        art_title = soup.find("meta", {"property": "og:title"})["content"]
        # Detect the language from the title. The result must be bound to
        # art_lang, the name used in the returned row.
        # NOTE(review): detect_language() is provided by TextBlob and is
        # presumably an online lookup -- confirm the installed version has it.
        art_lang = TextBlob(art_title).detect_language()
        art_url = soup.find("meta", {"property": "og:url"})["content"]
        src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        src_type = 'xpath_source'
        src_url = 'https://www.lemondeinformatique.fr/'
        src_img = soup.find("meta", {"property": "og:image"})["content"]
        art_auth = soup.find("div", class_ = "author-infos").find("b", {"itemprop": "name"}).get_text()
        art_tag = [el.get_text() for el in soup.find_all("a", {"rel": "category tag"})]
        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

    @staticmethod
    def assign_scraper(url):
        """Run the scraper whose site marker occurs in ``url``.

        Parameters:
            url: url of the page to scrape
        Out:
            the chosen scraper's row (list or dict), or None when no marker
            matches
        """
        # Checked in order; the first marker found inside the url wins.
        scrapers = (
            ('https://changethework.com/', BigScraper.scrap_changethework),
            ('https://www.fnccr.asso.fr/article/', BigScraper.scrap_fncrr),
            ('http://sabbar.fr/', BigScraper.scrap_sabbar),
            ('https://www.theinnovation.eu/', BigScraper.scrap_theinnovation),
            ('https://www.lemondeinformatique.fr/', BigScraper.scrap_lemondeinformatique),
        )
        for marker, scraper in scrapers:
            if marker in url:
                return scraper(url)
        return None

    def scrap(self, url):
        """Scrape ``url``, append the row to ``self.df`` and return the row.

        Returns None, and adds nothing, when the url matches no known site.
        Calling this twice with the same url stores the article twice.
        """
        row = BigScraper.assign_scraper(url)
        self.add_row(row)
        return row

In [7]:
# Download one RiskInsight article and parse it; `soup` is inspected in the next cell.
url = "https://www.riskinsight-wavestone.com/2019/08/detecter-incidents-machine-learning/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

<!DOCTYPE doctype html>
<!--[if IE 7]><html class="ie7 oldie no-js"><![endif]-->
<!--[if IE 8]><html class="ie8 oldie no-js"><![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" prefix="og: http://ogp.me/ns#">
 <!--<![endif]-->
 <head>
  <link data-minify="1" href="https://www.riskinsight-wavestone.com/wp-content/cache/min/1/5f26d6b9838aa528c1b80140d0a624ba.css" rel="stylesheet"/>
  <link data-minify="1" href="https://www.riskinsight-wavestone.com/wp-content/cache/min/1/0e71746abeeadc9d718ec538f51d1acb.css" rel="stylesheet"/>
  <script data-minify="1" src="https://www.riskinsight-wavestone.com/wp-content/cache/min/1/cac6f9c2c045ad4e22777e7c4719d59e.js">
  </script>
  <script data-minify="1" src="https://www.riskinsight-wavestone.com/wp-content/cache/min/1/6cc755c7a6b85383a3b2f4926c3cba3b.js">
  </script>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="https:/

In [10]:
# Display the first <article> element of the parsed page.
soup.find('article')

<article><h1 class="post-page--title page--title">Détecter des incidents cyber par Machine Learning : notre maquette en 5 étapes clefs !</h1><p class="post-page--category"> <a class="category category_3225" href="https://www.riskinsight-wavestone.com/category/rubriques/ethical-hacking-indicent-response/">Ethical Hacking &amp; Incident Response</a></p><div class="tag"><ol class="tag--list reset-ul inbl"><li class="tag--item inbl"><a class="tag--link" href="https://www.riskinsight-wavestone.com/tag/detection/">#détection</a></li><li class="tag--item inbl"><a class="tag--link" href="https://www.riskinsight-wavestone.com/tag/intelligence-artificielle/">#Intelligence Artificielle</a></li><li class="tag--item inbl"><a class="tag--link" href="https://www.riskinsight-wavestone.com/tag/machine-learning/">#Machine learning</a></li><li class="tag--item inbl"><a class="tag--link" href="https://www.riskinsight-wavestone.com/tag/soc/">#SOC</a></li><li class="tag--item inbl"><a class="tag--link" href

In [5]:
def remplir_dataFrame(url):
    """
    Fill row 0 of the global DataFrame ``df`` with the fields of one article.

    The page at ``url`` is downloaded and its first ``<article>`` element is
    stored both as plain text (``art_content``) and as an HTML string
    (``art_content_html``). The remaining fields come from the ``get_*``
    helpers, which are defined elsewhere in the notebook and each take the url
    (``get_lang`` takes the title).

    Parameters:
        url: url of the RiskInsight article to scrape
    Out:
        the global ``df``, modified in place (row 0 is overwritten on each call)
    Raises:
        AttributeError: if the page contains no ``<article>`` element
    """
    response = requests.get(url, timeout=30)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    # Read from the page fetched just above, not from the notebook-global
    # `soup`, which may hold a different page.
    article_tag = html_soup.find('article')
    art_content = article_tag.get_text()

    df.loc[0, 'src_type'] = 'xpath_source'
    df.loc[0, 'src_url'] = 'https://www.riskinsight-wavestone.com/'
    df.loc[0, 'src_name'] = 'RiskInsight'
    df.loc[0, 'art_url'] = url
    df.loc[0, 'art_auth'] = get_Author(url)
    df.loc[0, 'art_extract_datetime'] = get_Time(url)
    title = get_Title(url)
    df.loc[0, 'art_title'] = title
    df.loc[0, 'art_lang'] = get_lang(title)
    tag = get_Tag(url)
    if tag is None or tag==[]:
        # placeholder stored when the article has no tags
        tag = 'no data'
    df.loc[0, 'art_tag'] = tag
    df.loc[0, 'art_content'] = art_content
    # Store the markup as a string: calling the Tag (as the previous code did)
    # runs a find_all() search instead of returning the article's HTML.
    df.loc[0, 'art_content_html'] = str(article_tag)
    df.loc[0, 'src_img'] = get_Img(url)
    return df

"\nLe fournisseur américain de logiciels et de services de ressources humaines\xa0Ceridian\xa0parie sur le cloud hybride, la virtualisation et l'automatisation des réseaux\xa0pour\xa0améliorer la\xa0livraison\xa0de ses services\xa0IT, éliminer les\xa0points de friction\xa0et renforcer la sécurité.\xa0L’entreprise\xa0GRH\xa0a récemment\xa0bouclé\xa0sa transition vers une architecture\xa0cloud,\xa0après avoir fermé\xa0ses\xa0datacenters\xa0sur site et migré\xa0ses applications et ses systèmes de back-office vers plusieurs\xa0clouds.\xa0«\u202fNous sommes un véritable consommateur de technologie\xa0de cloud\xa0hybride\u202f»,\xa0a\xa0déclaré\xa0Warren Perlman,\xa0le DSI de\xa0Ceridian.\xa0«\u202fNous opérons\xa0aussi bien\xa0dans\xa0VMware\xa0Cloud\xa0on\xa0AWS,\xa0AWS\xa0en\xa0natif, que dans Azure\xa0en\xa0natif\u202f»,\xa0a-t-il\xa0ajouté.\xa0«\u202fPour\xa0ce qui est de\xa0nos opérations de cloud privé, en partenariat avec une tierce partie, nous utilisons la suite VMware\u202f»,\xa0a

In [6]:
# Create the scraper and collect a changethework.com article; scrap() also appends the row to BG.df.
BG = BigScraper()
row = BG.scrap('https://changethework.com/chatbot-rh-recrutement/')

In [7]:
# Show the rows collected so far (one row after the changethework scrape).
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",


In [8]:
# Collect an fnccr.asso.fr article; this scraper returns its row as a dict.
row = BG.scrap("https://www.fnccr.asso.fr/article/big-data-territorial-publication-de-letude-de-la-fnccr/")

In [9]:
# Show the accumulated rows after the fnccr scrape.
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",
1,Si les regards se tournent souvent vers les gr...,[[Si les regards se tournent souvent vers les ...,2017-02-20T14:40:36+01:00,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,,,


In [10]:
# Collect a sabbar.fr article (the url carries a text-fragment suffix after '#').
row = BG.scrap('http://sabbar.fr/management/le-management-strategique-et-le-management-operationnel/#:~:text=Le%20management%20op%C3%A9rationnel%20correspond%20aux,pour%20atteindre%20les%20objectifs%20fix%C3%A9s.')

In [11]:
# Show the accumulated rows after the sabbar scrape.
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",
1,Si les regards se tournent souvent vers les gr...,[[Si les regards se tournent souvent vers les ...,2017-02-20T14:40:36+01:00,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,,,
2,Qu'est-ce que le management ? Le management ...,"[\n, [\n, [ ], \n], \n, [\n, [<span style=""fon...",2014-02-02T09:38:43+00:00,fr_FR,Le management stratégique et le management opé...,http://sabbar.fr/management/le-management-stra...,Sabbar,xpath_source,http://sabbar.fr/management/le-management-stra...,,,


In [17]:
# Collect a theinnovation.eu article.
# NOTE: scrap() always appends, so re-running this cell stores the same article again
# (rows 3 and 4 in the output below are identical).
row = BG.scrap('https://www.theinnovation.eu/comment-tuer-linnovation-avec-lanalyse-financiere/45')

In [18]:
# Show the accumulated rows after the theinnovation scrape.
BG.df

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28T07:25:43+00:00,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",
1,Si les regards se tournent souvent vers les gr...,[[Si les regards se tournent souvent vers les ...,2017-02-20T14:40:36+01:00,fr,"""Big data territorial"" : Publication de l'étud...",https://www.fnccr.asso.fr/article/big-data-ter...,fnccr,xpath_source,https://www.fnccr.asso.fr/,,,
2,Qu'est-ce que le management ? Le management ...,"[\n, [\n, [ ], \n], \n, [\n, [<span style=""fon...",2014-02-02T09:38:43+00:00,fr_FR,Le management stratégique et le management opé...,http://sabbar.fr/management/le-management-stra...,Sabbar,xpath_source,http://sabbar.fr/management/le-management-stra...,,,
3,\nFiche de lecture\nInnovation killers : how f...,"[\n, [[<strong>Fiche de lecture</strong>]], \n...",2010-05-14T15:55:01+00:00,fr_FR,Comment tuer l’innovation avec l’analyse finan...,https://www.theinnovation.eu/comment-tuer-linn...,TheInnovation,xpath_source,https://www.theinnovation.eu/,https://s0.wp.com/i/blank.jpg,Guy Parmentier,[]
4,\nFiche de lecture\nInnovation killers : how f...,"[\n, [[<strong>Fiche de lecture</strong>]], \n...",2010-05-14T15:55:01+00:00,fr_FR,Comment tuer l’innovation avec l’analyse finan...,https://www.theinnovation.eu/comment-tuer-linn...,TheInnovation,xpath_source,https://www.theinnovation.eu/,https://s0.wp.com/i/blank.jpg,Guy Parmentier,[]
