In [None]:
import requests
from bs4 import BeautifulSoup
import datetime
from textblob import TextBlob
import pandas as pd
import json
import numpy as np
import re
from requests import get


In [None]:
# [art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth]

In [10]:
class BigScraper:
    cols = ['art_content', 'art_content_html', 'art_published_datetime', 'art_lang', 'art_title',
            'art_url', 'src_name', 'src_type', 'src_url', 'art_img', 'art_auth', 'art_tag']

    def __init__(self):
        self.df = pd.DataFrame(columns=BigScraper.cols)

    def add_row(self, row_scrap):
        if type(row_scrap) == list:
            self.df.loc[len(self.df)] = row_scrap
        elif type(row_scrap) == dict:
            self.df = self.df.append(row_scrap, ignore_index=True)

    def get_base_url(url):
        for val in re.finditer("(\w)+://[^/]+/", url):
            return val.group(0)

    #Jason    
        
    def scrap_changethework(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_html = soup.find('div', {'style': 'text-align: justify;'})
        art_content = art_html.get_text().strip().replace('\xa0', '')
        if soup.find('meta', {'property': 'article:modified_time'})['content'] == None:
            if soup.find('meta', {'property': 'article:published_time'})['content'] == None:
                art_extract_datetime = datetime.date.today()
            else:
                art_extract_datetime = soup.find(
                    'meta', {'property': 'article:published_time'})['content']
                art_extract_datetime = datetime.datetime.strptime(
                    art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        else:
            art_extract_datetime = soup.find(
                'meta', {'property': 'article:modified_time'})['content']
            art_extract_datetime = datetime.datetime.strptime(
                art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        art_lang = TextBlob(art_content).detect_language()
        art_title = soup.find('meta', {'property': 'og:title'})['content']
        art_url = soup.find('meta', {'property': 'og:url'})['content']
        src_name = soup.find('meta', {'property': 'og:site_name'})['content']
        src_type = 'xpath_source'
        src_url = BigScraper.get_base_url(art_url)
        src_img = soup.find('meta', {'property': 'og:image'})['content']
        art_auth = [el.get_text().strip() for el in soup.find_all(
            'span', class_='elementor-post-author')]
        art_tag = np.nan
        return [art_content, art_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

    #Marianne
    
    def scrap_fncrr(url):
        '''Documentation
        Parameters:
            url: url of the scraped page
        Out:
            row: dict of values
        '''
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        # content, content_html
        try:
            content = html_soup.find("div", {'class': "contenu_c"})
            content_html = content
            content = content.text
        except:
            content_html = np.nan
            content = np.nan
        # date
        if html_soup.find("time", {'class': "updated"}) != None:
            date = html_soup.find("time", {'class': "updated"})
        else:
            date = html_soup.find("time", {'class': 'entry-date published'})
        try:
            date = date['datetime']
            date = datetime.datetime.strptime(
                date, '%Y-%m-%dT%H:%M:%S%z').date()
        except:
            # if no date is specified, put scraping date
            date = datetime.date.today()
        # tag, title
        presentation = html_soup.find("div", {'class': "prensentation"})
        tag = np.nan  # tags are not always interesting
        title = presentation.find("h1")
        title = title.text
        # Remplissage du dataframe
        row = {'art_content': content,
               'art_content_html': content_html,
               'art_published_datetime': date,
               'art_lang': 'fr',
               'art_title': title,
               'art_url': url,
               'src_name': 'fnccr',
               'src_type': 'xpath_source',
               'src_url': BigScraper.get_base_url(url),
               'art_img': np.nan,  # No images
               'art_auth': np.nan,  # No author specified
               'art_tag': tag}
        return row
    
    def scrap_cnil(url):
        
        '''Documentation

        Parameters:
            url: url of the scraped page

        Out:
            new_row: data to put in dataframe
        '''
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')
        #content, content_html
        try:
            content_html = html_soup.find("div",{'class':"field-item even"})
            content = content_html.text
        except:
            #maybe find a way to take into account multiple article structures instead
            content_html = np.nan
            content = np.nan
        #date
        date = html_soup.find("div",{'class':"ctn-gen-auteur"}).text
        
        if date is None:
            date = datetime.date.today()
        else:
            print(date)
            trans_month = {'01':['janvier'], 
            '02':['février'],
            '03':['mars'],
            '04':['avril'],
            '05':['mai'],
            '06':['juin'],
            '07':['juillet'],
            '08':['août'],
            '09':['septembre'],
            '10':['octobre'],
            '11':['novembre'],
            '12':['décembre']}
            
            date_tab = date.split(" ")
            day = date_tab[0]
            month = date_tab[1]
            for m in trans_month:
                if month.lower() in trans_month[m]:
                    month = m
            year = date_tab[2]
            date = datetime.date(int(year), int(month), int(day))
           
    
    
        #title
        zone_title = html_soup.find("div",{'class':"ctn-gen-titre"})
        title = zone_title.find("h1")
        title = title.text
        #img
        try:
            zone_img = html_soup.find("div",{'class':"ctn-gen-visuel"})
            img = zone_img.find("img")['src']
        except:
            img = "no_data"
        #tag
        zone_tag = html_soup.find("div",{'class':"mots cles"})
        try:
            tags_li_list = zone_tag.find_all("li")
            tags_list = []
            for tag in tags_li_list:
                tags_list.append(tag.text[1:]) #[1:] to remove "#"
        except:
            tags_list = "no_data"
        # add data to dataframe 
        new_row = {'art_content': content ,
                   'art_content_html': content_html ,
                   'art_published_datetime': date ,
                   'art_lang': 'fr' , 
                   'art_title' : title , 
                   'art_url' : url ,
                   'src_name' :'cnil'  ,
                   'src_type' : 'xpath_source' ,
                   'src_url' : 'https://www.cnil.fr/',
                   'src_img' : img ,
                   'art_auth': "no_data", # No author specified
                   'art_tag': tags_list}
        print(date)
        return new_row
    
    def scrap_jdn(url):
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')
        # content_html, content (maybe clean a little the content)
        try:
            content_html = html_soup.find("div",{'id':"jArticleInside"})
            content = content_html.text
        except:
            content_html = np.nan
            content = np.nan
        #date
        try:
            date = html_soup.find("time",{'itemprop':"publishDate"})['datetime']
            format_end = date[-5:]
            date = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%S+'+format_end)
            date = datetime.date.strftime('%Y-%m-%d')
            #possibly change where the date is extracted
            #see <script type="application/ld+json"> 
        except:
            date = datetime.datetime.today().strftime('%Y-%m-%d')
        #title
        try:
            zone_title = html_soup.find("div",{'id':"jStickySize"})
            title = zone_title.find("h1")
            title = title.text
        except:
            title = "no_data"
        #img
        try:
            zone_img = content_html.find("p",{'class':"app_entry_lead"})
            img = zone_img.find("img")['src']
        except:
            img = "no_data"
        #author
        try:
            link_author = html_soup.find("a",{'rel':"author"})
            author = link_author.text
        except:
            author = "no_data"
        #tags
        head = html_soup.find("head")
        scripts_list = head.find_all("script")
        script = str(scripts_list[1])
        pattern = re.compile("keywords: \[(\"(\w|\-|\d)*\",?)*\]")
        match =  re.search(pattern, script)
        list_tag_str = match.group(0)
        list_tag_str = list_tag_str[11:-1]
        list_tag_str = list_tag_str.replace("-"," ")
        list_tag = list_tag_str.split(",")
        #data 
        new_row = {'art_content': content ,
                   'art_content_html': content_html ,
                   'art_published_datetime': date ,
                   'art_lang': 'fr' , 
                   'art_title' : title , 
                   'art_url' : url ,
                   'src_name' :'journal du net'  ,
                   'src_type' : 'xpath_source' ,
                   'src_url' : 'https://www.journaldunet.com/',
                   'art_img' : img ,
                   'art_auth': author, # No author specified
                   'art_tag': list_tag}
        return new_row
    
    

    def scrap_zdnet(url):
        '''Documentation

        Parameters:
            url: url of the scraped page

        Out:
            new_row: data to put in dataframe
        '''
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')
        # content, html_content
        content_html = html_soup.find("div",{'class':"storyBody"})
        content = content_html.text
        # date, author
        zone_infos = html_soup.find("div",{'class':"byline"})
        zone_infos = zone_infos.find("p",{'class':"meta"})
        ## author
        try:
            zone_author = zone_infos.find("span")
            author = zone_author.find("span").text
        except:
            author = "no_data"
        ## date
        date = zone_infos.find("time")['datetime']
        format_end = date[-5:]
        date = datetime.datetime.strptime(date,'%Y-%m-%dT%H:%M:%S+'+format_end)
        date = date.strftime('%Y-%m-%d')
        # title
        title = html_soup.find("h1").text
        # img
        try:
            img = content_html.find("img")['src']
        except:
            img = "no_data"
        #tags
        zone_tags = html_soup.find("p",{'class':"relatedTopics"})
        list_tags_links = zone_tags.find_all("a")
        list_tags = []
        for link in list_tags_links:
            list_tags.append(link.text)
        # data to add in dataframe 
        new_row = {'art_content': content_html ,
                   'art_content_html': content ,
                   'art_published_datetime': date ,
                   'art_lang': 'fr' , 
                   'art_title' : title , 
                   'art_url' : url ,
                   'src_name' :'zdnet',
                   'src_type' : 'xpath_source',
                   'src_url' : 'https://www.zdnet.fr/',
                   'src_img' : img ,
                   'art_auth': author ,
                   'art_tag': list_tags}
        return new_row


    

    #Louis
    
    def scrap_sabbar(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Récupération du contenu de la page web (les paragraphes, avec et sans les balises html)
        art_content_html = soup.find('div', class_="entry-content")
        art_content = art_content_html.get_text().replace("\xa0", "").strip()
        # Extraction de la date de l'article
        art_extract_datetime = json.loads(soup.find(
            'script', class_='yoast-schema-graph yoast-schema-graph--main').get_text())['@graph'][1]['dateModified']
        art_extract_datetime = datetime.datetime.strptime(
            art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        # Langue de l'article
        art_lang = soup.find('meta', property="og:locale").get('content')
        # Titre
        art_title = soup.find('meta', property="og:title").get('content')
        # Url
        art_url = soup.find('link', rel='canonical').get('href')
        # Nom de la source
        src_name = "Sabbar"
        # Type de la source
        src_type = "xpath_source"
        # url source
        src_url = BigScraper.get_base_url(art_url)
        # Image(s)
        src_img = np.nan
        # Auteur de l'article
        art_auth = np.nan
        # Tag de l'auteur
        art_tag = np.nan

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, \
                src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    def scrap_lebigdata(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_content_html = soup.find("article")
        art_content = art_content_html.get_text().replace('\xa0', '')
        if soup.find('meta', property='article:modified_time') is not None:
            art_published_datetime = soup.find(
                'meta', property='article:modified_time').get('content')
            art_published_datetime = datetime.datetime.strptime(
                art_published_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        elif soup.find('meta', property='article:published_time') is not None:
            art_published_datetime = soup.find(
                'meta', property='article:published_time').get('content')
            art_published_datetime = datetime.datetime.strptime(
                art_published_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        else:
            art_published_datetime = datetime.date.today()
        art_lang = soup.find('meta', property='og:locale').get('content')
        art_title = soup.find('meta', property="og:title").get('content')
        art_url = soup.find('meta', property="og:url").get('content')
        src_name = soup.find('meta', property="og:site_name").get('content')
        src_type = "xpath_source"
        src_url = BigScraper.get_base_url(art_url)
        art_img = soup.find('meta', property="og:image").get('content')
        art_auth = soup.find(
            'meta', attrs={'name': "twitter:data1"}).get('content')
        art_tag = np.nan
        return [art_content, art_content_html, art_published_datetime, art_lang, art_title, art_url, \
                src_name, src_type, src_url, art_img, art_auth, art_tag]
    
    def scrap_cadre(url):
        
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        art_content_html = soup.find('div', class_ = 'td-post-content').find_all('p')
        art_content = "".join([x.text for x in art_content_html])

        #Extraction de la date de l'article
        art_extract_datetime = soup.find('meta', property = "article:modified_time").get('content')

        #Langue de l'article
        art_lang = soup.find('meta', property = "og:locale").get('content')

        #Titre
        art_title = soup.find('meta', property = "og:title").get('content')

        #Url
        art_url = soup.find('link', rel = 'canonical').get('href')

        #Nom de la source
        src_name = soup.find('meta', property = "og:site_name").get('content')

        #Type de la source
        src_type = "xpath_source"

        #url source
        src_url = soup.find('form', class_ = 'td-search-form').get('action')

        #Image(s)
        src_img = soup.find('meta', property = "og:image").get('content')

        #Auteur de l'article
        art_auth = soup.find('div', class_ = "td-post-author-name").text

        #Tag de l'auteur
        art_tag = np.nan

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title,\
            art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    
    def scrap_sap(url):
        
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
    
        art_content_html = soup.find_all('div', class_="parContent")
        art_content = " ".join([x.text for x in art_content_html])

        date = soup.find('video', class_='video-js vjs-default-skin vjs-big-play-centered vjs-fluid').get('data-publishingdate')
        art_published_datetime = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z').date()

        art_lang = soup.find('meta', attrs={'name':"language"}).get('content')

        art_title = soup.find('meta', property = "og:title").get('content')

        art_url = soup.find('meta', property = "og:url").get('content')

        src_name = soup.find('meta', property = "og:site_name").get('content')

        src_type = "xpath_source"

        src_url = soup.find('meta', property = "og:site_name").get('content') + ".com"

        src_img = soup.find('meta', property = "og:image").get('content') 

        art_auth = "no_data"

        art_tag = soup.find('meta', attrs={'name':"keywords"}).get('content')

        return [art_content, art_content_html, art_published_datetime, art_lang, art_title, art_url, \
                src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    def scrap_datagouv(url):
    
    
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        if soup.find_all('section', class_="content noncertified") == [] :
            art_content_html = soup.find_all('section', class_="content certified")
        else :
            art_content_html = soup.find_all('section', class_="content noncertified")

        art_content = " ".join([x.text for x in art_content_html])

        art_extract_datetime = datetime.date.today()

        art_lang = soup.find('html').get('lang')

        art_title = soup.find('meta', property = "og:title").get('content')

        art_url = soup.find('link', rel = 'canonical').get('href')

        src_type = "xpath_source"

        for val in re.finditer("(\w)+://[^/]+/", url):
            src_url = val.group(0)

        src_name = src_url.replace('https://','').replace('/','')

        src_img = soup.find('meta', property = "og:image").get('content')

        art_auth = soup.find('link', rel='author').get('href')

        art_tag = []
        tags = soup.find_all('a', class_ = "label label-default")
        for x in tags:
            art_tag.append(x.get('title'))

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, \
                src_name, src_type, src_url, src_img, art_auth, art_tag] 
    
    def scrap_blockchain(url):
    
    
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        art_content_html = soup.find_all('div', class_="site-content")
        art_content = " ".join([x.text for x in art_content_html])

        date = soup.find('meta', property='article:modified_time').get('content')
        art_extract_datetime = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z').date()    

        art_lang = soup.find('meta', attrs={'property':"og:locale"}).get('content')

        art_title = soup.find('meta', property = "og:title").get('content')

        art_url = soup.find('meta', property = "og:url").get('content')

        src_type = "xpath_source"

        for val in re.finditer("(\w)+://[^/]+/", url):
            src_url = val.group(0)

        src_name = src_url.replace('https://','').replace('/','')

        src_img = soup.find('meta', property = "og:image").get('content')

        art_auth = "no_data"

        if soup.find('meta', attrs={'name':"keywords"}) :
            art_tag =soup.find('meta', attrs={'name':"keywords"})   
        else :
            art_tag = "no_data"

        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, \
                src_name, src_type, src_url, src_img, art_auth, art_tag]
    


    #Michael

    def scrap_theinnovation(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_content_html = soup.find("div", {"class": "entry-content"})
        art_content = art_content_html.text.replace('\xa0', '')
        if soup.find("meta", {"property": "article:modified_time"}) != None:
            art_extract_datetime = soup.find(
                "meta", {"property": "article:modified_time"})["content"]
            art_extract_datetime = datetime.datetime.strptime(
                art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        elif soup.find("meta", {"property": "article:published_time"}) != None:
            art_extract_datetime = soup.find(
                "meta", {"property": "article:published_time"})["content"]
            art_extract_datetime = datetime.datetime.strptime(
                art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        else:
            art_extract_datetime = datetime.date.today()
        art_lang = soup.find("meta", {"property": "og:locale"})["content"]
        art_title = soup.find("meta", {"property": "og:title"})["content"]
        art_url = soup.find("meta", {"property": "og:url"})["content"]
        src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        src_type = "xpath_source"  # default value
        src_url = BigScraper.get_base_url(art_url)
        src_img = soup.find("meta", {"property": "og:image"})["content"]
        art_auth = soup.find("a", {"rel": "author"}).text
        art_tag = soup.find("meta", {"name": "keywords"})["content"].split(',')
        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url,
                src_name, src_type, src_url, src_img, art_auth, art_tag]
    
    def scrap_myrhline(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        art_content_html = soup.find("div", {"class": "post-detail-wrap"})
        if art_content_html is None:
            art_content_html = "no_data"

        art_content = art_content_html.get_text().replace('\xa0', '').strip()
        if art_content is None:
            art_content = "no_data"

        if soup.find("meta", {"property": "article:modified_time"}) is not None:
            art_published_datetime = soup.find("meta", {"property": "article:modified_time"})["content"]
            art_published_datetime = datetime.datetime.strptime(art_published_datetime, '%Y-%m-%dT%H:%M:%S%z').date()

        elif soup.find("meta", {"property": "article:published_time"}) is not None:
            art_published_datetime = soup.find("meta", {"property": "article:published_time"})["content"]
            art_published_datetime = datetime.datetime.strptime(art_published_datetime, '%Y-%m-%dT%H:%M:%S%z').date()

        else:
            art_published_datetime = datetime.date.today()

        if art_content is not None:
            art_lang = TextBlob(art_content).detect_language()
        elif soup.find("meta", {"property": "og:locale"}) is not None:
            art_lang = soup.find("meta", {"property": "og:locale"})["content"]   
        else:
            art_lang = "no_data"

        if soup.find("meta", {"property": "og:title"}) is not None:
            art_title = soup.find("meta", {"property": "og:title"})["content"]
        elif soup.find("title") is not None:
            art_title = soup.find("title").text
        else:
            art_title = "no_data"

        if soup.find("meta", {"property": "og:url"}) is not None:
            art_url = soup.find("meta", {"property": "og:url"})["content"]
        else:
            art_url = url

        if soup.find("meta", {"property": "og:site_name"}) is not None:
            src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        else:
            src_name = "no_data"

        src_type = "xpath_source" #default value  

        src_url = BigScraper.get_base_url(art_url)

        if soup.find("meta", {"property": "og:image"}) is not None:
            art_img = soup.find("meta", {"property": "og:image"})["content"]
        else:
            art_img = "no_data"

        if soup.find("meta", {"name": "author"}) is not None:   
            art_auth = soup.find("meta", {"name": "author"})["content"]
        else:
            art_auth = "no_data"

        if soup.find_all("a", {"rel": "tag"}) is not None:
            art_tag = [tag.text for tag in soup.find_all("a", {"rel": "tag"})]
        else:
            art_tag = "no_data"


        return [art_content, art_content_html, art_published_datetime, art_lang, art_title, art_url,\
            src_name, src_type, src_url, art_img, art_auth, art_tag]
    
    def scrap_usinedigitale(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        #retrieval of the html content
        art_content_html = soup.find("article", {"class":"contenuArticle"})
        if art_content_html is None:
            art_content_html = "no_data"

        #retrieval of the article content    
        art_content = art_content_html.text
        if art_content is None:
            art_content = "no_data"   

        #retrieval of the publication/modification date    
        if soup.find("time", {"class": "dateEtiquette3"}) is None:
            art_published_datetime = datetime.datetime.now()
        else:    
            art_published_datetime = soup.find("time", {"class": "dateEtiquette3"})["datetime"]
            art_published_datetime = datetime.datetime.strptime(art_published_datetime, '%Y-%m-%dT%H:%M').date()

        #retrieval of the language
        if art_content is not None:
            art_lang = TextBlob(art_content).detect_language()    
        elif soup.find("meta", {"property": "og:locale"}) is not None:
            art_lang = soup.find("meta", {"property": "og:locale"})["content"]
        else:
            art_lang = "no_data"

        #retrieval of the title
        if soup.find("meta", {"property": "og:title"}) is not None:
            art_title = soup.find("meta", {"property": "og:title"})["content"]
        elif soup.find("title") is not None:
            art_title = soup.find("title").text
        else:
            art_title = "no_data"

        #retrieval of the article url   
        art_url = soup.find("meta", {"property": "og:url"})

        if art_url is None:
            art_url = url
        else:    
            art_url = art_url["content"]

        #retrieval of the website name
        if soup.find("meta", {"name": "ipd:siteName"}) is not None:
            src_name = soup.find("meta", {"name": "ipd:siteName"})["content"]
        elif soup.find("meta", {"property": "og:site_name"}) is not None:
            src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        else:
            src_name = "no_data"

        #retrieval of the source type
        src_type = "xpath_source" #default value

        #retrieval of the source url
        src_url = BigScraper.get_base_url(art_url)
        if src_url is None:
            src_url = "no_data"

        #retrieval of the article image
        art_img = soup.find("meta", {"property": "og:image"})
        if art_img is None:
            art_img = "no_data" 
        else:
            art_img = art_img["content"]

        #retrieval of the article author
        art_auth = soup.find("a", {"class": "nomAuteur"})
        if art_auth is None:   
            art_auth = "no_data"
        else:
            art_auth = art_auth.text

        #retrieval of the article tags
        if soup.find_all("a", {"rel": "tag"}) is not None:
            art_tag = [tag.text for tag in soup.find_all("a", {"rel": "tag"}) ]
        else:
            art_tag = "no_data"


        return [art_content, art_content_html, art_published_datetime, art_lang, art_title,\
            art_url, src_name, src_type, src_url, art_img, art_auth, art_tag]

    #Rémy
    
    def scrap_lemondeinformatique(url):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_content_html = soup.find('div', class_='article-body')
        art_content = art_content_html.get_text().replace('\xa0', '').strip()
        if soup.find("meta", {"itemprop": "datePublished"}) is not None:
            art_extract_datetime = soup.find(
                "meta", {"itemprop": "datePublished"})['content']
            art_extract_datetime = datetime.datetime.strptime(
                art_extract_datetime, '%Y-%m-%dT%H:%M:%S%z').date()
        else:
            art_extract_datetime = datetime.date.today()
        art_langue = TextBlob(get_Title(art_content)).detect_language()
        art_title = soup.find("meta", {"property": "og:title"})["content"]
        art_url = soup.find("meta", {"property": "og:url"})["content"]
        src_name = soup.find("meta", {"property": "og:site_name"})["content"]
        src_type = 'xpath_source'
        src_url = BigScraper.get_base_url(art_url)
        src_img = soup.find("meta", {"property": "og:image"})["content"]
        art_auth = soup.find(
            "div", class_="author-infos").find("b", {"itemprop": "name"}).get_text()
        art_tag = [el.get_text()
                   for el in soup.find_all("a", {"rel": "category tag"})]
        return [art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag]

    def scrap_erudit(url : str) -> tuple :
        """
        function which from a url returns all the data collected with the functions above in a tuple
        """
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')
        #Retrieval of the content of the article with the html tags
        art_content_html = html_soup.find("section",{"id":"s1n1"})
        #Removal of the html tags and replacement of '\xa0' by ''
        art_content = art_content_html.text.replace('\xa0','')
        #Retrieval of the date and conversion to the datetime format
        art_published_datetime = datetime.datetime.strptime(html_soup.find("meta",{"name":"citation_online_date"})['content'], "%Y/%m/%d").date()
        #Analysis of the language of the text with the TextBlob library
        art_lang = TextBlob(art_content).detect_language()
        #Retrieval of the title in meta property, replacing '\xa0' by ''
        art_title = html_soup.find("meta",{"property":"og:title"})['content'].replace('\xa0','')
        #Retrieval of the url in meta property
        art_url = html_soup.find("meta",{"property":"og:url"})['content']
        #Retrieval of the website's name in meta property
        src_name = html_soup.find("meta",{"property":"og:site_name"})['content']
        src_type = 'xpath_source' #default value 
        src_url = 'https://www.erudit.org/fr/'
        #Concatenation of the base url of the website and the end of the url of the image representing the article
        art_img = 'https://www.erudit.org'+html_soup.find("meta",{"property":"og:image"})['content']
        #Retrieval of a list of the author(s) of the article
        art_auth = [el.text.replace('\n      ',' ') for el in html_soup.find_all("span",{"class":"nompers"})]
        #No tags found on this website
        art_tag = 'no_data'
        return [art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,\
                src_type,src_url,art_img,art_auth,art_tag]
    
    def scrap_citmar(url):
        
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')

        paragraphe = html_soup.find_all('p')
        art_content_html=" ".join([str(x) for x in paragraphe])
        art_content=" ".join([x.text for x in paragraphe])

        art_published_datetime = html_soup.find("time",{"class":"entry-date published"})['content']

        language = TextBlob(art_content)
        art_lang = language.detect_language()

        art_title = html_soup.find("h1",{"class":"hestia-title entry-title"}).text

        art_url = url

        src_name = 'citoyen-ne-s-de-marseille'

        src_type = 'xpath_source'

        src_url = 'https://citoyen-ne-s-de-marseille.fr/'

        art_img = 'no_data'

        art_auth = html_soup.find("strong",{"class":"fn"}).text

        art_tag = 'no_data'

        return [art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,\
                src_name,src_type,src_url,art_img,art_auth,art_tag]

    def scrap_digitrec(url):
        
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        paragraphs = html_soup.find_all('p')
        art_content_html=" ".join([str(x) for x in paragraphs])
        art_content=" ".join([x.text for x in paragraphs])

        Datetemp = html_soup.find("ul",{"class":"list-inline infos"}).text
        art_extract_datetime = Datetemp.split("\n")[4]

        a = TextBlob(art_content)
        art_lang = a.detect_language()

        art_title = html_soup.find("meta",{"property":"og:title"})['content']

        art_url = url

        for val in re.finditer("(\w)+://[^/]+/", url):
            src_url = val.group(0)

        src_name = src_url.replace('https://','').replace('/','')

        src_type = 'xpath_source'

        src_img = html_soup.find("meta",{"property":"og:image"})['content']

        authortemp1 = html_soup.find("ul",{"class":"list-inline infos"}).text
        authortemp2 = authortemp1.split("\n")[5]
        author = authortemp2.split(" ")[1:]
        art_auth = str(author[0]+" "+author[1])

        art_tag = 'no_data'

        return [art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,\
                src_name,src_type,src_url,src_img,art_auth,art_tag]
    
    def scrap_hellofuture(url : str) -> tuple : #Scraping Rémy HelloFuture Orange
  
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')

        paragraphe = html_soup.find_all('p')
        art_content_html=" ".join([str(x) for x in paragraphe])
        art_content=" ".join([x.text for x in paragraphe])   

        Datetemp = html_soup.find("div",{"class":"article__content--author"})
        art_published_datetime = Datetemp.find("time")["datetime"]    

        a = TextBlob(art_content)
        art_lang = a.detect_language()

        art_title = html_soup.find("h1",{"class":"h1"}).text

        art_url = url

        src_name = 'hello_future_orange'

        src_type = 'xpath_source'

        for val in re.finditer("(\w)+://[^/]+/", url):
            src_url = val.group(0)    

        if html_soup.find("div",{"class":"article__media"}) :
            art_img = html_soup.find("div",{"class":"article__media"}).find("img")["src"]
        else :
            art_img = 'no_data'

        art_auth = 'nodata'

        art_tag = html_soup.find("div",{"class":"article__tag"}).find('img')['alt']

        return [art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,\
        src_name,src_type,src_url,art_img,art_auth,art_tag]
    
    def scrap_silicon(url : str) -> tuple : #Scraping Rémy silicon.fr
  
        req = get(url)
        html_soup = BeautifulSoup(req.text, 'html.parser')

        #Retrieval of the content of the article with the html tags
        art_content_html = html_soup.find("section",{"class":"article-content"})

        #Removal of the html tags and replacement of '\xa0' by ''
        art_content = art_content_html.text.replace('\xa0',' ')   

        #Retrieval of the date and conversion to the datetime format
        art_published_datetime = datetime.datetime.strptime(html_soup.find("meta",{"itemprop":"datePublished"})['content'],\
                                                   "%Y-%m-%dT%H:%M:%S%z").date() 
    
        #Analysis of the language of the text with the TextBlob library
        art_lang = TextBlob(art_content).detect_language()

        #Retrieval of the title in meta property, replacing '\xa0' by ''
        art_title = html_soup.find("meta",{"property":"og:title"})['content'].replace('\xa0',' ')

        art_url = url

        #Retrieval of the website's name in meta property
        src_name = html_soup.find("meta",{"property":"og:site_name"})['content']

        src_type = 'xpath_source'

        for val in re.finditer("(\w)+://[^/]+/", url):
            src_url = val.group(0)    

        #Retrieval of the image representing the article
        #Because this website the image can be found at two different places in the html code we use a if/else condition
        if html_soup.find("picture",{"class":"img"}) is not None:
            art_img = html_soup.find("picture",{"class":"img"}).find("source")["srcset"]
        else:
            art_img = html_soup.find("meta",{"itemprop":"image"})["content"]

        #Retrieval of the author of the article
        art_auth = html_soup.find("meta",{"itemprop":"author"})['content']

        #Retrieval of the tag(s) of the article in meta property, if there are no tags we return 'no_data'
        art_tag = [el['content'] for el in html_soup.find_all("meta",{"property":"article:tag"})]
        if art_tag == []:
            art_tag = 'no_data'

        return [art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,\
            src_name,src_type,src_url,art_img,art_auth,art_tag]
    
    #Sibel
    
    def scrap_riskinsight(url):
        """
        This function add different information of the webpage/article to the dataFrame
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        art_content_html = soup.find('article')
        art_content = art_content_html.get_text()
        src_type = 'xpath_source'
        art_url = soup.find('meta', {'property': 'og:url'})['content']
        src_url = BigScraper.get_base_url(art_url)
        src_name = soup.find('meta', {'property': 'og:site_name'})['content']
        if soup.find("meta", {"name": "twitter:data1"}) is not None:
            art_auth = soup.find("meta", {"name": "twitter:data1"})['content']
        else:
            art_auth = 'no_data'
        if soup.find("meta", {"property": "article:modified_time"}) is not None:
            date = soup.find("meta", {"property": "article:modified_time"})[
                "content"]
            art_published_datetime = datetime.datetime.strptime(
                date, '%Y-%m-%dT%H:%M:%S%z').date()
        elif soup.find("meta", {"property": "article:published_time"}) is not None:
            date = soup.find("meta", {"property": "article:published_time"})[
                "content"]
            art_published_datetime = datetime.datetime.strptime(
                date, '%Y-%m-%dT%H:%M:%S%z').date()
        else:
            art_published_datetime = datetime.date.today()
        art_title = soup.title.get_text()
        art_lang = TextBlob(art_content).detect_language()
        if soup.find_all("a", {'class': "tag--link"}) is not None:
            art_tag = [el.get_text()
                       for el in soup.find_all("a", {'class': "tag--link"})]
        else:
            art_tag = 'no_data'
        if soup.find("meta", {"property": "og:image"}) is not None:
            art_img = soup.find(
                "meta", {"property": "og:image"})["content"]
        else:
            art_img = 'no_data'
        return [art_content, art_content_html, art_published_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, art_img, art_auth, art_tag]

    def scrap_parlonsrh(url):
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        #content
        paragraphe = html_soup.find_all('p')
        content = " ".join([x.text for x in paragraphe])
        #content_html
        content_html = " ".join([str(x) for x in paragraphe])
        #time
        time = html_soup.find("span", {"class":"date updated value-title"})["title"]
        if time is None or time==[ ]:
            time = 'no data'
        else:
            trans_month = {'01':['janvier'], 
                 '02':['février'],
                 '03':['mars'],
                 '04':['avril'],
                 '05':['mai'],
                 '06':['juin'],
                 '07':['juillet'],
                 '08':['août'],
                 '09':['septembre'],
                 '10':['octobre'],
                 '11':['novembre'],
                 '12':['décembre']}
            date_tab = time.split(" ")
            day = date_tab[0]
            month = date_tab[1]
            for m in trans_month:
                if month.lower() in trans_month[m]:
                    month = m
            year = date_tab[2]
            time = datetime.date(int(year), int(month), int(day))
        #title
        html_title = html_soup.title
        title = html_title.get_text()
        #img
        img = html_soup.find('meta', {'property':'og:image'})['content']
        if img is None:
            img = 'no_data'
        #author
        author = html_soup.find("span",{"class":"fn"}).get_text()
        if author[11:29]=='La Team Parlons RH':
            author = author[11:29] 
        else:
            author = author[11:34]
        #tag
        html_tag = html_soup.find_all("meta",{'property':"article:tag"})
        tags = []
        for i in html_tag:
            tag_i = i['content']
            tags.append(tag_i)
        if tags is None or tags==[ ]:
            tags = 'no data'
        new_row = {'art_content': content ,
                   'art_content_html': content_html ,
                   'art_published_datetime': time ,
                   'art_lang': 'fr' , 
                   'art_title' : title , 
                   'art_url' : url ,
                   'src_name' : 'parlonsrh' ,
                   'src_type' : 'xpath_source' ,
                   'src_url' : 'https://www.parlonsrh.com/' ,
                   'src_img' : img ,
                   'art_auth': author,
                   'art_tag': tags}
        return new_row
    
    def scrap_inserm(url : 'str'):
        """Documentation
        function which from a url creates a BeautifulSoup object, then extract different informations about the article and the 
        source. Then researches informations about the article and the website. It finally returns all this data as a list

        Parameters:
            url(str): The url that we will scrap 

        Out:
            new_row: it contains some propreties of the article and the sources 

        """
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        src_type = 'xpath_source'
        src_url = 'https://www.inserm.fr/'
        src_name = 'Inserm'
        art_url = url
        art_published_datetime = html_soup.find('time').get_text()
        art_title = html_soup.title.get_text()
        art_img = 'no_data'
        html_tag = html_soup.find("a",{'rel':"category"})
        if html_tag is None:
            art_tag = 'no_data'
        else:
            art_tag = html_tag.get_text()
        paragraphe = html_soup.find_all('p')
        art_content_html = " ".join([str(x) for x in paragraphe])
        paragraphe = html_soup.find_all('p')
        art_content = " ".join([x.text for x in paragraphe])
        a = TextBlob(art_title)
        art_lang = a.detect_language()
        art_auth = "no_data"
        new_row = {'art_content': art_content ,
                   'art_content_html': art_content_html ,
                   'art_published_datetime': art_published_datetime ,
                   'art_lang': art_lang , 
                   'art_title' : art_title , 
                   'art_url' : art_url ,
                   'src_name' : src_name ,
                   'src_type' : src_type ,
                   'src_url' : src_url ,
                   'src_img' : art_img ,
                   'art_auth': art_auth ,
                   'art_tag': art_tag }

        return new_row
    
    def scrap_lemonde(url : 'str'):
        """Documentation
        function which from a url creates a BeautifulSoup object, then extract different informations about the article and the 
        source. Then researches informations about the article and the website. It finally returns all this data as a dict

        Parameters:
            url(str): The url that we will scrap 

        Out:
            new_row: it contains some propreties of the article and the sources 

        """
        response = requests.get(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')

        src_type = 'xpath_source'
        src_url = 'https://www.lemonde.fr/'
        src_name = "Le Monde"

        # find the article URL (art_url) 
        if html_soup.find("meta", {"property": "og:url"}) is not None:
            art_url = html_soup.find("meta", {"property": "og:url"})['content']
        else:
            art_url = url

        # find the article Title (art_title)  
        art_title = html_soup.title.get_text().replace('\xa0','')


        # find the article Author (art_auth)  
        if html_soup.find("meta",{"property":"og:article:author"}) is not None:
            art_auth = html_soup.find("meta",{"property":"og:article:author"})['content']
            if art_auth==[]: art_auth = 'no-data' 
        else:
            art_auth = 'no_data'


        # find the date of publication of the article (art_published_datetime) (format: datetime)  
        if html_soup.find("meta", {"property":"og:article:published_time"}) is not None:
            art_published = html_soup.find("meta", {"property":"og:article:published_time"})['content'][:10]      # take the date and remove the hour
            art_published_datetime = datetime.datetime.strptime(art_published,'%Y-%m-%d').date()  # put at the format datetime      
        # if there is no date, we replace None with the date of today
        else:
            art_published_datetime = datetime.datetime.today().date()     # 


        #src_img  
        if html_soup.find("figure", {"class":"article__media"}) is not None:
            art_img = html_soup.find("figure", {"class":"article__media"}).find('img')['src']
        else:
            art_img = 'no_data'


        #art_tag  
        #art_tag = json.loads(html_soup.find('script', type = 'application/ld+json')['@type'])
        art_tag = "no_data"


        #art_content_html  and  art_content
        try:
            art_content_html_corps = html_soup.find("article", {"class": "article__content old__article-content-single"})
            if html_soup.find("p", {"class": "article__desc"}) is not None:
                art_content_html_intro = html_soup.find("p", {"class": "article__desc"}) #prend une sous balise en trop 
                art_content_html = [art_content_html_intro,art_content_html_corps]  
                art_content = (art_content_html_intro.get_text() + art_content_html_corps.get_text()).replace('\xa0','')
            else:
                art_content_html = art_content_html_corps
                art_content = art_content_html_corps.get_text().replace('\xa0','')
                # REPLACE NE MARCHE PAS
        except:
            #problems with https://www.lemonde.fr/transition-ecologique/article/2020/08/03/les-villes-et-leurs-jumeaux-numeriques_6048030_179.html
            art_content_html = np.nan
            art_content = np.nan

        #art_lang   
        art_lang = 'fr'

        #return art_content
        new_row = {'art_content': art_content ,
                   'art_content_html': art_content_html ,
                   'art_published_datetime': art_published_datetime ,
                   'art_lang': art_lang , 
                   'art_title' : art_title , 
                   'art_url' : art_url ,
                   'src_name' : src_name ,
                   'src_type' : src_type ,
                   'src_url' : src_url ,
                   'src_img' : art_img ,
                   'art_auth': art_auth ,
                   'art_tag': art_tag }
        return new_row

 
    def assign_scraper(url):
        #Jason
        if 'https://changethework.com/' in url:
            return BigScraper.scrap_changethework(url)
        #Marianne
        elif 'https://www.fnccr.asso.fr/article/' in url:
            return BigScraper.scrap_fncrr(url)
        elif 'https://www.cnil.fr/' in url:
            return BigScraper.scrap_cnil(url)
        elif 'https://www.journaldunet.com/' in url:
            return BigScraper.scrap_jdn(url)
        elif 'https://www.zdnet.fr/' in url:
            return BigScraper.scrap_zdnet(url)
        #Louis
        elif 'http://sabbar.fr/' in url:
            return BigScraper.scrap_sabbar(url)
        elif 'https://www.lebigdata.fr/' in url:
            return BigScraper.scrap_lebigdata(url)
        elif 'https://www.cadre-dirigeant-magazine.com/' in url:
            return BigScraper.scrap_cadre(url)
        elif 'https://www.sap.com/' in url:
            return BigScraper.scrap_sap(url)
        elif 'https://www.data.gouv.fr/' in url:
            return BigScraper.scrap_datagouv(url)
        elif 'https://blockchainfrance.net' in url:
            return BigScraper.scrap_blockchain(url)
        #Michael
        elif 'https://www.theinnovation.eu/' in url:
            return BigScraper.scrap_theinnovation(url)
        elif 'https://www.myrhline.com/' in url:
            return BigScraper.scrap_myrhline(url)
        elif 'https://www.usine-digitale.fr/' in url:
            return BigScraper.scrap_usinedigitale(url)
        #Rémy
        elif 'https://www.lemondeinformatique.fr/' in url:
            return BigScraper.scrap_lemondeinformatique(url)
        elif 'https://www.erudit.org/fr/' in url:
            return BigScraper.scrap_erudit(url)
        elif 'https://citoyen-ne-s-de-marseille.fr/' in url:
            return BigScraper.scrap_citmar(url)
        elif 'https://www.digitalrecruiters.com/' in url:
            return BigScraper.scrap_digitrec(url)
        elif 'https://hellofuture.orange.com/' in url:
            return BigScraper.scrap_hellofuture(url)
        elif 'https://www.silicon.fr/' in url:
            return BigScraper.scrap_silicon(url)
        #Sibel
        elif 'https://www.riskinsight-wavestone.com/' in url:
            return BigScraper.scrap_riskinsight(url)
        elif 'https://www.parlonsrh.com/' in url:
            return BigScraper.scrap_parlonsrh(url)
        elif 'https://www.inserm.fr/' in url:
            return BigScraper.scrap_inserm(url)
        elif 'https://www.lemonde.fr/' in url:
            return BigScraper.scrap_lemonde(url)


        return None

    def scrap(self, url):
        row = BigScraper.assign_scraper(url)
        self.add_row(row)
        return row

In [11]:
#     cols = ['art_content', 'art_content_html', 'art_published_datetime', 'art_lang', 'art_title',
#             'art_url', 'src_name', 'src_type', 'src_url', 'art_img', 'art_auth', 'art_tag']

In [12]:
BG = BigScraper()

In [13]:
row = BG.scrap('https://changethework.com/chatbot-rh-recrutement/')

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.fnccr.asso.fr/article/big-data-territorial-publication-de-letude-de-la-fnccr/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.cnil.fr/fr/video-le-youtubeur-cookie-connecte-repond-vos-questions-sur-larrivee-du-rgpd")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.journaldunet.com/solutions/dsi/1496655-la-cybersecurite-est-une-question-informationnelle/")

In [None]:
BG.df

In [None]:
row = BG.scrap('http://sabbar.fr/management/le-management-strategique-et-le-management-operationnel/#:~:text=Le%20management%20op%C3%A9rationnel%20correspond%20aux,pour%20atteindre%20les%20objectifs%20fix%C3%A9s.')

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.cadre-dirigeant-magazine.com/manager/la-recherche-operationnelle-un-formidable-outil-daide-a-la-decision/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.sap.com/france/products/erp-financial-management/grc.html")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.data.gouv.fr/fr/datasets/repertoire-national-des-elus-1/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://blockchainfrance.net/decouvrir-la-blockchain/c-est-quoi-la-blockchain/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.erudit.org/fr/revues/ateliers/2019-v14-n2-ateliers05462/1071130ar/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://citoyen-ne-s-de-marseille.fr/encore-un-immeuble-prive-finance-par-la-metropole/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.digitalrecruiters.com/blog/comment-la-crise-ravive-quete-de-sens-au-travail.html")

In [None]:
BG.df

In [None]:
row = BG.scrap('https://www.theinnovation.eu/comment-tuer-linnovation-avec-lanalyse-financiere/45')

In [None]:
BG.df

In [None]:
row = BG.scrap('https://www.parlonsrh.com/raisons-utiliser-lintelligence-artificielle-dans-gestion-gpec/')

In [None]:
BG.df

In [None]:
row = BG.scrap("https://hellofuture.orange.com/fr/hologramme-quatre-exemples-dune-technologie-revolutionnaire/")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.lebigdata.fr/base-de-donnees")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.silicon.fr/emploi-it-recrutements-2021-356166.html")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.inserm.fr/actualites-et-evenements/actualites/ondes-electromagnetiques-faut-il-craindre-5g")

In [None]:
BG.df

In [None]:
row = BG.scrap("https://www.zdnet.fr/actualites/la-data-est-notre-or-noir-mais-quel-est-son-moteur-39881697.htm")

In [None]:
BG.df

In [6]:
row = BG.scrap("https://www.usine-digitale.fr/article/le-specialiste-du-jeu-video-niantic-acquiert-la-plateforme-de-jeu-social-mayhem.N1046614")

In [7]:
BG.df

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag
0,\n\n\nLe spécialiste du jeu vidéo Niantic acqu...,"[\n, [\n, [], \n, [Le spécialiste du jeu vidéo...",2021-01-07,fr,Le spécialiste du jeu vidéo Niantic acquiert l...,https://www.usine-digitale.fr/article/le-speci...,L'Usine Digitale,xpath_source,https://www.usine-digitale.fr/,https://www.usine-digitale.fr/mediatheque/4/4/...,Aude Chardenon,"[Jeux Video, Entertainment, Loisirs numériques]"


In [8]:
row = BG.scrap("https://www.myrhline.com/actualite-rh/de-la-gpec-et-au-workforce-planning-les-5-evolutions-a-connaitre.html")

In [9]:
BG.df

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag
0,\n\n\nLe spécialiste du jeu vidéo Niantic acqu...,"[\n, [\n, [], \n, [Le spécialiste du jeu vidéo...",2021-01-07,fr,Le spécialiste du jeu vidéo Niantic acquiert l...,https://www.usine-digitale.fr/article/le-speci...,L'Usine Digitale,xpath_source,https://www.usine-digitale.fr/,https://www.usine-digitale.fr/mediatheque/4/4/...,Aude Chardenon,"[Jeux Video, Entertainment, Loisirs numériques]"
1,"Historiquement, la GPEC est un exercice mené d...","[[ , [<i aria-hidden=""true"" class=""fa fa-share...",2020-10-27,fr,De la GPEC et au Workforce Planning : Les 5 év...,https://www.myrhline.com/actualite-rh/de-la-gp...,myrhline.com | Actualité RH et tendances des R...,xpath_source,https://www.myrhline.com/,https://www.myrhline.com/wp-content/uploads/20...,Christophe PATTE,"[Article sponsorisé, Digitalisation RH, SIRH, ..."


In [14]:
row = BG.scrap("https://www.lemonde.fr/pixels/article/2020/09/16/donnees-de-sante-nouveau-recours-contre-le-health-data-hub-devant-le-conseil-d-etat_6052464_4408996.html")

In [15]:
BG.df

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag,src_img
0,"Les chatbots, ou agents relationnels, sont dep...","[\n, [[Les , <a href=""https://changethework.co...",2017-11-28,fr,Chatbot RH : l'assistant fun du recrutement,https://changethework.com/chatbot-rh-recrutement/,Change the work,xpath_source,https://changethework.com/,https://changethework.com/wp-content/uploads/2...,"[Léo Bernard, Aurélien Leleux]",,
1,"Le Health Data Hub, lancé en décembre dern...","[[ Le Health Data Hub, lancé en décembre de...",2020-09-16,fr,Données de santé: nouveau recours contre le He...,https://www.lemonde.fr/pixels/article/2020/09/...,Le Monde,xpath_source,https://www.lemonde.fr/,,no_data,no_data,no_data
