In [109]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import json
import datetime
import re

In [110]:
df_scrap = pd.DataFrame(columns=['art_content','art_content_html','art_published_datetime','art_lang','art_title','art_url','src_name','src_type','src_url','art_img','art_auth','art_tag']  )
df_scrap

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag


In [111]:

url = "https://blockchainfrance.net/decouvrir-la-blockchain/c-est-quoi-la-blockchain/"


In [112]:
def getSoup(url):
    
    response = requests.get(url)
    print("statut de la reponse ", response.status_code)
    print('\n')
    soup = bs(response.text, 'html.parser')
    print(soup)
    
    return soup

In [113]:
soup = getSoup(url)

statut de la reponse  200


<!DOCTYPE html>

<html lang="fr-FR">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<title>Qu’est-ce que la blockchain ?</title>
<script type="text/javascript">
  WebFontConfig = {"google":{"families":["Poppins:r:latin,latin-ext","Poppins:r,i,b,bi:latin,latin-ext"]}};
  (function() {
    var wf = document.createElement('script');
    wf.src = 'https://s0.wp.com/wp-content/plugins/custom-fonts/js/webfont.js';
    wf.type = 'text/javascript';
    wf.async = 'true';
    var s = document.getElementsByTagName('script')[0];
    s.parentNode.insertBefore(wf, s);
	})();
</script><style id="jetpack-custom-fonts-css">.wf-active body, .wf-active button, .wf-active input, .wf-active select, .wf-active textarea{font-family:"Poppins",sans-serif}.wf-active h1, .wf-active h2, .wf-active h3, .wf-active h4, .wf-active h5, .wf-active h6{font-family:"Poppins",sans-serif;font-weight:400;font-style:normal}.wf-active h1{font-st

In [114]:
def getContent(soup):
    
    """ 
        Extracting the content of the article, whether in html or not
    """
    art_content_html = soup.find_all('div', class_="site-content")
    art_content = " ".join([x.text for x in art_content_html])
    #art_content = art_content.replace("\n", "").replace("\xa0", "")

    return art_content, art_content_html

In [115]:
getContent(soup)

("\n\n\nQu’est-ce que la blockchain\xa0? \n\n\n\n\n\n\nDéfinition et explication\n\nLa blockchain est une technologie de stockage et de transmission d’informations, transparente, sécurisée, et fonctionnant sans organe central de contrôle (définition de Blockchain France). Elle est la technologie au coeur\xa0 du Web Decentralisé et de son corollaire, la finance décentralisée. \nPar extension, une blockchain constitue une base de données qui contient l’historique de tous les échanges effectués entre ses utilisateurs depuis sa création. Cette base de données est sécurisée et distribuée : elle est partagée par ses différents utilisateurs, sans intermédiaire, ce qui permet à chacun de vérifier la validité de la chaîne.Il existe des blockchains publiques, ouvertes à tous, et des blockchains privées, dont l’accès et l’utilisation sont limitées à un certain nombre d’acteurs.\n\nUne blockchain publique peut donc être assimilée à un grand livre comptable public, anonyme et infalsifiable. Comme l

In [116]:
def getDate(soup):
    
    """ 
        Extracting the date of publishing / updating of the article
    """
    # Getting date of a video published on the article because it's the only date i found that can fit
    date = soup.find('meta', property='article:modified_time').get('content')
    art_extract_datetime = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%S%z').date()
    
    return art_extract_datetime

In [117]:
getDate(soup)

datetime.date(2020, 9, 4)

In [118]:
def getLang(soup):
    
    """ 
        Extracting the language the article is written in
    """
    
    art_lang = soup.find('meta', attrs={'property':"og:locale"}).get('content')

    return art_lang

In [119]:
def getTitle(soup):
    
    """ 
        Extracting the title of the article
    """
    
    art_title = soup.find('meta', property = "og:title").get('content')
    
    return art_title

In [120]:
def getUrl(soup):
    
    """ 
        Extracting the url of the article
    """
    
    art_url = soup.find('meta', property = "og:url").get('content')
    
    return art_url

In [121]:
def getSourceName(soup):
    
    """ 
        Extracting article's source name
    """
    
    #src_name = json.loads(soup.find('script', id = 'wpcom-actionbar-placeholder-js-extra')).get('var').get_text()['siteName']
    src_name = getSourceUrl(soup).replace('https://','').replace('/','')
    
    return src_name

In [122]:
getSourceName(soup)

'blockchainfrance.net'

In [123]:
def getSourceType(soup):
    
    """ 
        Extracting article's source type
    """
    
    src_type = "xpath_source"
    
    return src_type

In [124]:
def getSourceUrl(soup):
    
    """ 
        Article's source url
    """
    for val in re.finditer("(\w)+://[^/]+/", url):
        return val.group(0)

In [125]:
getSourceUrl(soup)

'https://blockchainfrance.net/'

In [126]:
def getImg(soup):
    
    """ 
        Extracting article's image
    """
    
    src_img = soup.find('meta', property = "og:image").get('content')
    
    return src_img

In [127]:
def getArtAuth(soup):
    
    """ 
        Extracting article's author
    """
    
    #art_auth = soup.find('meta', attrs={'name':"twitter:data1"}).get('content')
    art_auth = "no_data"
    
    return art_auth

In [128]:
def getTag(soup):
    
    """ 
        Extracting article's tags if they exist
    """
    
    art_tag = soup.find('meta', attrs={'name':"keywords"})
    if art_tag != None:
        art_tag = art_tag.get('content')
        
    return art_tag

In [129]:
def scrapPage(soup):
    
    """ 
        Function combining the previous ones and stocking the list of results in a dataframe line
    """
    
    art_content, art_content_html = getContent(soup)
    
    art_extract_datetime = getDate(soup)
    
    art_lang = getLang(soup)
    
    art_title = getTitle(soup)
    
    art_url = getUrl(soup)
    
    src_name = getSourceName(soup)
    
    src_type = "xpath_source"
    
    src_url = getSourceUrl(soup)
    
    src_img = getImg(soup) 
    
    art_auth = getArtAuth(soup)
    
    art_tag = getTag(soup)
    
    return art_content, art_content_html, art_extract_datetime, art_lang, art_title, art_url, src_name, src_type, src_url, src_img, art_auth, art_tag

In [130]:
df_scrap.loc[1]=scrapPage(soup)

In [131]:
df_scrap

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag
1,\n\n\nQu’est-ce que la blockchain ? \n\n\n\n\n...,"[[\n, [\n, <div class=""container th-stack--sm""...",2020-09-04,fr_FR,Qu’est-ce que la blockchain ?,https://blockchainfrance.net/decouvrir-la-bloc...,blockchainfrance.net,xpath_source,https://blockchainfrance.net/,https://blockchainfrance.files.wordpress.com/2...,no_data,
