In [709]:
from bs4 import BeautifulSoup as bs
from textblob import TextBlob
import pandas as pd
import numpy as np
import datetime
import requests
import json
import re

In [656]:
# Empty result table: one row per scraped article, one column per extracted
# field (article fields are prefixed "art_", source fields "src_").
df_scrap = pd.DataFrame(
    columns=[
        'art_content',
        'art_content_html',
        'art_extract_datetime',
        'art_lang',
        'art_title',
        'art_url',
        'src_name',
        'src_type',
        'src_url',
        'src_img',
        'art_auth',
        'art_tag',
    ]
)
df_scrap

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag


In [710]:
# Address of the page to scrape; the functions below that do not receive a URL
# argument read this notebook-level variable.
# Alternative target kept for reference:
#url="https://www.cnrtl.fr/lexicographie/ontologie"
url = "https://www.linguee.fr/anglais-francais/traduction/blob.html"

In [711]:
def getSoup(url, timeout=10, verbose=True):
    """Download a page and parse it with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block forever on an unresponsive host.
        verbose: When true (the default, which keeps the historical
            behaviour) print the HTTP status code followed by the whole
            parsed document. Pass ``False`` to keep the cell output small.

    Returns:
        The parsed document. The status code is not checked, so the body of
        an error page is parsed like any other response.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)

    if verbose:
        print("statut de la reponse ", response.status_code)
        print('\n')

    soup = bs(response.text, 'html.parser')

    if verbose:
        # Dumps the complete HTML; useful once, noisy afterwards.
        print(soup)

    return soup

In [712]:
# Fetch and parse the page at `url`; getSoup also prints the HTTP status code
# and the parsed document, which is the output shown below this cell.
soup = getSoup(url)

statut de la reponse  200


<!DOCTYPE html>

<html il_fr="" il_fr'="">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content='De très nombreux exemples de phrases traduites contenant "blob" – Dictionnaire français-anglais et moteur de recherche de traductions françaises.' name="description"/>
<meta content="blob, Linguee, anglais, dictionnaire, traductions, français, dictionnaire anglais, moteur de recherche, traduction" name="keywords"/>
<meta content="fr" name="language"/>
<meta content="fr" http-equiv="content-language"/>
<meta content="https://d1wigddrwdtsce.cloudfront.net/img5/ogimage.jpg" name="og:image"/>
<meta content="image/jpeg" property="og:image:type">
<meta content="1200" property="og:image:width">
<meta content="630" property="og:image:height">
<meta content="Linguee.fr" property="og:site_name"/>
<meta content="blob - Traduction française – Linguee" property="title"/>
<meta content="blob - Traduction française – Linguee" property="og:ti

In [660]:
def _densest_paragraph_block(soup):
    """Return the leaf <div> (one with no nested <div>) holding the most <p> tags."""
    leaf_divs = [
        div for div in soup.find_all('div')
        if div.find('p') and not div.find('div')
    ]
    if not leaf_divs:
        # Previously surfaced as an opaque error from np.argmax on an empty list.
        raise ValueError("no <div> containing <p> elements was found on the page")
    # max() keeps the first block on ties, like np.argmax did.
    return max(leaf_divs, key=lambda div: len(div.find_all('p')))


def get_content(url):
    """Download a page and extract its main textual content.

    The <article> element is used when it has at least one <p> among its
    direct children; otherwise the leaf <div> with the most <p> descendants
    is selected.

    Args:
        url: Address of the page to download.

    Returns:
        A tuple ``(content_html, content)``: the selected element, then the
        text of its direct children whose tag is in ``list_balises``, joined
        with newlines and with non-breaking spaces removed.

    Raises:
        ValueError: If neither a usable <article> nor a <div> containing <p>
            elements exists.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, timeout=10)
    # The notebook imports BeautifulSoup under the alias `bs`.
    soup = bs(response.text, 'html.parser')

    # Text-bearing tags kept when flattening the selected container.
    list_balises = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'strong', 'i', 'em', 'pre', 'mark', 'small', 'del', 's', 'ins', 'u', 'sub', 'sup', 'dfn', 'p']

    article = soup.find('article')
    if article is not None and 'p' in [tag.name for tag in article.contents]:
        content_html = article
    else:
        content_html = _densest_paragraph_block(soup)

    content = '\n'.join(
        tag.text for tag in content_html.contents if tag.name in list_balises
    ).replace('\xa0', '')
    return content_html, content


In [720]:
def getDate(soup):
    """Extract the publication / last-update date of the article.

    Sources are tried in this order: the ``article:modified_time`` meta tag,
    the ``article:published_time`` meta tag, then the text of a
    ``<time class="datetime">`` element.

    Args:
        soup: Parsed document exposing ``find``.

    Returns:
        A ``datetime.date``. Today's date is returned when none of the
        sources is present.

    Raises:
        ValueError: If a source is present but its value does not match
            ``%Y-%m-%dT%H:%M:%S%z``.
    """
    date_format = '%Y-%m-%dT%H:%M:%S%z'

    modified_meta = soup.find('meta', property='article:modified_time')
    if modified_meta:
        raw_date = modified_meta.get('content')
    else:
        # This branch used to call the misspelled `soup.fond` and always failed.
        published_meta = soup.find('meta', property='article:published_time')
        if published_meta:
            raw_date = published_meta.get('content')
        else:
            time_tag = soup.find('time', class_='datetime')
            if not time_tag:
                return datetime.date.today()
            raw_date = time_tag.text

    return datetime.datetime.strptime(raw_date, date_format).date()


In [718]:
def getLang(soup, content):
    """Detect the language of the article text.

    Args:
        soup: Parsed document; accepted for signature symmetry with the other
            extractors but not used here.
        content: Article text without HTML; converted with ``str()`` first.

    Returns:
        Whatever ``TextBlob.detect_language()`` returns for that text.
    """
    # NOTE(review): detect_language() appears to be deprecated or removed in
    # recent TextBlob releases - confirm against the installed version.
    return TextBlob(str(content)).detect_language()


In [719]:
# Smoke test: detect the language of the visible text of the page parsed above.
getLang(soup, soup.get_text())

'fr'

In [665]:
def getTitle(soup):
    """Return the article title.

    The ``og:title`` meta tag is preferred, then the ``<title>`` element;
    ``"no_data"`` is returned when neither is present.
    """
    og_title = soup.find('meta', property = "og:title")
    if og_title:
        return og_title.get('content')

    title_tag = soup.find('title')
    if title_tag:
        return title_tag.get_text()

    return "no_data"

In [667]:
def getUrl(soup, url):
    """Return the article URL.

    The ``og:url`` meta tag is preferred, then the ``<link rel="canonical">``
    element; the ``url`` argument is returned when the page declares neither.
    """
    og_url = soup.find('meta', property = "og:url")
    if og_url:
        return og_url.get('content')

    canonical = soup.find('link', rel = 'canonical')
    if canonical:
        return canonical.get('href')

    return url

In [669]:
def getSourceUrl(soup, url=None):
    """Return the root URL (``scheme://host/``) of the article's source.

    Args:
        soup: Parsed document exposing ``find``.
        url: Optional article address. When omitted, the address is taken
            from the page itself (``og:url`` meta tag, then
            ``<link rel="canonical">``) so that the result always describes
            the page held in ``soup``. The notebook-level ``url`` variable is
            only used as a last resort, which keeps old calls working.

    Returns:
        The root URL with a trailing slash, or ``None`` when no address is
        available or it contains no ``scheme://host`` part.
    """
    if url is None:
        og_url = soup.find('meta', property="og:url")
        canonical = soup.find('link', rel='canonical')
        if og_url and og_url.get('content'):
            url = og_url.get('content')
        elif canonical and canonical.get('href'):
            url = canonical.get('href')
        else:
            # Legacy fallback: this function used to read the global directly.
            url = globals().get('url')

    if not url:
        return None

    # Raw string so that \w is a regex class rather than a string escape; the
    # host stops at the first '/', '?' or '#', so a missing trailing slash is fine.
    match = re.search(r"\w+://[^/?#]+", url)
    if match is None:
        return None
    return match.group(0) + "/"

In [703]:
def getSourceName(soup, url):
    """Return a short name for the article's source site.

    Args:
        soup: Parsed document exposing ``find``.
        url: Article address, used when the page has no usable
            ``og:site_name`` meta tag.

    Returns:
        The ``og:site_name`` content when present. Otherwise the host part of
        ``url`` with the known prefixes ("fr.", "www.", "www2.") removed from
        its start and the known suffixes (".org", ".fr", ".eu", ".net",
        ".com") removed from its end. ``url`` itself is returned when it is
        not a string or when nothing is left after stripping.
    """
    site_meta = soup.find('meta', property="og:site_name")
    if site_meta and site_meta.get('content'):
        return site_meta.get('content')

    if not isinstance(url, str):
        # Nothing to parse; hand the value back as the old code did.
        return url

    # Drop the scheme if there is one, then everything from the first '/'.
    host = url.split("://", 1)[-1].split("/", 1)[0]
    if host.endswith('.'):
        host = host[:-1]

    prefixes = ("fr.", "www.", "www2.")
    suffixes = (".org", ".fr", ".eu", ".net", ".com")

    # Strip only at the edges: a plain str.replace also removed these tokens
    # from the middle of a name ("blog.network.com" became "blogwork").
    changed = True
    while changed:
        changed = False
        for prefix in prefixes:
            if host.startswith(prefix):
                host = host[len(prefix):]
                changed = True
        for suffix in suffixes:
            if host.endswith(suffix):
                host = host[:-len(suffix)]
                changed = True

    return host or url

In [673]:
def getSourceType(soup):
    """Return the source type of the article.

    The value is the fixed label ``"xpath_source"``; ``soup`` is accepted for
    signature symmetry with the other extractors and is not inspected.
    """
    return "xpath_source"

In [674]:
def getImg(soup):
    """Return the article image URL.

    The value is the ``content`` of the ``og:image`` meta tag (looked up by
    its ``property`` attribute), or ``"no_data"`` when the tag is absent.
    """
    og_image = soup.find('meta', property = "og:image")
    return og_image.get('content') if og_image else "no_data"

In [676]:
def getArtAuth(soup):
    
    """ 
        Extracting article's author
    """
    if soup.find('meta', attrs={'name':"twitter:data1"}) :
        art_auth = soup.find('meta', attrs={'name':"twitter:data1"}).get('content')
        
    elif soup.find('div', class_ = "td-post-author-name") :
        art_auth = find('div', class_ = "td-post-author-name").text
        
    elif soup.find("a", {"class": "nomAuteur"}):
        art_auth = soup.find("a", {"class": "nomAuteur"}).get('content')
        
    else :
        art_auth = "no_data"
    
    return art_auth

In [678]:
def getTag(soup):
    
    ##keywords or tags
    """
        Extracting article's tags if they exist
    """
    if soup.find('meta', attrs={'name':"keywords"}):
        art_tag = soup.find('meta', attrs={'name':"keywords"}).get('content')
        
    elif soup.find_all("a", {"rel": "category tag"}) :
        art_tag = [el.get_text() for el in soup.find_all("a", {"rel": "category tag"})]
        
    else:
        art_tag = "no_data"
        
    return art_tag

In [705]:
def scrapPage(soup, page_url=None):
    """Run every extractor on a page and return the values of one result row.

    Args:
        soup: Parsed document of the article.
        page_url: Address of the article. When omitted, the notebook-level
            ``url`` variable is used, as before.

    Returns:
        A 12-tuple in the column order of ``df_scrap``: art_content,
        art_content_html, art_extract_datetime, art_lang, art_title, art_url,
        src_name, src_type, src_url, src_img, art_auth, art_tag.
    """
    if page_url is None:
        # Historical behaviour: the address comes from the global `url`.
        page_url = url

    # get_content downloads the page itself and returns (html container,
    # plain text) in that order. The previous call targeted `getContent`,
    # which is not defined in this notebook.
    art_content_html, art_content = get_content(page_url)

    art_extract_datetime = getDate(soup)
    art_lang = getLang(soup, art_content)
    art_title = getTitle(soup)
    art_url = getUrl(soup, page_url)

    src_name = getSourceName(soup, page_url)
    src_type = getSourceType(soup)
    src_url = getSourceUrl(soup)
    src_img = getImg(soup)

    art_auth = getArtAuth(soup)
    art_tag = getTag(soup)

    return (
        art_content,
        art_content_html,
        art_extract_datetime,
        art_lang,
        art_title,
        art_url,
        src_name,
        src_type,
        src_url,
        src_img,
        art_auth,
        art_tag,
    )

In [706]:
# Store the scraped fields as the row labelled 1 of df_scrap; re-running this
# cell overwrites that same row instead of appending a new one.
df_scrap.loc[1]=scrapPage(soup)

In [708]:
# Display the scraping results collected so far.
df_scrap

Unnamed: 0,art_content,art_content_html,art_extract_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,src_img,art_auth,art_tag
1,\n\nLe World Wide Web [ˌwɝld waɪd ˈwɛb][a] (li...,"<div class=""mw-parser-output""><div class=""band...",2021-01-11,fr,World Wide Web — Wikipédia,https://fr.wikipedia.org/wiki/World_Wide_Web,geeksforgeeks,xpath_source,https://www.geeksforgeeks.org/,https://upload.wikimedia.org/wikipedia/commons...,no_data,no_data
