In [1]:
import requests
import csv
import pandas as pd
import numpy as np
import datetime
from textblob import TextBlob 
from bs4 import BeautifulSoup
import re
import time

In [2]:
url = "https://www.usine-digitale.fr/article/le-specialiste-du-jeu-video-niantic-acquiert-la-plateforme-de-jeu-social-mayhem.N1046614"

In [3]:
df= pd.DataFrame(columns=['art_content','art_content_html','art_published_datetime','art_lang','art_title','art_url','src_name','src_type','src_url','art_img','art_auth','art_tag']  )
df

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag


In [4]:
dummy_date = datetime.date(2019, 12, 31)

In [5]:
"""
In: url of an article
Out: url of the website
"""

def get_base_url(url):
    for val in re.finditer("(\w)+://[^/]+", url):
        return val.group(0)   
    
"""
In: url of the website
Out: list of the featured themes
"""

def find_themes(url, nb_themes = 6):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    themes = soup.find_all("a", {"class":"thematics__links"})
    theme_list = []
    for i in range(nb_themes):
        theme_list.append(url + str(themes[i]["href"])) 
    return theme_list

In [6]:
"""
In: url of a page with several articles
Out: list of the featured articles on the page
"""

def find_articles_on_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all("a", {"class":"contenu"})
    article_list = []
    for article in articles:
        article_list.append(article["href"])
    return article_list

In [7]:
"""
In: url of a page with several articles
Out: number of the last available page
"""

def last_number(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    pages = soup.find_all("li", {"class": "isNoMobile"})
    page_list = []
    for page in pages:
        page_number = page.find("a", {"class": "num"})
        if page_number is not None:
            page_number = page_number.text
            page_list.append(page_number)
    return int(page_list[-1])

In [8]:
"""
In: url of a page with several articles
Out: date of the 1st article on the page
"""

def first_art_date(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    article_dates = soup.find_all("span", {"class": "dateEtiquette2"})
    
    dates = []
    for article_date in article_dates:
        dates.append(article_date.text)
    
    return datetime.datetime.strptime(dates[0], '%d/%m/%Y - %HH%M').date()

In [9]:
"""
In: url of a page with several articles, date after which the articles have to be scraped
Out: list of pages matching the date requirement
"""

def find_all_pages(url, date):
    url_list = [url]
    last_page = last_number(url)
    i = 0 
    datum = first_art_date(url_list[i])
    
    while datum > date and i+2 <= last_page:
        new_url = url + str(i+2)+"/"
        url_list.append(new_url)
        i += 1
        datum = first_art_date(url_list[i])
        
    return url_list

In [10]:
"""
In: url of a page with several articles, date after which the articles have to be scraped
Out: list of all pages of all themes matching the date requirement
"""

def find_all_pages_all_themes(url, date):
    themes = find_themes(get_base_url(url), nb_themes = 6)
    apat = []
    for theme in themes:
        apat += find_all_pages(theme, date)
    return apat

In [11]:
"""
In: url of a page with several articles, date after which the articles have to be scraped
Out: list of all articles on all pages of all themes matching the date requirement
"""

def all_recent_articles(url, date):
    url_list = find_all_pages_all_themes(url, date)
    total_pages = len(url_list)
    article_list = []
    i = 1
    for element in url_list:
        article_list += find_articles_on_page(url)
        print("page " + str(i) + '/' + str(total_pages))
        i += 1
    return article_list

In [12]:
""" 
In: url of an article
Out: a tuple of strings containing the scraped data
"""

def get_all(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    #retrieval of the html content
    art_content_html = soup.find("article", {"class":"contenuArticle"})
    
    if art_content_html is None:
        art_content_html = "no_data"
        
    #retrieval of the article content    
    art_content = art_content_html.text
        
    if art_content is None:
        art_content = "no_data"   
    
    #retrieval of the publication/modification date
    art_published_datetime = soup.find("time", {"class": "dateEtiquette3"})
    
    if art_published_datetime is None:
        art_published_datetime = datetime.datetime.now()
    else:    
        art_published_datetime = art_published_datetime["datetime"]
        art_published_datetime = datetime.datetime.strptime(art_published_datetime, '%Y-%m-%dT%H:%M').date()

    #retrieval of the language
    art_lang = soup.find("meta", {"property": "og:locale"})
    
    if art_lang is None:
        art_lang = "no_data"
    else:
        art_lang = art_lang["content"]
    
    #retrieval of the title
    art_title = soup.find("meta", {"property": "og:title"}) 
    
    if art_title is None:
        art_title = "no_data"
    else:
        art_title = art_title["content"]
        
    #retrieval of the article url   
    art_url = soup.find("meta", {"property": "og:url"})
    
    if art_url is None:
        art_url = "no_data"
    else:    
        art_url = art_url["content"]
    
    #retrieval of the website name
    src_name = soup.find("meta", {"name": "ipd:siteName"})
    
    if src_name is None:
        src_name = "no_data"
    else:
        src_name = src_name["content"]
    
    #retrieval of the source type
    src_type = "xpath_source" #default value
    
    #retrieval of the source url
    src_url = soup.find("meta", {"property": "og:site_name"})
    
    if src_url is None:
        src_url = "no_data"
    else:    
        src_url = src_url["content"]
    
    #retrieval of the article image
    art_img = soup.find("meta", {"property": "og:image"})
    
    if art_img is None:
        art_img = "no_data" 
    else:
        art_img = art_img["content"]
    
    #retrieval of the article author
    art_auth = soup.find("a", {"class": "nomAuteur"})
    
    if art_auth is None:   
        art_auth = "no_data"
    else:
        art_auth = art_auth.text
    
    #retrieval of the article tags
    art_tags = soup.find_all("a", {"rel": "tag"})    
    if art_tags is None:
        art_tag = "no_data"
    else:
        art_tag = []
        for tag in art_tags:
            art_tag.append(str(tag.text))


    return art_content_html, art_content, art_published_datetime, art_lang, art_title,\
        art_url, src_name, src_type, src_url, art_img, art_auth, art_tag

In [13]:
"""
In: list of urls of articles
Out: dataframe with all scraped articles
"""

def scrap_all(list_urls):
    i = 1
    total_articles = len(list_urls)
    df= pd.DataFrame(columns=['art_content','art_content_html','art_published_datetime',\
                              'art_lang','art_title','art_url','src_name','src_type','src_url',\
                              'art_img','art_auth','art_tag'])
    for url in list_urls:
        df.loc[i] = get_all(url)
        print("article " + str(i) + '/' + str(total_articles))
        i += 1

    return df  

In [14]:
"""
In: url of an article, date after which the articles have to be scraped
Out: dataframe with all scraped articles
"""

def do_the_thing(url, date):
    return scrap_all(all_recent_articles(url, date))

In [15]:
df2 = do_the_thing(url, dummy_date)

page 1/89
page 2/89
page 3/89
page 4/89
page 5/89
page 6/89
page 7/89
page 8/89
page 9/89
page 10/89
page 11/89
page 12/89
page 13/89
page 14/89
page 15/89
page 16/89
page 17/89
page 18/89
page 19/89
page 20/89
page 21/89
page 22/89
page 23/89
page 24/89
page 25/89
page 26/89
page 27/89
page 28/89
page 29/89
page 30/89
page 31/89
page 32/89
page 33/89
page 34/89
page 35/89
page 36/89
page 37/89
page 38/89
page 39/89
page 40/89
page 41/89
page 42/89
page 43/89
page 44/89
page 45/89
page 46/89
page 47/89
page 48/89
page 49/89
page 50/89
page 51/89
page 52/89
page 53/89
page 54/89
page 55/89
page 56/89
page 57/89
page 58/89
page 59/89
page 60/89
page 61/89
page 62/89
page 63/89
page 64/89
page 65/89
page 66/89
page 67/89
page 68/89
page 69/89
page 70/89
page 71/89
page 72/89
page 73/89
page 74/89
page 75/89
page 76/89
page 77/89
page 78/89
page 79/89
page 80/89
page 81/89
page 82/89
page 83/89
page 84/89
page 85/89
page 86/89
page 87/89
page 88/89
page 89/89
article 1/1602
article 2/1602


article 433/1602
article 434/1602
article 435/1602
article 436/1602
article 437/1602
article 438/1602
article 439/1602
article 440/1602
article 441/1602
article 442/1602
article 443/1602
article 444/1602
article 445/1602
article 446/1602
article 447/1602
article 448/1602
article 449/1602
article 450/1602
article 451/1602
article 452/1602
article 453/1602
article 454/1602
article 455/1602
article 456/1602
article 457/1602
article 458/1602
article 459/1602
article 460/1602
article 461/1602
article 462/1602
article 463/1602
article 464/1602
article 465/1602
article 466/1602
article 467/1602
article 468/1602
article 469/1602
article 470/1602
article 471/1602
article 472/1602
article 473/1602
article 474/1602
article 475/1602
article 476/1602
article 477/1602
article 478/1602
article 479/1602
article 480/1602
article 481/1602
article 482/1602
article 483/1602
article 484/1602
article 485/1602
article 486/1602
article 487/1602
article 488/1602
article 489/1602
article 490/1602
article 491/16

article 915/1602
article 916/1602
article 917/1602
article 918/1602
article 919/1602
article 920/1602
article 921/1602
article 922/1602
article 923/1602
article 924/1602
article 925/1602
article 926/1602
article 927/1602
article 928/1602
article 929/1602
article 930/1602
article 931/1602
article 932/1602
article 933/1602
article 934/1602
article 935/1602
article 936/1602
article 937/1602
article 938/1602
article 939/1602
article 940/1602
article 941/1602
article 942/1602
article 943/1602
article 944/1602
article 945/1602
article 946/1602
article 947/1602
article 948/1602
article 949/1602
article 950/1602
article 951/1602
article 952/1602
article 953/1602
article 954/1602
article 955/1602
article 956/1602
article 957/1602
article 958/1602
article 959/1602
article 960/1602
article 961/1602
article 962/1602
article 963/1602
article 964/1602
article 965/1602
article 966/1602
article 967/1602
article 968/1602
article 969/1602
article 970/1602
article 971/1602
article 972/1602
article 973/16

article 1375/1602
article 1376/1602
article 1377/1602
article 1378/1602
article 1379/1602
article 1380/1602
article 1381/1602
article 1382/1602
article 1383/1602
article 1384/1602
article 1385/1602
article 1386/1602
article 1387/1602
article 1388/1602
article 1389/1602
article 1390/1602
article 1391/1602
article 1392/1602
article 1393/1602
article 1394/1602
article 1395/1602
article 1396/1602
article 1397/1602
article 1398/1602
article 1399/1602
article 1400/1602
article 1401/1602
article 1402/1602
article 1403/1602
article 1404/1602
article 1405/1602
article 1406/1602
article 1407/1602
article 1408/1602
article 1409/1602
article 1410/1602
article 1411/1602
article 1412/1602
article 1413/1602
article 1414/1602
article 1415/1602
article 1416/1602
article 1417/1602
article 1418/1602
article 1419/1602
article 1420/1602
article 1421/1602
article 1422/1602
article 1423/1602
article 1424/1602
article 1425/1602
article 1426/1602
article 1427/1602
article 1428/1602
article 1429/1602
article 14

In [16]:
df2

Unnamed: 0,art_content,art_content_html,art_published_datetime,art_lang,art_title,art_url,src_name,src_type,src_url,art_img,art_auth,art_tag
1,"[\n, [\n, [], \n, [Niantic acquiert la start-u...","\n\n\nNiantic acquiert la start-up 6D.ai, spéc...",2020-03-31,fr_FR,"Niantic acquiert la start-up 6D.ai, spécialist...",https://www.usine-digitale.fr/article/niantic-...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/6/1/...,Arthur Le Denn,"[Acquisition, Réalité augmentée, Start-up]"
2,"[\n, [\n, [], \n, [Niantic lève 245 millions d...",\n\n\nNiantic lève 245 millions de dollars pou...,2019-01-21,fr_FR,Niantic lève 245 millions de dollars pour déve...,https://www.usine-digitale.fr/article/niantic-...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/5/5/...,Julien Bergounhoux,"[Réalité augmentée, Start-up, Innovation]"
3,"[\n, [\n, [], \n, [Twitch, Mixer, YouTube... L...","\n\n\nTwitch, Mixer, YouTube... La guerre du s...",2019-11-21,fr_FR,"Twitch, Mixer, YouTube... La guerre du streami...",https://www.usine-digitale.fr/article/twitch-m...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/9/6/...,Fabrice Deblock,"[Jeux Video, Streaming, Twitch]"
4,"[\n, [\n, [], \n, [[CES 2021] Comment Business...",\n\n\n[CES 2021] Comment Business France a rep...,2021-01-08,fr_FR,[CES 2021] Comment Business France a repensé s...,https://www.usine-digitale.fr/article/ces-2021...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/9/4/...,Léna Corot,"[CES 2021, IoT, Start-up]"
5,"[\n, [\n, [], \n, [L'autorité de la concurrenc...",\n\n\nL'autorité de la concurrence britannique...,2021-01-08,fr_FR,L'autorité de la concurrence britannique va en...,https://www.usine-digitale.fr/article/l-autori...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/4/8/...,Alice Vitard,"[Google, Marketing, Gestion des données]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1598,"[\n, [\n, [], \n, [\n, <span class=""navDossier...",\n\n\n\n\n La nécessaire prise de conscience p...,2020-12-14,fr_FR,La nécessaire prise de conscience par les vill...,https://www.usine-digitale.fr/article/la-neces...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/1/1/...,no_data,"[Villes du futur, VILLE DU FUTUR, VILLE SÛRE !..."
1599,"[\n, [\n, [], \n, [\n, <span class=""navDossier...",\n\n\n\n\n Surveillance : Les smart cities ne ...,2020-12-14,fr_FR,Surveillance : Les smart cities ne doivent pas...,https://www.usine-digitale.fr/article/surveill...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/1/7/...,no_data,"[Villes du futur, Intelligence artificielle, B..."
1600,"[\n, [\n, [], \n, [Looking Glass Factory propo...",\n\n\nLooking Glass Factory propose aux partic...,2021-01-08,fr_FR,Looking Glass Factory propose aux particuliers...,https://www.usine-digitale.fr/article/looking-...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/4/9/...,Léna Corot,"[3D, Hardware, Start-up]"
1601,"[\n, [\n, [], \n, [Volkswagen démocratise l'af...",\n\n\nVolkswagen démocratise l'affichage en ré...,2021-01-05,fr_FR,Volkswagen démocratise l'affichage en réalité ...,https://www.usine-digitale.fr/article/volkswag...,L'Usine Digitale,xpath_source,usine-digitale.fr,https://www.usine-digitale.fr/mediatheque/9/7/...,Léna Corot,"[Automobile, Réalité augmentée, Volkswagen]"
