In [15]:
import os  # helper functions like check file exists
import datetime  # automatic file name
import requests  # the following imports are common web scraping bundle
from urllib.request import urlopen  # standard python module
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from collections import defaultdict
import re
from urllib.error import URLError
from tqdm import tqdm
import pickle
import bz2
import pandas as pd
from collections import Counter
from urllib.parse import urlparse, unquote

from datetime import datetime, timedelta


In [16]:
def extract_theme(link):
    try:
        theme_text = re.findall(r'.fr/.*?/', link)[0]
    except:
        pass
    else:
        return theme_text[4:-1]

def extract_name_from_https_link(link):
    """
    Extracts the domain name (name) from an HTTPS link.

    :param link: A string containing the HTTPS link.
    :return: The domain name (e.g., 'www.google.com') or None if no match is found.
    """
    # Regular expression to match the domain name in an HTTPS link
    match = re.search(r'https://([^/\s]+)', link)
    if match:
        return match.group(1)  # Return the domain name
    return None  # Return None if no match is found

def extract_subname(url):
    """Extracts the subname (article slug) from a given HTML link."""
    path = urlparse(url).path  # Extract the path from the URL
    filename = os.path.basename(path)  # Get the last part of the path
    subname = os.path.splitext(filename)[0]  # Remove the .html extension
    return unquote(subname)  # Decode any URL-encoded characters


def extract_clean_subname(url):
    subname = extract_subname(url)
    return re.sub(r'(_\d+)+$', '', subname)  # Remove all trailing underscores followed by numbers


def get_filename(filepath):
    """Extracts the filename from a given file path."""
    return os.path.basename(filepath)


def read_file(file_path):
    try:
        with open(file_path, 'r') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {e}"


def list_themes(links):
    themes = []
    for link in links:
        theme = extract_theme(link)
        if theme is not None:
            themes.append(theme)
    return themes


def write_links(path, links, year_fn):
    with open(os.path.join(path + "/lemonde_" + str(year_fn) + "_links.txt"), 'w', encoding="utf-8") as f:
        for link in links:
            f.write(link + "\n")


def write_to_file(filename, content):
    if os.path.exists(filename):
        with open(filename, 'a+', encoding="utf-8") as f:
            f.write(str(content))
    else:
        with open(filename, 'w', encoding="utf-8") as f:
            f.write(str(content))


def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    archive_links = {}
    for y in range(year_start, year_end + 1):
        dates = [str(d).zfill(2) + "-" + str(m).zfill(2) + "-" +
                    str(y) for m in range(month_start, month_end + 1) for d in
                    range(day_start, day_end + 1)]
        archive_links[y] = [
            "https://www.lemonde.fr/archives-du-monde/" + date + "/" for date in dates]
    return archive_links


def get_articles_links(archive_links):
    links_non_abonne = []
    for link in archive_links:
        try:
            html = urlopen(link)
        except HTTPError as e:
            print("url not valid", link)
        else:
            soup = BeautifulSoup(html, "html.parser")
            news = soup.find_all(class_="teaser")
            # condition here : if no span icon__premium (abonnes)
            for item in news:
                if not item.find('span', {'class': 'icon__premium'}):
                    l_article = item.find('a')['href']
                    # en-direct = video
                    if 'en-direct' not in l_article:
                        links_non_abonne.append(l_article)
    return links_non_abonne


def classify_links(theme_list, link_list):
    dict_links = defaultdict(list)
    for theme in theme_list:
        theme_link = 'https://www.lemonde.fr/' + theme + '/article/'
        for link in link_list:
            if theme_link in link:
                dict_links[theme].append(link)
    return dict_links


def get_single_page(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        print("url not valid", url)
    else:
        soup = BeautifulSoup(html, "html.parser")
        text_title = soup.find('h1')
        text_body = soup.article.find_all(["p", "h2"], recursive=False)
        return (text_title, text_body)


def scrape_articles(dict_links):
    themes = dict_links.keys()
    for theme in themes:
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for i in tqdm(range(len(dict_links[theme]))):
            link = dict_links[theme][i]
            fn = extract_clean_subname(link)
            single_page = get_single_page(link)
            if single_page is not None:
                with open((os.path.join('corpus', theme, fn + '.txt')), 'w', encoding="utf-8") as f:
                    # f.write(dict_links[theme][i] + "\n" * 2)
                    f.write(single_page[0].get_text() + "\n")
                    for line in single_page[1]:
                        f.write(line.get_text() + "\n")


def cr_corpus_dict(path_corpus, n_files=1000):
    dict_corpus = defaultdict(list)
    themes = os.listdir(path_corpus)
    for theme in themes:
        counter = 0
        if not theme.startswith('.'):
            theme_directory = os.path.join(path_corpus, theme)
            for file in os.listdir(theme_directory):
                if counter < n_files:
                    path_file = os.path.join(theme_directory, file)
                    text = read_file(path_file)
                    dict_corpus["label"].append(theme)
                    dict_corpus["text"].append(text)
                counter += 1
    return dict_corpus


def create_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
    else:
        print("folder exists already")

In [17]:

# Get today's date
today = datetime.today()
yesterday = today - timedelta(days=1)

# Print the results

archive_links = create_archive_links(yesterday.year, today.year, 
                                        yesterday.month, today.month, 
                                        yesterday.day, today.day)
archive_links

{2025: ['https://www.lemonde.fr/archives-du-monde/09-02-2025/',
  'https://www.lemonde.fr/archives-du-monde/10-02-2025/']}

In [18]:
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)

folder exists already


In [19]:
article_links = {}
for year,links in archive_links.items():
    print("processing: ",year)
    article_links_list = get_articles_links(links)
    article_links[year] = article_links_list
    write_links(corpus_path,article_links_list,year)

processing:  2025


In [20]:
themes = []
for link_list in article_links.values():
    themes.extend(list_themes(link_list))
print(len(themes))
themes

24


['international',
 'pixels',
 'politique',
 'politique',
 'afrique',
 'sport',
 'les-decodeurs',
 'societe',
 'sport',
 'culture',
 'international',
 'politique',
 'societe',
 'international',
 'societe',
 'idees',
 'politique',
 'afrique',
 'sport',
 'afrique',
 'politique',
 'm-styles',
 'economie',
 'sport']

In [21]:
theme_stat = Counter(themes)
theme_top = []
for k,v in sorted(theme_stat.items(), key = lambda x:x[1], reverse=True):
    #if v > 700:
    theme_top.append((k, v))
print(theme_top)

[('politique', 5), ('sport', 4), ('international', 3), ('afrique', 3), ('societe', 3), ('pixels', 1), ('les-decodeurs', 1), ('culture', 1), ('idees', 1), ('m-styles', 1), ('economie', 1)]


In [22]:
all_links = []
for link_list in article_links.values():
    all_links.extend(link_list)

themes_top_five = [x[0] for x in theme_top]

themes_top_five_links = classify_links(themes_top_five,all_links)
themes_top_five_links

defaultdict(list,
            {'politique': ['https://www.lemonde.fr/politique/article/2025/02/09/gerald-darmanin-estime-que-pour-2027-il-faudra-bien-un-processus-de-selection-et-peut-etre-une-primaire-dans-le-camp-d-emmanuel-macron_6539082_823448.html',
              'https://www.lemonde.fr/politique/article/2025/02/09/gerald-darmanin-propose-d-accelerer-les-procedures-d-expulsion-en-supprimant-une-commission-ad-hoc_6539049_823448.html',
              'https://www.lemonde.fr/politique/article/2025/02/09/droit-du-sol-le-ministre-de-l-economie-eric-lombard-estime-qu-il-n-y-a-pas-de-raison-de-durcir-les-regles_6538701_823448.html',
              'https://www.lemonde.fr/politique/article/2025/02/10/le-maire-de-frejus-convoque-le-30-septembre-au-tribunal-pour-prise-illegale-d-interets_6540064_823448.html',
              'https://www.lemonde.fr/politique/article/2025/02/10/a-boulogne-billancourt-la-candidate-des-republicains-elisabeth-de-maistre-elue-deputee_6539760_823448.html'],
         

In [23]:
first_links = next(iter(themes_top_five_links.values()))  # Gets the first value (list)
first_links[0]

'https://www.lemonde.fr/politique/article/2025/02/09/gerald-darmanin-estime-que-pour-2027-il-faudra-bien-un-processus-de-selection-et-peut-etre-une-primaire-dans-le-camp-d-emmanuel-macron_6539082_823448.html'

In [24]:
url = first_links[0]
try:
    html = urlopen(url)
except HTTPError as e:
    print("url not valid", url)
else:
    soup = BeautifulSoup(html, "html.parser")
with open('soup.txt', 'w', encoding="utf-8") as f:
    f.write(str(soup))

In [25]:
tags = soup.find_all(['tags'], recursive=False)
tags

[]

In [26]:
paragraphs = soup.article.find_all(["p", "h2"], recursive=False)
paragraphs

[<p class="article__paragraph"><em>« Si jamais personne ne se dégage – c’est ma volonté que quelqu’un se dégage –, il faudra bien un processus de sélection. Ou alors on sera plusieurs candidats et donc on </em>[ne] <em>sera pas au second tour : on sera sans doute très contents d’être soi-même, mais on sera tout seul et les Français nous en voudront énormément »</em>, a déclaré dimanche 8 février Gérald Darmanin. Sur BFM-TV, l’actuel garde des sceaux répondait à des questions au sujet de l’élection présidentielle de 2027.</p>,
 <p class="article__paragraph">M. Darmanin a aussi estimé que, pour la prochaine présidentielle, <em>« l’important, c’est le projet, pour quoi faire »</em>. <em>« Après, il faudra peut-être qu’on réfléchisse à comment on sélectionne notre candidat parce qu’il n’en faut qu’un seul »</em>, a-t-il prévenu, <em>« sinon on va avoir M<sup>me</sup> Le Pen contre M. Mélenchon au second tour de l’élection présidentielle »</em>. <em>« Et ce sera M<sup>me</sup> Le Pen </em>[

In [27]:
create_folder('corpus')

scrape_articles(themes_top_five_links)


folder exists already
processing: politique


100%|██████████| 5/5 [00:01<00:00,  4.58it/s]


processing: sport


100%|██████████| 4/4 [00:00<00:00,  4.97it/s]


processing: international


100%|██████████| 3/3 [00:00<00:00,  4.37it/s]


processing: afrique


100%|██████████| 3/3 [00:00<00:00,  4.87it/s]


processing: societe


100%|██████████| 3/3 [00:00<00:00,  5.21it/s]


processing: pixels


100%|██████████| 1/1 [00:00<00:00,  5.05it/s]


processing: les-decodeurs


100%|██████████| 1/1 [00:00<00:00,  4.90it/s]


processing: culture


100%|██████████| 1/1 [00:00<00:00,  4.43it/s]


processing: idees


100%|██████████| 1/1 [00:00<00:00,  3.57it/s]


processing: m-styles


100%|██████████| 1/1 [00:00<00:00,  5.36it/s]


processing: economie


100%|██████████| 1/1 [00:00<00:00,  5.04it/s]


In [28]:
path = 'corpus'
dico_corpus = cr_corpus_dict(path,1000)
dico_corpus

defaultdict(list,
            {'label': ['afrique',
              'afrique',
              'afrique',
              'culture',
              'economie',
              'idees',
              'international',
              'international',
              'international',
              'les-decodeurs',
              'm-styles',
              'pixels',
              'politique',
              'politique',
              'politique',
              'politique',
              'politique',
              'societe',
              'societe',
              'societe',
              'sport',
              'sport',
              'sport',
              'sport'],
             'text': ['Au Niger, des Â«Â\xa0assises nationalesÂ\xa0Â» censÃ©es fixer la durÃ©e de la transition\nLe rÃ©gime militaire au pouvoir au Niger va organiser du 15 au 19Â\xa0fÃ©vrier des Â«Â\xa0assises nationalesÂ\xa0Â», censÃ©es notamment fixer la durÃ©e de la transition ouverte aprÃ¨s le renversement du prÃ©sident civil Mohamed Bazoum