In [111]:
import os  # helper functions like check file exists
import datetime  # automatic file name
import requests  # the following imports are common web scraping bundle
from urllib.request import urlopen  # standard python module
from bs4 import BeautifulSoup
from urllib.error import HTTPError
from collections import defaultdict
import re
from urllib.error import URLError
from tqdm import tqdm
import pickle
import bz2
import pandas as pd
from collections import Counter
from urllib.parse import urlparse, unquote


In [112]:
def extract_theme(link):
    """
    Extracts the theme (the first path segment after the '.fr' domain) from a link.

    :param link: A string containing the article URL.
    :return: The theme (e.g., 'economie' for
        'https://www.lemonde.fr/economie/article/...'), or None if the link is
        not a string or contains no '.fr/<theme>/' segment.
    """
    # Non-string input used to be hidden by a bare ``except``; reject it
    # explicitly so unrelated errors are no longer silently swallowed.
    if not isinstance(link, str):
        return None
    # The dot is escaped so only a literal '.fr/' matches (not e.g. 'afr/').
    match = re.search(r'\.fr/(.*?)/', link)
    if match is None:
        return None
    return match.group(1)

def extract_name_from_https_link(link):
    """
    Returns the host part of the first 'https://' URL found in a string.

    :param link: A string containing the HTTPS link.
    :return: Everything between 'https://' and the next '/' or whitespace
        (e.g., 'www.google.com'), or None when the string has no HTTPS link.
    """
    found = re.search(r'https://([^/\s]+)', link)
    return found.group(1) if found else None

def extract_subname(url):
    """
    Returns the article slug of a URL: the last path segment without its
    extension, with URL-encoded characters decoded.

    :param url: A string containing the article URL.
    :return: The decoded slug (an empty string when the path ends with '/').
    """
    last_segment = os.path.basename(urlparse(url).path)
    stem, _extension = os.path.splitext(last_segment)
    return unquote(stem)


def extract_clean_subname(url):
    """
    Returns the article slug of a URL without its trailing numeric suffixes.

    :param url: A string containing the article URL.
    :return: The slug from extract_subname with every trailing '_<digits>'
        group removed (e.g., 'joe-biden_6478184_3210' becomes 'joe-biden').
    """
    trailing_numbers = r'(_\d+)+$'
    return re.sub(trailing_numbers, '', extract_subname(url))


def get_filename(filepath):
    """
    Returns the final component of a file path.

    :param filepath: A string containing the file path.
    :return: The part after the last path separator (empty if the path ends
        with a separator).
    """
    _directory, filename = os.path.split(filepath)
    return filename


def read_file(file_path):
    """
    Reads a whole text file as UTF-8.

    :param file_path: Path of the file to read.
    :return: The file content, or an error message string: 'File not found.'
        when the file does not exist, 'An error occurred: ...' for any other
        failure. Errors are returned, not raised.
    """
    try:
        # Every writer in this file uses UTF-8, so read with the same encoding
        # instead of the platform default.
        with open(file_path, 'r', encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        return "File not found."
    except Exception as e:
        return f"An error occurred: {e}"


def list_themes(links):
    """
    Collects the theme of every link for which one can be extracted.

    :param links: An iterable of article URLs.
    :return: A list of themes in link order; links for which extract_theme
        returns None are left out, duplicates are kept.
    """
    candidates = (extract_theme(link) for link in links)
    return [theme for theme in candidates if theme is not None]


def write_links(path, links, year_fn):
    """
    Writes the links, one per line, to '<path>/lemonde_<year_fn>_links.txt'.

    :param path: Directory in which the file is created.
    :param links: An iterable of link strings.
    :param year_fn: Value (typically the year) inserted into the file name.
    :return: None. An existing file is overwritten.
    """
    # Let os.path.join build the path instead of concatenating with "/", so an
    # empty ``path`` means the current directory rather than the filesystem root.
    target = os.path.join(path, f"lemonde_{year_fn}_links.txt")
    with open(target, 'w', encoding="utf-8") as f:
        f.writelines(link + "\n" for link in links)


def write_to_file(filename, content):
    """
    Appends the string form of ``content`` to a UTF-8 text file.

    :param filename: Path of the file to write to; created if missing.
    :param content: Any object; it is converted with str() before writing.
    :return: None.
    """
    # Append mode already creates a missing file, so no separate existence
    # check (and no window between the check and the open) is needed.
    with open(filename, 'a', encoding="utf-8") as f:
        f.write(str(content))


def create_archive_links(year_start, year_end, month_start, month_end, day_start, day_end):
    """
    Builds the Le Monde archive URLs for a grid of dates.

    The month range and the day range are applied independently: every day in
    [day_start, day_end] is combined with every month in
    [month_start, month_end], for every year in [year_start, year_end].

    :param year_start: First year (inclusive).
    :param year_end: Last year (inclusive).
    :param month_start: First month number (inclusive).
    :param month_end: Last month number (inclusive).
    :param day_start: First day of month (inclusive).
    :param day_end: Last day of month (inclusive).
    :return: A dict mapping each year to its list of URLs of the form
        'https://www.lemonde.fr/archives-du-monde/DD-MM-YYYY/', ordered by
        month then day. Combinations that are not real calendar dates
        (e.g. 30-02) are skipped.
    """
    base_url = "https://www.lemonde.fr/archives-du-monde/"
    archive_links = {}
    for year in range(year_start, year_end + 1):
        year_links = []
        for month in range(month_start, month_end + 1):
            for day in range(day_start, day_end + 1):
                try:
                    # Raises ValueError for impossible dates such as 31-04.
                    datetime.date(year, month, day)
                except ValueError:
                    continue
                year_links.append(f"{base_url}{day:02d}-{month:02d}-{year}/")
        archive_links[year] = year_links
    return archive_links


def get_articles_links(archive_links):
    """
    Collects the article links listed on a series of archive pages.

    Only teasers without the 'icon__premium' span (i.e. not marked as
    subscriber-only) are kept, and links containing 'en-direct' are skipped.

    :param archive_links: An iterable of archive page URLs.
    :return: A list of article URLs (href values) in page order. Pages that
        cannot be downloaded are reported with a printed message and skipped.
    """
    links_non_abonne = []
    for link in archive_links:
        try:
            # Parse inside the ``with`` so the HTTP response is always closed.
            with urlopen(link) as html:
                soup = BeautifulSoup(html, "html.parser")
        except (HTTPError, URLError):
            # URLError covers network-level failures (DNS, refused connection)
            # that would otherwise abort the whole crawl.
            print("url not valid", link)
            continue

        for item in soup.find_all(class_="teaser"):
            # Subscriber-only teasers carry a span with class icon__premium.
            if item.find('span', {'class': 'icon__premium'}):
                continue
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href is None:
                # A teaser without a usable link cannot be scraped.
                continue
            # en-direct = video
            if 'en-direct' not in href:
                links_non_abonne.append(href)
    return links_non_abonne


def classify_links(theme_list, link_list):
    """
    Groups article links by theme.

    A link belongs to a theme when it contains
    'https://www.lemonde.fr/<theme>/article/'.

    :param theme_list: An iterable of theme names.
    :param link_list: A collection of article URLs.
    :return: A defaultdict(list) mapping each theme that has at least one
        matching link to those links, in link_list order.
    """
    grouped = defaultdict(list)
    for theme in theme_list:
        marker = 'https://www.lemonde.fr/' + theme + '/article/'
        matching = [link for link in link_list if marker in link]
        # Only touch the key when something matched, so themes without links
        # do not appear in the result.
        if matching:
            grouped[theme].extend(matching)
    return grouped


def get_single_page(url):
    """
    Downloads one article page and extracts its title and body elements.

    :param url: URL of the article page.
    :return: A tuple (title, body) where title is the first <h1> element and
        body is the list of <p>/<h2> elements that are direct children of the
        page's <article> element; None when the page cannot be downloaded or
        has no <h1> or no <article> element.
    """
    try:
        # Parse inside the ``with`` so the HTTP response is always closed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html, "html.parser")
    except (HTTPError, URLError):
        print("url not valid", url)
        return None

    text_title = soup.find('h1')
    article = soup.article
    if text_title is None or article is None:
        # Without these elements there is nothing usable to return; callers
        # already treat None as "skip this page".
        print("no article content", url)
        return None

    text_body = article.find_all(["p", "h2"], recursive=False)
    return (text_title, text_body)


def scrape_articles(dict_links):
    """
    Downloads every article and saves its text under 'corpus/<theme>/'.

    Each article is written to 'corpus/<theme>/<clean subname>.txt' with the
    title on the first line followed by one line per body element. Pages for
    which get_single_page returns None are skipped.

    :param dict_links: A mapping from theme name to a list of article URLs.
    :return: None.
    """
    for theme, links in dict_links.items():
        create_folder(os.path.join('corpus', theme))
        print("processing:", theme)
        for link in tqdm(links):
            fn = extract_clean_subname(link)
            single_page = get_single_page(link)
            if single_page is None:
                continue
            text_title, text_body = single_page
            # Extract all text before opening the file, so a failure here does
            # not leave an empty file behind. A missing title becomes an empty
            # first line, which keeps the "title first" layout.
            lines = [text_title.get_text() if text_title is not None else ""]
            lines.extend(element.get_text() for element in text_body)
            with open(os.path.join('corpus', theme, fn + '.txt'), 'w', encoding="utf-8") as f:
                f.writelines(line + "\n" for line in lines)


def cr_corpus_dict(path_corpus, n_files=1000):
    """
    Loads the corpus folder into a dict of parallel 'label' and 'text' lists.

    Every sub-directory of ``path_corpus`` is a theme; each file inside it
    contributes one entry whose label is the theme name and whose text is the
    value returned by read_file.

    :param path_corpus: Root directory of the corpus.
    :param n_files: Maximum number of files loaded per theme.
    :return: A defaultdict(list) with keys 'label' and 'text' (absent when no
        file was loaded). Themes and files are visited in sorted order; hidden
        entries (names starting with '.') and non-directories at the root are
        ignored.
    """
    dict_corpus = defaultdict(list)
    limit = max(n_files, 0)  # a negative limit means "no files", not a tail slice
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for theme in sorted(os.listdir(path_corpus)):
        theme_directory = os.path.join(path_corpus, theme)
        if theme.startswith('.') or not os.path.isdir(theme_directory):
            continue
        # Keep only regular, non-hidden files so OS metadata or nested folders
        # do not end up in the corpus as error-message "texts".
        files = [
            name for name in sorted(os.listdir(theme_directory))
            if not name.startswith('.')
            and os.path.isfile(os.path.join(theme_directory, name))
        ]
        for file in files[:limit]:
            text = read_file(os.path.join(theme_directory, file))
            dict_corpus["label"].append(theme)
            dict_corpus["text"].append(text)
    return dict_corpus


def create_folder(path):
    """
    Creates a folder, including any missing parent folders.

    :param path: Path of the folder to create.
    :return: None. Prints 'folder exists already' when the path already exists.
    """
    try:
        # makedirs also creates missing parents (e.g. 'corpus' for
        # 'corpus/<theme>'); attempting the creation directly avoids the gap
        # between an existence check and the mkdir call.
        os.makedirs(path)
    except FileExistsError:
        print("folder exists already")

In [113]:
# Build the archive page URLs for 2025: days 2 to 4 of each month from January
# to February (the month range and the day range are applied independently).
archive_links = create_archive_links(2025, 2025, 1, 2, 2, 4)
# Bare expression: displays the resulting dict as the cell output.
archive_links

{2025: ['https://www.lemonde.fr/archives-du-monde/02-01-2025/',
  'https://www.lemonde.fr/archives-du-monde/03-01-2025/',
  'https://www.lemonde.fr/archives-du-monde/04-01-2025/',
  'https://www.lemonde.fr/archives-du-monde/02-02-2025/',
  'https://www.lemonde.fr/archives-du-monde/03-02-2025/',
  'https://www.lemonde.fr/archives-du-monde/04-02-2025/']}

In [114]:
# Folder, under the current working directory, that receives the per-year
# link lists written by write_links.
corpus_path = os.path.join(os.getcwd(), "corpus_links")
create_folder(corpus_path)

folder exists already


In [115]:
# For each year: collect the article links from that year's archive pages,
# keep them in memory (article_links maps year -> list of links) and save
# them to a text file inside corpus_path.
article_links = {}
for year,links in archive_links.items():
    print("processing: ",year)
    article_links_list = get_articles_links(links)
    article_links[year] = article_links_list
    write_links(corpus_path,article_links_list,year)

processing:  2025


In [116]:
# Flatten the per-year link lists into a single list of themes (one entry per
# link whose theme could be extracted), then report how many were found.
themes = [
    theme
    for link_list in article_links.values()
    for theme in list_themes(link_list)
]
print(len(themes))
# Bare expression: displays the theme list as the cell output.
themes

114


['economie',
 'sciences',
 'international',
 'international',
 'economie',
 'international',
 'culture',
 'sport',
 'international',
 'afrique',
 'international',
 'international',
 'afrique',
 'outre-mer',
 'societe',
 'sport',
 'afrique',
 'idees',
 'afrique',
 'societe',
 'afrique',
 'afrique',
 'international',
 'international',
 'm-styles',
 'pixels',
 'international',
 'culture',
 'international',
 'societe',
 'sport',
 'afrique',
 'sport',
 'international',
 'societe',
 'politique',
 'm-styles',
 'pixels',
 'international',
 'afrique',
 'les-recettes-du-monde',
 'disparitions',
 'afrique',
 'international',
 'international',
 'societe',
 'culture',
 'mondephilatelique',
 'international',
 'sport',
 'sport',
 'outre-mer',
 'afrique',
 'idees',
 'disparitions',
 'international',
 'idees',
 'international',
 'le-monde-passe-a-table',
 'm-styles',
 'economie-francaise',
 'le-monde-passe-a-table',
 'politique',
 'international',
 'international',
 'afrique',
 'sport',
 'les-decodeurs

In [117]:
# Count how many links each theme has.
theme_stat = Counter(themes)
# (theme, count) pairs ordered by descending count; themes with equal counts
# keep their first-seen order. No minimum-count filter is applied.
theme_top = theme_stat.most_common()
print(theme_top)

[('international', 28), ('afrique', 17), ('sport', 15), ('societe', 11), ('culture', 9), ('idees', 4), ('pixels', 4), ('politique', 4), ('planete', 4), ('m-styles', 3), ('economie', 2), ('outre-mer', 2), ('disparitions', 2), ('le-monde-passe-a-table', 2), ('les-decodeurs', 2), ('sciences', 1), ('les-recettes-du-monde', 1), ('mondephilatelique', 1), ('economie-francaise', 1), ('intelligences-numeriques', 1)]


In [118]:
# Merge every year's article links into one flat list.
all_links = [link for link_list in article_links.values() for link in link_list]

# NOTE: despite the name, no top-five cut is applied here; every theme present
# in theme_top is kept, in the same order.
themes_top_five = [entry[0] for entry in theme_top]

# Group the links by theme.
themes_top_five_links = classify_links(themes_top_five, all_links)
# Bare expression: displays the grouping as the cell output.
themes_top_five_links

defaultdict(list,
            {'international': ['https://www.lemonde.fr/international/article/2025/01/02/attaque-a-la-nouvelle-orleans-le-suspect-est-un-citoyen-americain-qui-avait-prete-allegeance-a-l-organisation-etat-islamique-annonce-le-fbi-qui-fait-etat-de-15-morts_6476750_3211.html',
              'https://www.lemonde.fr/international/article/2025/01/02/a-seoul-des-manifestants-pro-et-anti-yoon-s-affrontent-devant-sa-residence-officielle-ou-il-s-est-retranche_6478453_3210.html',
              'https://www.lemonde.fr/international/article/2025/01/02/elon-musk-reclame-la-liberation-de-tommy-robinson-figure-de-l-extreme-droite-britannique_6478320_3210.html',
              'https://www.lemonde.fr/international/article/2025/01/02/joe-biden-decore-liz-cheney-feroce-adversaire-de-donald-trump_6478184_3210.html',
              'https://www.lemonde.fr/international/article/2025/01/02/vingt-sept-migrants-meurent-au-large-des-cotes-tunisiennes-83-personnes-secourues-apres-deux-naufrages_64

In [119]:
# Take the link list of the first theme in the grouping and display its first
# link, to use as a sample article in the next cells.
first_links = next(iter(themes_top_five_links.values()))  # Gets the first value (list)
first_links[0]

'https://www.lemonde.fr/international/article/2025/01/02/attaque-a-la-nouvelle-orleans-le-suspect-est-un-citoyen-americain-qui-avait-prete-allegeance-a-l-organisation-etat-islamique-annonce-le-fbi-qui-fait-etat-de-15-morts_6476750_3211.html'

In [120]:
# Download the sample article, parse it, and dump the parsed HTML to soup.txt
# for manual inspection.
url = first_links[0]
try:
    # Parse inside the ``with`` so the HTTP response is closed afterwards.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")
except (HTTPError, URLError):
    print("url not valid", url)
else:
    # Write the dump only when the download succeeded; otherwise ``soup``
    # would be undefined (or left over from an earlier run of this cell).
    with open('soup.txt', 'w', encoding="utf-8") as f:
        f.write(str(soup))

In [121]:
# Look for elements named <tags> among the direct children of the document.
# NOTE(review): the recorded output is an empty list; presumably this was an
# exploratory attempt to locate the article's tag list - confirm the intended
# selector.
tags = soup.find_all(['tags'], recursive=False)
tags

[]

In [122]:
# Preview the body of the sample article: the <p> and <h2> elements that are
# direct children of its <article> element (the same selection that
# get_single_page uses).
paragraphs = soup.article.find_all(["p", "h2"], recursive=False)
paragraphs

[<p class="article__paragraph">Le FBI a dit estimer, jeudi 2 janvier, que l’homme qui a foncé sur la foule au volant d’un pick-up dans la nuit de mardi à mercredi dans le Quartier français de La Nouvelle-Orléans, avait agi seul. <em>« Nous n’estimons pas à ce stade que d’autres personnes étaient impliquées dans cette attaque, à l’exception de Shamsud-Din Jabbar »</em>, le suspect décédé, a dit Christopher Raia, un haut responsable du FBI, lors d’une conférence de presse, alors que dans un premier temps la police fédérale estimait que le suspect de l’attaque aurait agi avec de <em>« potentiels complices »</em>.</p>,
 <p class="article__paragraph">Quinze personnes ont été tuées, selon le dernier bilan du FBI, dont l’auteur présumé de l’attaque.</p>,
 <p class="article__paragraph">LaToya Cantrell, maire de la ville, a rapidement déclaré qu’il s’agissait d’une <em>« attaque terroriste »</em>, tandis que la cheffe de la police de La Nouvelle-Orléans, Anne Kirkpatrick, excluait l’hypothèse d

In [123]:
# Make sure the corpus root exists before scrape_articles creates the
# per-theme sub-folders inside it.
create_folder('corpus')

# Download every classified article into corpus/<theme>/.
scrape_articles(themes_top_five_links)


folder exists already
processing: international


100%|██████████| 24/24 [00:05<00:00,  4.10it/s]


processing: afrique


100%|██████████| 17/17 [00:04<00:00,  4.12it/s]


processing: sport


100%|██████████| 14/14 [00:03<00:00,  4.22it/s]


processing: societe


100%|██████████| 11/11 [00:02<00:00,  4.22it/s]


processing: culture


100%|██████████| 9/9 [00:02<00:00,  4.23it/s]


processing: idees


100%|██████████| 4/4 [00:00<00:00,  4.24it/s]


processing: pixels


100%|██████████| 2/2 [00:00<00:00,  3.70it/s]


processing: politique


100%|██████████| 4/4 [00:00<00:00,  4.42it/s]


processing: planete


100%|██████████| 4/4 [00:00<00:00,  4.71it/s]


processing: m-styles


100%|██████████| 3/3 [00:00<00:00,  3.56it/s]


processing: economie


100%|██████████| 2/2 [00:00<00:00,  4.07it/s]


processing: outre-mer


100%|██████████| 2/2 [00:00<00:00,  3.69it/s]


processing: disparitions


100%|██████████| 2/2 [00:00<00:00,  4.60it/s]


processing: le-monde-passe-a-table


100%|██████████| 2/2 [00:00<00:00,  3.86it/s]


processing: les-decodeurs


100%|██████████| 2/2 [00:00<00:00,  4.02it/s]


processing: sciences


100%|██████████| 1/1 [00:00<00:00,  3.54it/s]


processing: les-recettes-du-monde


100%|██████████| 1/1 [00:00<00:00,  5.08it/s]


processing: mondephilatelique


100%|██████████| 1/1 [00:00<00:00,  3.63it/s]


processing: economie-francaise


100%|██████████| 1/1 [00:00<00:00,  4.20it/s]


processing: intelligences-numeriques


100%|██████████| 1/1 [00:00<00:00,  4.58it/s]


In [124]:
# Load the scraped corpus back from disk into parallel 'label' / 'text' lists,
# reading at most 1000 files per theme.
path = 'corpus'
dico_corpus = cr_corpus_dict(path,1000)
# Bare expression: displays the loaded corpus as the cell output.
dico_corpus

defaultdict(list,
            {'label': ['afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'afrique',
              'culture',
              'culture',
              'culture',
              'culture',
              'culture',
              'culture',
              'culture',
              'culture',
              'culture',
              'disparitions',
              'disparitions',
              'economie',
              'economie',
              'economie-francaise',
              'idees',
              'idees',
              'idees',
              'idees',
              'intelligences-numeriques',
              'international',
          