In [2]:
pip install torch transformers

Collecting torch
  Downloading torch-2.4.0-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting transformers
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     ----------------------------------- -- 41.0/43.7 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 43.7/43.7 kB 530.4 kB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.5-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading torch-2.4.0-cp311-cp311-win_amd64.whl (197.9 MB)
   ---------------------------------------- 0.0/197.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/197.9 MB 1.5 MB/s eta 0:0

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time

# Landing pages to scrape. scrape_urls() fetches each entry once and inspects
# only the anchors found on that page (links are not followed recursively).
# Order is preserved in the results. Note that some entries are plain http.
urls = [
    "https://www.who.int",
    "https://www.unicef.org",
    "https://www.undp.org",
    "https://www.worldbank.org",
    "http://www.ins.tn",
    "https://www.afdb.org",
    "https://ftdes.net",
    "http://www.onm.nat.tn",
    "http://www.courdescomptes.nat.tn"
]

# Function to fetch HTML content from a URL with headers
def fetch_web_page(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent and return the response.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The ``requests.Response`` for a successful (non-4xx/5xx) request. The
        body is downloaded before the session is closed, so ``.text`` and
        ``.content`` remain usable by the caller.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` is exceeded.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # The context manager releases the session's pooled connections even if
    # the request raises.
    with requests.Session() as session:
        session.headers.update(headers)
        response = session.get(url, timeout=timeout)
    response.raise_for_status()
    return response

# Function to extract text from HTML content
def extract_text_from_html(html):
    """Return the text content of ``html`` with markup removed.

    The document is parsed with BeautifulSoup's built-in ``html.parser`` and
    the text fragments are joined with a single space.
    """
    return BeautifulSoup(html, 'html.parser').get_text(separator=' ')

# Function to scrape and process URLs
def scrape_urls(urls, keywords=("tunisia", "tunisie"), delay=1):
    """Collect links whose anchor text mentions one of ``keywords``.

    Each URL is fetched once. For HTML responses every ``<a href=...>`` on the
    page is inspected and kept when its visible text contains a keyword
    (case-insensitive). For other ``text/*`` responses the first 100
    characters of the body are recorded as the title. Any other content type
    is skipped with a message.

    Args:
        urls: Iterable of absolute URLs to fetch.
        keywords: Substrings to look for in link text; matched
            case-insensitively.
        delay: Seconds to pause after each URL (successful or not) to avoid
            hammering the servers.

    Returns:
        A list of ``{"url": ..., "title": ...}`` dicts in discovery order.
        Failures for individual URLs are printed and do not abort the run.
    """
    needles = tuple(word.lower() for word in keywords)
    results = []
    for url in urls:
        try:
            response = fetch_web_page(url)
            content_type = response.headers.get('Content-Type', '').lower()

            if 'html' in content_type:
                # Parse the page once; only its anchors are needed here.
                soup = BeautifulSoup(response.content, 'html.parser')
                for link in soup.find_all('a', href=True):
                    title = link.get_text(strip=True)
                    lowered = title.lower()
                    if any(needle in lowered for needle in needles):
                        # Resolve relative hrefs against the page URL, and
                        # only for links that are actually kept.
                        article_url = urljoin(url, link['href'])
                        results.append({"url": article_url, "title": title})
            elif 'text' in content_type:
                # Non-HTML text: keep a short preview of the body as the title.
                results.append({"url": url, "title": response.text[:100]})
            else:
                print(f"Skipping URL {url} with content type {content_type}")

        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred: {http_err}")
        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next URL.
            print(f"Failed to fetch or process {url}: {e}")
        finally:
            # Delay to avoid hitting the server too hard
            time.sleep(delay)

    return results

# Scrape the websites
scraped_data = scrape_urls(urls)

# Save the results to a CSV file. The columns are fixed explicitly so the
# header row is written even when nothing matched; otherwise the file would be
# empty and a later pd.read_csv on it would fail.
output_csv = "extractedDoc_articles.csv"
df = pd.DataFrame(scraped_data, columns=["url", "title"])
df.to_csv(output_csv, index=False)

# Report the path that was actually written.
print(f"URLs and titles related to Tunisia saved to {output_csv}")

HTTP error occurred: 403 Client Error: Forbidden for url: https://www.unicef.org/
HTTP error occurred: 403 Client Error: Forbidden for url: https://www.afdb.org/
URLs and titles related to Tunisia saved to extracted_articles.csv


In [7]:
import pandas as pd

# Read back the CSV written by the scraping cell
csv_file = "extractedDoc_articles.csv"
df = pd.read_csv(filepath_or_buffer=csv_file)

# A bare expression on the last line renders the frame as a rich table
df

Unnamed: 0,url,title
0,https://www.worldbank.org/en/where-we-work/tun...,Tunisia
1,http://www.ins.tn/evenements/la-mesure-de-la-p...,Atelier19/10/2023La mesure de la pauvreté en T...
2,http://www.ins.tn/evenements/seminaire-de-lanc...,SéminaireSéminaire de lancement du projet « l'...
3,http://www.ins.tn/evenements/workshop-sur-impl...,Atelier15 juillet 2020Workshop sur Implémentat...
4,http://www.ins.tn/evenements/atelier-de-lancem...,"Atelier10/10/2019L'ONM et l'INS, avec l’appui ..."
5,http://www.ins.tn/evenements/lancement-des-res...,RésultatsLancement des résultats de l’enquête ...
6,https://ftdes.net/resultats-preliminaires-dune...,Résultats préliminaires d’une étude de terrain...
7,https://ftdes.net/leconomie-circulaire-le-recy...,L’économie circulaire: le recyclage des déchet...
8,https://ftdes.net/rapport-mensuel-de-lobservat...,Rapport mensuel de l’Observatoire Social Tunis...
9,https://ftdes.net/resultats-preliminaires-dune...,Résultats préliminaires d’une étude de terrain...
