In [None]:
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd
import warnings
import threading
import concurrent.futures
warnings.filterwarnings("ignore")
import time

In [None]:
# State shared by the worker threads. counter_lock is held while
# article_counter is read and incremented so each saved article gets its
# own sequence number.
counter_lock = threading.Lock()
article_counter = 1

In [None]:
# Destination folder for the scraped articles. The original machine-specific
# location stays the default; set the ARTICLE_SAVE_DIR environment variable
# to write somewhere else without editing the notebook.
save_dir = os.environ.get(
    'ARTICLE_SAVE_DIR',
    r'E:\IITGN_MTECH\sem-3rd\natural-lanaguage-processing\assignment-nlp-1\euronews_articles',
)

# Browser-like User-Agent header sent with every HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [None]:
def save_article_to_txt(link, title, decription, author, creation_date, last_update, content, size, article_counter , save_dir):
    """Write one article and its metadata to ``article_<n>.txt`` inside save_dir.

    Args:
        link: URL the article was fetched from.
        title: Article headline.
        decription: Article summary. The misspelled parameter name is kept
            so that existing callers are not broken.
        author: Author name.
        creation_date: Publication timestamp string.
        last_update: Last-modification timestamp string.
        content: Article body text.
        size: Size figure written to the file followed by "B".
        article_counter: Sequence number used to build the filename.
        save_dir: Destination directory; created when it does not exist.

    Returns:
        None. Failures are reported with a printed message, never raised.
    """
    filename = f"article_{article_counter}.txt"
    filepath = os.path.join(save_dir, filename)
    print(save_dir)

    try:
        # exist_ok avoids a failure when several worker threads try to
        # create the folder at the same moment.
        os.makedirs(save_dir, exist_ok=True)

        # The body previously referenced an undefined name `description`,
        # which made every save fail; it must read the `decription` argument.
        article_data_txt = (
            f"Link: {link}\n"
            f"Title: {title}\n"
            f"Description: {decription}\n"
            f"Author: {author}\n"
            f"Creation Date: {creation_date}\n"
            f"Last Update: {last_update}\n"
            f"Article Content: {content}\n"
            f"Article Size: {size}B\n"
        )

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(article_data_txt)

        print(f"Successfully saved article {filename}")

    except Exception as e:
        print(f"Failed to save article {filename}: {e}")

In [None]:
def get_soup(url, retries=2, retry_delay=5, timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        retries: Maximum number of attempts. Only an HTTP 429 response
            triggers another attempt; a 200 returns immediately.
        retry_delay: Seconds to wait after a 429 before the next attempt.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            connection cannot block a worker indefinitely.

    Returns:
        The parsed page, or None when it could not be retrieved. Errors are
        printed, never raised.
    """
    try:
        status = None
        for attempt in range(retries):
            response = requests.get(url, headers=headers, timeout=timeout)
            status = response.status_code

            if status == 200:
                print("Page retrieved successfully!")
                return BeautifulSoup(response.text, 'html.parser')

            if status != 429:
                # 4xx/5xx responses raise HTTPError here and are reported by
                # the handler below; anything else falls through to the
                # generic failure message.
                response.raise_for_status()
                break

            # Rate limited: pause only if another attempt will follow.
            if attempt < retries - 1:
                print(f"Rate limit exceeded. Waiting {retry_delay} seconds...")
                time.sleep(retry_delay)

        print("Failed to retrieve the page:", status)
        return None
    except requests.exceptions.HTTPError as http_err:
        print(f"Request failed: {http_err}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
def extract_article_info(url ,article_soup):
    """Pull the metadata and body text out of a parsed article page.

    Returns a 7-tuple ``(title, description, author, creation_date,
    last_update, content, article_size)``. Any field that cannot be found is
    the string "no disponible"; ``article_size`` is the UTF-8 byte length of
    ``content``. If anything raises, the error is printed and None is
    returned. ``url`` is accepted but not used.
    """
    placeholder = "no disponible"

    def stripped_text(node):
        # Stripped text of a tag, or the placeholder when the tag is absent.
        if node:
            return node.text.strip()
        return placeholder

    def meta_content(prop):
        # `content` attribute of the <meta> tag with this property, or the
        # placeholder when the tag or the attribute is missing.
        node = article_soup.find('meta', attrs={'property': prop})
        if node:
            return node.get('content', placeholder)
        return placeholder

    try:
        title = stripped_text(article_soup.find('h1', itemprop='headline'))
        description = stripped_text(article_soup.find(class_="ue-c-article__standfirst"))
        author = stripped_text(article_soup.find(class_="ue-c-article__byline-name"))

        creation_date = meta_content('article:published_time')
        last_update = meta_content('article:modified_time')

        # Body text: every <p> inside the article container, one per line.
        body = article_soup.find(class_="NormalTextoNoticia")
        paragraphs = body.find_all('p') if body else None
        if paragraphs:
            content = "\n".join(p.get_text(strip=True) for p in paragraphs)
        else:
            content = placeholder

        return (
            title,
            description,
            author,
            creation_date,
            last_update,
            content,
            len(content.encode('utf-8')),
        )

    except Exception as e:
        print(f"Failed to extract information: {e}")
        return None


In [None]:
def _find_article_anchor(article_tag):
    """Return the headline <a> nested in a listing <article>, or None if any wrapper is missing."""
    node = article_tag
    for css_class in ('flipper', 'articulo-interior', 'articulo-titulo'):
        node = node.find(class_=css_class)
        if node is None:
            return None
    return node.find('a')


def fetch_and_process_articles(url , save_dir, article_links,start):
    """Scrape one listing page and save every not-yet-seen article it links to.

    Args:
        url: Address of the listing page.
        save_dir: Directory handed on to save_article_to_txt.
        article_links: Set of article URLs already claimed. It is shared
            between the worker threads and updated in place.
        start: Page number; only used in the progress message.

    Returns:
        None. Errors are printed rather than raised, so a bad page or
        article does not stop the remaining work.
    """
    global article_counter

    soup = get_soup(url)
    if soup is None:
        print(f"Skipping listing page that could not be fetched: {url}")
        return

    try:
        listing = soup.find('div', id="ListaUltimasNoticias")
        if not listing:
            return

        for item in listing.find_all('article'):
            # An item without the expected wrappers is skipped instead of
            # aborting the rest of the page.
            anchor = _find_article_anchor(item)
            if not anchor:
                continue

            article_url = anchor.get('href')
            if not article_url:
                continue

            # Check and claim the URL in one step so two threads cannot both
            # decide the same article is new.
            with counter_lock:
                if article_url in article_links:
                    continue
                article_links.add(article_url)

            article_soup = get_soup(article_url)
            if article_soup is None:
                continue

            try:
                article_info = extract_article_info(article_url, article_soup)
                if not article_info:
                    continue

                title, description, author, creation_date, last_update, content, article_size = article_info
                if title == "no disponible" or content == "no disponible":
                    continue

                # Reserve a unique sequence number for the output filename.
                with counter_lock:
                    current_counter = article_counter
                    article_counter += 1

                save_article_to_txt(article_url, title, description , author, creation_date, last_update, content, article_size, current_counter, save_dir)
                print(f"Saved article_done: {article_url , start}")
            except Exception as e:
                print(f"Failed to process article {article_url}: {e}")

    except Exception as e:
        print(f"Failed to find article elements: {e}")

In [None]:
def main(start=1, end=999, max_workers=5):
    """Scrape a range of europapress.es listing pages with a thread pool.

    Args:
        start: First page number to fetch.
        end: Page number to stop before (exclusive).
        max_workers: Number of worker threads in the pool.

    Returns:
        None, once every submitted page has been processed (the executor's
        ``with`` block waits for the workers on exit).
    """
    base_url = r"https://www.europapress.es/"
    # Shared by all workers to avoid fetching the same article twice.
    article_links = set()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # range() yields nothing when start >= end, whereas the previous
        # `while start != end` would never terminate for start > end.
        for page in range(start, end):
            # NOTE(review): base_url already ends with '/', so the URL
            # contains '//p<n>/'. Kept as-is; confirm the site expects this.
            section = f"/p{page}/"
            url = base_url + section
            executor.submit(fetch_and_process_articles, url, save_dir, article_links, page)


In [None]:

# Entry point: start the scrape only when this module is the one being run.
if __name__ == "__main__":
    main()