# In [None]:
import csv
import time
from urllib.parse import urljoin, urlparse

import nltk
import requests
from bs4 import BeautifulSoup
from newspaper import Article
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Fetch the NLTK 'punkt' tokenizer data when the module is loaded.
# NOTE(review): presumably needed by Article.nlp() in scrape_article — confirm.
nltk.download('punkt')

# Root URL of the site being scraped; used as the base for article links.
BASE_URL = "https://thepeninsulaqatar.com"

# First listing page of every category to crawl. The last path segment is
# what main() uses as the category name.
CATEGORIES = [
    f"{BASE_URL}/category/{slug}"
    for slug in ("cinema", "music", "general", "Gulf")
]

def requests_retry_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """Build a ``requests.Session`` whose HTTP and HTTPS adapters retry requests.

    Args:
        retries: Value used for the ``total``, ``read`` and ``connect``
            limits of the ``Retry`` policy.
        backoff_factor: Passed through to ``Retry`` as its back-off factor.
        status_forcelist: HTTP status codes handed to ``Retry`` as its
            ``status_forcelist``.

    Returns:
        A new session with a single retrying ``HTTPAdapter`` mounted on both
        the ``http://`` and ``https://`` prefixes.
    """
    policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=policy)

    http = requests.Session()
    # The same adapter instance serves both URL schemes.
    for prefix in ('http://', 'https://'):
        http.mount(prefix, retrying_adapter)
    return http

def safe_get(url, retries=3, timeout=30):
    """Fetch ``url`` with GET, retrying when the request fails.

    Args:
        url: Address to request.
        retries: Number of attempts made by this function. Each attempt also
            gets the adapter-level retries configured by
            ``requests_retry_session``.
        timeout: Seconds to wait on the server per attempt; passed to
            ``Session.get`` so a stalled connection cannot hang the scrape.

    Returns:
        The ``requests.Response`` of the first attempt that completes, or
        ``None`` if every attempt raised. The HTTP status is not checked.
    """
    # One session for all attempts: the connection pool is reused and the
    # session is closed when the block exits.
    with requests_retry_session() as session:
        for attempt in range(1, retries + 1):
            try:
                return session.get(url, timeout=timeout)
            except requests.exceptions.RequestException as e:
                # RequestException also covers timeouts and the RetryError
                # raised once the adapter exhausts its status-based retries;
                # catching only ConnectionError let those abort the run.
                print(f"Request error on {url} ({e}), retrying... ({attempt}/{retries})")
                if attempt < retries:
                    time.sleep(5)  # Pause between attempts, not after the last one
    print(f"Failed to get {url} after {retries} attempts")
    return None

def get_article_links(page_url):
    """Return absolute URLs of the articles listed on a category page.

    Args:
        page_url: URL of the listing page to fetch.

    Returns:
        A list with one URL per ``div.col-sm-6.item`` element that contains
        an ``<a class="title">`` carrying an ``href``. Empty when the page
        could not be fetched or no such element exists.
    """
    response = safe_get(page_url)
    if response is None:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    article_links = []
    for item in soup.select('div.col-sm-6.item'):
        anchor = item.find('a', class_='title')  # looked up once per item
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # No title link, or a link without a target: nothing to scrape.
            continue
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # against the site root; plain concatenation would corrupt absolute
        # URLs and hrefs that lack a leading slash.
        article_links.append(urljoin(BASE_URL, href))
    return article_links

def scrape_article(url):
    """Download one article with ``newspaper`` and return its extracted fields.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``title``, ``authors``, ``publish_date``,
        ``text``, ``keywords``, ``summary`` and ``url``, or ``None`` when any
        step raised (the error is printed, not propagated).
    """
    page = Article(url)
    try:
        page.download()
        page.parse()
        page.nlp()
        return dict(
            title=page.title,
            authors=page.authors,
            publish_date=page.publish_date,
            text=page.text,
            keywords=page.keywords,
            summary=page.summary,
            url=url,
        )
    except Exception as exc:
        # Best-effort: one bad article must not stop the rest of the crawl.
        print(f"Failed to scrape {url}: {exc}")
        return None

def get_next_page_url(soup):
    """Return the href of the pagination entry that follows the active page.

    The active entry is the ``a.page-link.active`` inside an ``li.page-item``;
    the link is taken from the ``<li>`` sibling that comes right after it.

    Args:
        soup: Parsed listing page (anything offering ``select_one``).

    Returns:
        The ``href`` value exactly as it appears in the markup, or ``None``
        when any element along the way (active link, its ``<li>``, the next
        ``<li>``, the ``a.page-link`` inside it, or its ``href``) is missing.
    """
    current = soup.select_one('li.page-item a.page-link.active')
    current_item = current.find_parent('li') if current else None
    following_item = current_item.find_next_sibling('li') if current_item else None
    link = following_item.find('a', class_='page-link') if following_item else None

    if link and 'href' in link.attrs:
        return link['href']
    return None

def _resolve_next_page(page_url, href):
    """Turn a pagination href into an absolute http(s) URL, or ``None``."""
    if not href:
        return None
    candidate = urljoin(page_url, href)
    # Placeholders such as "javascript:..." are not fetchable pages.
    if urlparse(candidate).scheme not in ('http', 'https'):
        return None
    return candidate


def scrape_category(category_url, category_name):
    """Scrape every listing page of one category and save the articles as CSV.

    Starting at ``category_url``, collects the article links of each listing
    page, scrapes them, then follows the pagination link after the active
    page. Stops when there is no further page, a page yields no links, a
    fetch fails, or pagination leads back to a page already visited.

    Args:
        category_url: URL of the first listing page of the category.
        category_name: Used to build the output file name
            ``scraped_articles_<category_name>.csv``.

    Returns:
        None. Results are written through ``save_to_csv`` when at least one
        article was scraped.
    """
    current_url = category_url
    category_articles = []
    visited_pages = set()  # guards against pagination that loops back

    while current_url and current_url not in visited_pages:
        visited_pages.add(current_url)
        print(f"Scraping page: {current_url}")

        article_links = get_article_links(current_url)
        if not article_links:
            print(f"No articles found on {current_url}")
            break

        for link in article_links:
            article_data = scrape_article(link)
            if article_data:
                category_articles.append(article_data)

        # NOTE: the listing page is requested a second time here because
        # get_article_links() returns only the links, not the parsed page.
        response = safe_get(current_url)
        if response is None:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        # Resolve against the current page so relative hrefs (or a bare "#")
        # become absolute URLs instead of making the next request raise.
        current_url = _resolve_next_page(current_url, get_next_page_url(soup))

        time.sleep(2)

    if category_articles:
        save_to_csv(f'scraped_articles_{category_name}.csv', category_articles)
        print(f"Saved articles for category: {category_name}")

def save_to_csv(filename, data):
    """Write a sequence of dict rows to ``filename`` as UTF-8 CSV with a header.

    Args:
        filename: Path of the file to create or overwrite.
        data: Sequence of dicts. Column order follows the first appearance of
            each key across the rows; a key missing from a row is written as
            an empty cell.

    Returns:
        None. When ``data`` is empty nothing is written and an existing file
        is left untouched.
    """
    if not data:
        # No rows to derive a header from; returning before open() also
        # avoids truncating an existing file.
        return

    # Ordered union of all keys, so a row carrying an extra key does not make
    # DictWriter raise ValueError halfway through the file.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

def main():
    """Scrape each URL in ``CATEGORIES`` in turn, pausing 5 seconds after each."""
    for url in CATEGORIES:
        # The last path segment of the category URL doubles as its name.
        name = url.rsplit('/', 1)[-1]
        print(f"Scraping category: {name}")
        scrape_category(url, name)
        print(f"Finished scraping category: {name}")
        time.sleep(5)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()