In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
import requests
import json

# Goal
- create a dataset of news artices from the Defimedia website
- scrape all categories of articles
- columns: - Id, Category, Title, Link

In [3]:
url="https://defimedia.info/categorie/actualites"


def category_scraper(category,num_pages):
    all_articles=[]

    for page_num in range(num_pages):
        base_url=f"https://defimedia.info/categorie/{category}?page={page_num}"
        page=get_html(base_url)
        articles=extract_articles(page)
        all_articles.extend(articles)
        print(f"Page {page_num}/{num_pages} scraped")
    
    return all_articles

def parallel_category_scraper(category, num_pages):
    all_articles=[]
    

def generate_csv(all_articles):
    articles_df=pd.DataFrame(all_articles, columns=['title', 'url','image_url','category'])
    articles_df.to_csv('defimedia_articles.csv', index=False)

def get_html(url):
    response=requests.get(url)
    return response.text

def extract_articles(web_page):
    soup=BeautifulSoup(web_page, 'html.parser')
    articles=soup.find_all('div', class_='article-teleplus-inner')

    extracted_articles=[]
    for article in articles:
        article_data=extract_article_data(article)
        extracted_articles.append(article_data)

    return extracted_articles
    # return articles


def extract_article_data(article):
    anchors=article.find_all('a')
    title=anchors[0].attrs['title']
    url=("https://defimedia.info"+str(anchors[0].attrs['href']))
    image_url=article.find('img').attrs['src']
    category=anchors[1].text.strip()
    
    article_data={
        'title': title,
        'url': url,
        'image_url': image_url,
        'category': category
    }

    return article_data



# page=get_html(url)
# articles=extract_articles(page)

# print(f"Number of articles: {len(articles)}")
# # print(articles[0])
# # print(extract_article_data(articles[0]))

# articles_df=pd.DataFrame(articles, columns=['title', 'url','image_url','category'])

# articles_df

# articles_df.to_csv('defimedia_articles.csv', index=False)



articles=category_scraper('actualites', 20)
generate_csv(articles)

Page 0/20 scraped
Page 1/20 scraped
Page 2/20 scraped
Page 3/20 scraped
Page 4/20 scraped
Page 5/20 scraped


KeyboardInterrupt: 

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

url="https://defimedia.info/categorie/actualites"


def category_scraper(category,num_pages):
    all_articles=[]

    for page_num in range(num_pages):
        base_url=f"https://defimedia.info/categorie/{category}?page={page_num}"
        page=get_html(base_url)
        articles=extract_articles(page)
        all_articles.extend(articles)
        print(f"Page {page_num}/{num_pages} scraped")
    
    return all_articles

def all_scraper(categories):
    all_articles=[]
    for category in categories:
        articles=parallel_category_scraper(category, categories[category])
        all_articles.extend(articles)
        print(f"Category {category} scraped")
    
    return all_articles


def parallel_category_scraper(category, num_pages):
    url_list=[]
    for page_num in range(num_pages):
        base_url=f"https://defimedia.info/categorie/{category}?page={page_num}"
        url_list.append(base_url)

    all_articles=[]
    # Use ThreadPoolExecutor to fetch pages in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(get_html, url) for url in url_list]

        for i, future in enumerate(as_completed(futures), 1):
            page = future.result()
            articles = extract_articles(page)
            all_articles.extend(articles)
            print(f"Page {i}/{len(url_list)} scraped")
    
    return all_articles

    

def generate_csv(all_articles):
    articles_df=pd.DataFrame(all_articles, columns=['title', 'url','image_url','category'])
    articles_df.to_csv('defimedia_articles.csv', index=True)

def get_html(url):
    response=requests.get(url)
    return response.text

def extract_articles(web_page):
    soup=BeautifulSoup(web_page, 'html.parser')
    articles=soup.find_all('div', class_='article-teleplus-inner')

    extracted_articles=[]
    for article in articles:
        try:
            article_data=extract_article_data(article)
            extracted_articles.append(article_data)
        except:
            print("Error extracting article data")
            continue

    return extracted_articles
    # return articles


def extract_article_data(article):
    anchors=article.find_all('a')
    title=anchors[0].attrs['title']
    url=("https://defimedia.info"+str(anchors[0].attrs['href']))
    image_url=article.find('img').attrs['src']
    category=anchors[1].text.strip()
    
    article_data={
        'title': title,
        'url': url,
        'image_url': image_url,
        'category': category
    }

    return article_data



# page=get_html(url)
# articles=extract_articles(page)

# print(f"Number of articles: {len(articles)}")
# # print(articles[0])
# # print(extract_article_data(articles[0]))

# articles_df=pd.DataFrame(articles, columns=['title', 'url','image_url','category'])

# articles_df

# articles_df.to_csv('defimedia_articles.csv', index=False)



# articles=category_scraper('actualites', 20)
# articles=parallel_category_scraper('actualites', 2003)
# generate_csv(articles)


categories={
    'actualites': 2003,
    'faits-divers': 772,
    'explikouka': 314,
    'defi-zen': 71,
    'politique': 142,
    'people': 224,
    'magazine': 309,
    'news-sunday': 226,
    'catastrophe': 2,
    'live-news': 10765
}

articles=all_scraper(categories)
generate_csv(articles)

Page 1/2003 scraped
Page 2/2003 scraped
Page 3/2003 scraped
Page 4/2003 scraped
Page 5/2003 scraped
Page 6/2003 scraped
Page 7/2003 scraped
Page 8/2003 scraped
Page 9/2003 scraped
Page 10/2003 scraped
Page 11/2003 scraped
Page 12/2003 scraped
Page 13/2003 scraped
Page 14/2003 scraped
Page 15/2003 scraped
Page 16/2003 scraped
Page 17/2003 scraped
Page 18/2003 scraped
Page 19/2003 scraped
Page 20/2003 scraped
Page 21/2003 scraped
Page 22/2003 scraped
Page 23/2003 scraped
Page 24/2003 scraped
Page 25/2003 scraped
Page 26/2003 scraped
Page 27/2003 scraped
Page 28/2003 scraped
Page 29/2003 scraped
Page 30/2003 scraped
Page 31/2003 scraped
Page 32/2003 scraped
Page 33/2003 scraped
Page 34/2003 scraped
Page 35/2003 scraped
Page 36/2003 scraped
Page 37/2003 scraped
Page 38/2003 scraped
Page 39/2003 scraped
Page 40/2003 scraped
Page 41/2003 scraped
Page 42/2003 scraped
Page 43/2003 scraped
Page 44/2003 scraped
Page 45/2003 scraped
Page 46/2003 scraped
Page 47/2003 scraped
Page 48/2003 scraped
P

ConnectionError: HTTPSConnectionPool(host='defimedia.info', port=443): Max retries exceeded with url: /categorie/actualites?page=386 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x13c5a7df0>: Failed to resolve 'defimedia.info' ([Errno 8] nodename nor servname provided, or not known)"))