## Importing Packages

In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup

In [2]:
# Numeric section id for each news topic; get_articles_urls sends the id
# as the `sectionid` query parameter of the listing URL.
article_ids = dict(
    politics=78,
    accidents=7,
    sports=8,
    economics=4,
)

### Getting Article URLs
We assume that we are interested in articles from four sections: sports, politics, accidents, and economics.

In [3]:
def get_articles_urls(article_id, page, timeout=10):
    """
    Fetch the URLs of all articles listed on one page of a news section.

    Parameters
    ----------
    article_id : int
        Numeric section id, sent as the ``sectionid`` query parameter.
    page : int
        Page number of the listing, sent as the ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str or False
        Absolute article URLs found on the page, or ``False`` when the
        response is not OK, the listing container is missing, or no article
        link is found.

    Raises
    ------
    requests.exceptions.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    base_url = 'https://www.almasryalyoum.com'
    query = {'page': page, 'sectionid': article_id, 'typeid': 1}
    # A timeout keeps an unresponsive server from blocking the notebook forever.
    response = requests.get(base_url + '/news/index', params=query, timeout=timeout)

    if not response.ok:
        return False

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    news_container = soup.find('div', attrs={'class': 'ListNews'})
    if news_container is None:
        # No listing on this page: report it like any other "no articles" case.
        return False

    article_urls = []
    for news in news_container.find_all('div', attrs={'class': 'news'}):
        link = news.find('a')
        # Skip entries without a usable link instead of crashing on them.
        if link is not None and link.get('href'):
            article_urls.append(base_url + link['href'])

    # An empty result is reported as False, the same signal as a failed request.
    return article_urls or False

### Getting Article Details 

In [4]:
def get_article_details(url, timeout=10):
    """
    Download one article page and extract its main fields.

    Parameters
    ----------
    url : str
        Absolute URL of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        Keys ``news_url``, ``news_title``, ``news_image``, ``news_body`` and
        ``news_keywords``. ``news_title``, ``news_image`` and
        ``news_keywords`` are ``None`` when the page lacks the matching
        element; ``news_body`` is an empty string when the page has no story
        container.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors rather than scraping an error page.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    news_title = soup.title.text if soup.title is not None else None

    news_image = None
    image_box = soup.find('div', attrs={'class': 'articleimg'})
    if image_box is not None:
        image = image_box.find('img')
        if image is not None:
            news_image = image.get('src')

    news_body = ""
    body = soup.find('div', attrs={'id': 'NewsStory'})
    if body is not None:
        # Remove unrelated widgets embedded in the story; either may be absent.
        for widget_class in ('smsBoxContainer-v1', 'min_related'):
            widget = body.find('div', attrs={'class': widget_class})
            if widget is not None:
                widget.decompose()

        # Each paragraph is prefixed with one space, so a non-empty body
        # starts with a space.
        news_body = "".join(" " + p.text for p in body.find_all('p'))

    keywords_tag = soup.find('meta', attrs={'name': 'keywords'})
    news_keywords = keywords_tag.get('content') if keywords_tag is not None else None

    news_article = {
        'news_url': url,
        'news_title': news_title,
        'news_image': news_image,
        'news_body': news_body,
        'news_keywords': news_keywords
    }

    return news_article

### Building a pandas DataFrame of all articles on a specific section page

In [5]:
def get_articles(article_id=8, page=2):
    """
    Return a pandas DataFrame of all articles listed on one section page.

    Parameters
    ----------
    article_id : int, optional
        Section id passed to ``get_articles_urls`` (default 8).
    page : int, optional
        Listing page number passed to ``get_articles_urls`` (default 2).

    Returns
    -------
    pandas.DataFrame
        One row per article, built from the dicts returned by
        ``get_article_details``; an empty DataFrame when no article URLs
        are available.
    """
    # Fetch the listing once and reuse it; a second request would be wasted
    # work and could return a different result than the one just checked.
    urls = get_articles_urls(article_id, page)

    # get_articles_urls signals "nothing to fetch" with a falsy value.
    if not urls:
        return pd.DataFrame()

    return pd.DataFrame([get_article_details(url) for url in urls])

In [None]:
# Scrape page 2 of the sports section, naming the section id and page explicitly.
sports_articles = get_articles(article_id=article_ids['sports'], page=2)

In [None]:
# Preview the first rows only instead of dumping the whole frame.
sports_articles.head()