<a href="https://colab.research.google.com/github/mohamed-stifi/PFA-Arabic-LLMs/blob/main/scraping_mawdoo3_articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup

**Scraping mawdoo3.com**

In [None]:
def filter_fun(tag):
    """
    Predicate for BeautifulSoup ``find_all`` that rejects ``<script>`` tags.

    Parameters:
    - tag (bs4.element.Tag): The tag being considered.

    Returns:
    - bool: False when the tag's name is "script", True for any other tag.
    """
    is_script = tag.name == "script"
    return not is_script

def del_attrs(soup):
    """
    Strip the attributes of every tag nested under ``soup``.

    Walks the tree depth-first: each direct child tag gets its ``attrs``
    replaced by an empty dict, then its own children are processed the same
    way. The attributes of ``soup`` itself are left untouched.

    Parameters:
    - soup (bs4.BeautifulSoup | bs4.element.Tag): Root of the subtree to clean.
    """
    # recursive=False yields direct children only; depth is handled by the
    # explicit recursive call below.
    for child in soup.find_all(True, recursive=False):
        child.attrs = {}
        del_attrs(child)


In [None]:
def mowdoo3_topics(url = 'https://mawdoo3.com'):
    """
    Scrapes and retrieves a list of topic URLs from the Mawdoo3 home page.

    Parameters:
    - url (str): The URL of the Mawdoo3 website. Default is 'https://mawdoo3.com'.

    Returns:
    - list: A list of strings representing the full URLs of topics on Mawdoo3,
      in the order their links appear on the page.

    Raises:
    - requests.RequestException: On connection failure, timeout or an HTTP
      error status.
    - IndexError: If the page does not contain the expected
      'container home' section or 'row block-content' div.
    """
    from urllib.parse import urljoin

    # requests has no default timeout; without one a stalled connection
    # would block the whole scrape indefinitely.
    res = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    res.raise_for_status()

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(res.content, 'html.parser')

    # Find the main section containing the topics
    main_section = soup.find_all('section', class_ = 'container home')[0]

    # Find the block content within the main section
    block_content = main_section.find_all('div', class_ = "row block-content")[0]

    # href=True skips anchors that carry no link target; urljoin resolves
    # relative hrefs against the page URL and leaves absolute ones intact.
    return [urljoin(url, tag['href']) for tag in block_content.find_all('a', href=True)]

In [None]:
def _mawdoo3_topic_page_links(page_url):
    """Fetch one topic listing page and return the article URLs in its grid."""
    from urllib.parse import urljoin

    # requests has no default timeout; set one so a stalled page cannot
    # hang the pagination loop.
    res = requests.get(page_url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    column = soup.find('div', class_ = 'columns large-8 medium-12 small-12')
    grid = column.find('ul', id = "grid") if column is not None else None
    if grid is None:
        # No article grid on this page: report it as an empty page so the
        # caller can stop paginating instead of crashing.
        return []

    # href=True skips anchors without a link target; urljoin resolves
    # relative hrefs against the page URL.
    return [urljoin(page_url, tag['href']) for tag in grid.find_all('a', href=True)]


def articls_of_mawdoo3_topic(topic_url, page_size = 132):
    """
    Scrapes and retrieves a list of article URLs for a specific topic on Mawdoo3.

    The first page is fetched from ``topic_url``; further pages are requested
    as ``topic_url?page=2``, ``?page=3``, ... for as long as the previous page
    returned exactly ``page_size`` links.

    Parameters:
    - topic_url (str): The URL of the Mawdoo3 topic page.
    - page_size (int): Number of links on a full listing page. A page with a
      different count is treated as the last one. Default is 132.

    Returns:
    - list: A list of strings representing the full URLs of articles in the
      specified topic. Empty if the first page has no article grid.

    Raises:
    - requests.RequestException: On connection failure, timeout or an HTTP
      error status for any requested page.
    """
    articls_urls = _mawdoo3_topic_page_links(topic_url)
    last_page_count = len(articls_urls)

    page = 2
    while last_page_count == page_size:
        page_links = _mawdoo3_topic_page_links(f'{topic_url}?page={page}')

        # An empty page means we ran past the end of the listing.
        if not page_links:
            break

        articls_urls.extend(page_links)
        last_page_count = len(page_links)
        page += 1

    return articls_urls

In [None]:
def get_mawdoo3_articls_urls(url = 'https://mawdoo3.com'):
    """
    Collect the article URLs of every topic listed on the Mawdoo3 home page.

    Parameters:
    - url (str): The URL of the Mawdoo3 website. Default is 'https://mawdoo3.com'.

    Returns:
    - list: Article URLs (str) of all topics, concatenated in topic order.
    """
    collected = []

    # Visit the topics one by one; the index is printed after each topic
    # as a progress indicator.
    for position, topic in enumerate(mowdoo3_topics(url)):
        collected.extend(articls_of_mawdoo3_topic(topic))
        print('----------- : ', position)

    return collected


In [None]:
def get_one_articls(url):
    """
    Scrapes and extracts content from a single article on Mawdoo3.

    Parameters:
    - url (str): The URL of the Mawdoo3 article.

    Returns:
    - tuple: A tuple containing three elements:
      1. Title of the article (text of the first <h1> inside <article>).
      2. Plain text version of the article content, cut off before the
         'المراجع' (references) heading when present.
      3. HTML version of the article content with nested attributes removed,
         cut off before the references heading when present.
      Elements 2 and 3 are empty strings when the page has no
      'mw-content-text' element.

    Raises:
    - requests.RequestException: On connection failure, timeout or an HTTP
      error status.
    - AttributeError: If the page has no <article> element or the article
      has no <h1>.
    """
    # requests has no default timeout; set one so a single stalled article
    # cannot hang a long scraping run.
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # Find the article section using the 'article' tag
    article = soup.find('article')

    # Extract the title of the article
    titel = article.h1.get_text(strip=True)

    text = article.select_one('[id="mw-content-text"]')
    if text is None:
        # No content container: nothing to extract beyond the title.
        return titel, "", ""

    # Plain-text version; text nodes are separated by newlines so the
    # references heading can be located as a line of its own.
    article_text = text.get_text(strip=True, separator="\n")
    start_index = article_text.find('\nالمراجع\n')
    if start_index != -1:
        article_text = article_text[:start_index].strip()

    # Direct children of the content container, excluding <script> tags.
    text_tags = text.find_all(filter_fun, recursive= False)

    # NOTE(review): del_attrs clears attributes of each tag's descendants,
    # not of the top-level tag itself - confirm that is intended.
    for tag in text_tags:
        del_attrs(tag)

    # Serialise each tag individually and concatenate. Converting the whole
    # list with str() would embed the list's brackets and ", " separators
    # in the HTML.
    # NOTE(review): the first three children are skipped - presumably page
    # chrome preceding the article body; confirm against the site layout.
    article_html = ''.join(str(tag) for tag in text_tags[3:])

    # Drop the references heading and everything after it.
    start_index = article_html.find('<h2><span><b>المراجع</b></span></h2>')
    if start_index != -1:
        article_html = article_html[:start_index].strip()

    return titel, article_text, article_html

In [None]:
# Crawl every topic listed on the home page and collect all article URLs.
aricls_urls = get_mawdoo3_articls_urls()

In [None]:
# Number of article URLs collected (shown as the cell output).
len(aricls_urls)

77715

In [None]:
import json

# Persist the collected article URLs to `path` as indented UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters readable in the file).
path = '/content/drive/MyDrive/mowdoo3 _dataset/urls_of_articls_of_mawdoo3.json'
with open(path, "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(aricls_urls, ensure_ascii=False, indent=4))

In [None]:
import json

# Reload the saved article URL list from `path`.
path = '/content/drive/MyDrive/mowdoo3 _dataset/urls_of_articls_of_mawdoo3.json'
with open(path, "r", encoding="utf-8") as json_file:
    aricles_urls = json.loads(json_file.read())

In [None]:
# Accumulates one [title, text, html, url] row per scraped article.
list_of_articles = []
# Field names, in the same order as the values of each appended row.
columns = ['article_titel', 'article_text', 'article_html', 'article_url']

In [None]:
# Scrape each article URL in turn and append its row to list_of_articles.
# NOTE(review): there is no error handling here, so any exception raised by
# get_one_articls aborts the loop; rows collected so far stay in memory only.
for i, article_url in enumerate(aricles_urls):
    titel, article_text, article_html = get_one_articls(article_url)
    list_of_articles.append([titel, article_text, article_html, article_url])

    # print("------------------- article -------- : ", i)


In [None]:
import json

# Split the scraped rows at index 49765 and persist each part to its own file.
data1 =list_of_articles[:49765]
data2 = list_of_articles[49765:]
# NOTE(review): the path1 file name says "from_1_to_22748" although data1
# holds rows [0, 49765) - confirm whether the name should be updated.
path1 = "/content/drive/MyDrive/mowdoo3 _dataset/articls_of_mawdoo3_from_1_to_22748.json"
path2  = "/content/drive/MyDrive/mowdoo3 _dataset/articls_of_mawdoo3_from_49765_to_77714.json"
with open(path1, "w", encoding="utf-8") as json_file:
    json.dump(data1, json_file, ensure_ascii=False, indent=4)
# The second part must go to path2; writing it to path1 would overwrite
# the first part and leave path2 missing.
with open(path2, "w", encoding="utf-8") as json_file:
    json.dump(data2, json_file, ensure_ascii=False, indent=4)

In [None]:
import json

# Read both halves of the scraped article dataset back from disk.
path1 = "/content/drive/MyDrive/mowdoo3 _dataset/articls_of_mawdoo3_from_1_to_22748.json"
path2  = "/content/drive/MyDrive/mowdoo3 _dataset/articls_of_mawdoo3_from_49765_to_77714.json"

with open(path1, "r", encoding="utf-8") as json_file:
    aricls1 = json.loads(json_file.read())

with open(path2, "r", encoding="utf-8") as json_file:
    aricls2 = json.loads(json_file.read())

In [None]:
# Number of article rows in the first file.
# NOTE(review): the cell that writes path1 dumps a plain list, which cannot be
# indexed with 'data'; this presumably ran against a file whose top level is
# a dict with a 'data' key - confirm which format is on disk.
len(aricls1['data'])

49765

**Scraping islamqa.info**

In [None]:
# Accumulator for the scraped Q&A data: question texts go to 'inputs',
# answer texts to 'outputs' and the page URLs to 'urls'.
question_answer_data = {'inputs':[], 'outputs':[], 'urls':[]}


In [None]:
from bs4 import BeautifulSoup
import requests
import json

# Load the list of islamqa page URLs gathered earlier.
urls_path = "/content/drive/MyDrive/islamqa_dataset/urls_of_islamqa_.json"
with open(urls_path, "r", encoding="utf-8") as json_file:
    loaded_data = json.load(json_file)

print("length of list : ", len(loaded_data))          #length of list :  36807
print("length of set : ", len(set(loaded_data)))      #length of set :  29114

# De-duplicate while keeping the original order. A set's iteration order can
# differ between interpreter runs, which would make the dataset[:end1] slice
# select different URLs each time.
dataset = list(dict.fromkeys(loaded_data))
n = len(dataset)
end1 = 5000
question_answer_data_path = f"/content/drive/MyDrive/islamqa_dataset/question_answer_data_end1_{end1}.json"
# question_answer_data = {'inputs':[], 'outputs':[], 'urls':[]}
try:
    for ind, url in enumerate(dataset[:end1]):
        # print(url)
        # requests has no default timeout; set one so a stalled page cannot
        # hang the run.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')

            question = soup.find('section', class_ = 'single_fatwa__question text-justified')
            answer_section = soup.find('section', class_ = 'single_fatwa__answer')
            answer = answer_section.find('div', class_ = 'content') if answer_section is not None else None

            # The question text lives in a <p>, or in a <div> when there is no <p>.
            question_node = None
            if question is not None:
                question_node = question.find('p')
                if question_node is None:
                    question_node = question.find('div')

            # Record the URL only together with its question and answer so the
            # three lists stay index-aligned; pages missing either part are skipped.
            if question_node is not None and answer is not None:
                full_answer = '\n'.join(p.get_text() for p in answer.find_all('p'))
                question_answer_data['urls'].append(url)
                question_answer_data['inputs'].append(question_node.get_text())
                question_answer_data['outputs'].append(full_answer)
        print('---'*4+" : " + str(ind) + " % ", ind/n)
finally:
    # Save whatever has been collected even if the loop is interrupted by an
    # exception, so a long run is not lost.
    with open(question_answer_data_path, "w", encoding="utf-8") as json_file:
        json.dump(question_answer_data, json_file, ensure_ascii=False, indent=4)


'''
The run was stopped at the following progress line:
------------ : 2402 %  0.082503263034966
'''