# Data Extraction and Pre-processing from NADRA Website
In this notebook, I extract text data from multiple pages on the NADRA website. I use the BeautifulSoup library to extract the data and pre-process each HTML page using regular expressions. The raw HTML content is saved into individual HTML files, and the processed text is stored in separate text files.


 Import Libraries and Set Up URLs

In [7]:
import requests
from bs4 import BeautifulSoup
import re
# The 10 pages to download: eight on www.nadra.gov.pk plus the
# tenders.nadra.gov.pk and careers.nadra.gov.pk subdomains.
urls = [
    "https://www.nadra.gov.pk/",
    "https://www.nadra.gov.pk/international-projects/",
    "https://www.nadra.gov.pk/local-projects/",
    "https://www.nadra.gov.pk/identity/",
    "https://www.nadra.gov.pk/national-identity-card/",
    "https://www.nadra.gov.pk/downloads/",
    "https://tenders.nadra.gov.pk/",
    "https://www.nadra.gov.pk/media-releases/",
    "https://www.nadra.gov.pk/operational-management/",
    "https://careers.nadra.gov.pk/",
]


Extract HTML Content from URLs and store HTML pages

In [8]:
def get_html(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so an unresponsive host would
            otherwise block the notebook indefinitely.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None for any other status code or when the request itself fails
        (timeout, DNS error, refused connection, ...).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network-level failures the same way as a non-200 status so
        # one unreachable page does not abort the whole download loop.
        return None
    if response.status_code == 200:
        return response.text
    return None

# Extract and store html pages
html_pages = []   # HTML text of every page that was downloaded successfully

# enumerate supplies each URL's position directly. urls.index(url) rescans the
# list on every iteration and returns the first match for a repeated URL, so
# duplicate entries would silently overwrite the same file.
for page_number, url in enumerate(urls, start=1):
    page_html = get_html(url)
    if not page_html:
        # Nothing usable came back (None or an empty body): say so instead of
        # dropping the page silently, then move on to the next URL.
        print(f"Skipped {url}: no html content retrieved.")
        continue
    html_pages.append(page_html)
    # Save the raw html under the URL's 1-based position in `urls`
    with open(f"html_file_{page_number}.html", "w", encoding="utf-8") as file:
        file.write(page_html)

print(f"{len(html_pages)} html pages extracted.")

10 html pages extracted.


Clean HTML pages using BeautifulSoup and regex,
and store the cleaned data.

In [9]:

def clean_html(raw_html):
    """Convert a raw HTML document into plain, punctuation-free text.

    Images, script/style blocks and elements carrying a known advertisement
    class are removed from the parsed tree before the text is extracted.
    URLs and non-word characters are then stripped, and runs of whitespace
    are collapsed to single spaces.

    Args:
        raw_html: HTML markup of one page, as a string.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Drop elements that hold no readable page copy. script/style contents are
    # code; removing them explicitly keeps the result independent of whether
    # the installed bs4 version already skips them in get_text().
    for tag in soup.find_all(['img', 'script', 'style']):
        tag.decompose()

    # Remove ads. The search is repeated per class so that an element already
    # destroyed together with an enclosing ad is never visited again.
    ad_classes = ['ad', 'advertisement', 'advertisements', 'sponsored']
    for ad_class in ad_classes:
        for ad in soup.find_all(class_=ad_class):
            ad.decompose()

    # The separator keeps text from adjacent elements apart; without it
    # "<p>one</p><p>two</p>" would come out as "onetwo".
    clean_text = soup.get_text(separator=' ')

    # Remove links first, while "://" is still present. Matching "http\S+"
    # after punctuation has been stripped would also delete ordinary tokens
    # that merely contain "http".
    clean_text = re.sub(r'https?://\S+', '', clean_text)

    # Remove special characters (anything that is not a word character or whitespace)
    clean_text = re.sub(r'[^\w\s]', '', clean_text)

    # Collapse whitespace last so the gaps left by the removals above are
    # folded into single spaces as well.
    clean_text = re.sub(r'\s+', ' ', clean_text)

    return clean_text.strip()

# Run every downloaded page through clean_html, preserving download order
cleaned_pages = [clean_html(page) for page in html_pages]

# Write each cleaned page to its own text file, numbered from 1
for file_number, page_text in enumerate(cleaned_pages, start=1):
    with open(f"Cleaned_file_{file_number}.txt", "w", encoding="utf-8") as out_file:
        out_file.write(page_text)

print("html pages are cleaned and saved.")

html pages are cleaned and saved.
