In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from time import sleep

urls = []

def gather_links(url, crawl_delay=1):
    '''
    Function to gather all the links from a given base URL (https://gat.ac.in/) in a recursive manner.
    The function will keep on crawling the links until all the links are exhausted.
    All urls will be appended to a list and written to a file named urls.txt
    
    Args:
    url: str: base URL to crawl

    Returns:
    urls: list: list of all the URLs crawled
    '''
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to get URL. Status code: {response.status_code}")
        return None
    
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all('a')

    for link in links:
        href = link.get('href')
        if href and not href.startswith('http'):
            full_url = urljoin(url, href)
        else:
            full_url = href

        if (not full_url) or (not full_url.startswith(url)) or \
            full_url == url or full_url.endswith(('pdf', 'jpg', 'png')) or \
                full_url in urls:
            continue
        urls.append(full_url)
        print(f"Adding {full_url}")
        gather_links(full_url)

    sleep(crawl_delay)
    return urls

In [3]:
# To gather all the links from the base URL

base_url = "https://gat.ac.in/"
crawl_delay = 0.1
gather_links(base_url, crawl_delay)

with open('urls.txt', 'w') as f:
    for url in urls:
        f.write(url + '\n')

print(f"Total number of URLs: {len(urls)}")

Adding https://gat.ac.in/alumni
Adding https://gat.ac.in/alumni-activities
Adding https://gat.ac.in/alumni-testimonials
Adding https://gat.ac.in/committees-iqac
Adding https://gat.ac.in/gat-news
Adding https://gat.ac.in/gat-events
Adding https://gat.ac.in/gat-nirf
Adding https://gat.ac.in/contactus
Adding https://gat.ac.in/why-gat
Adding https://gat.ac.in/green-campus-initiative
Adding https://gat.ac.in/facilities-audiotorium
Adding https://gat.ac.in/about-gat
Adding https://gat.ac.in/index
Adding https://gat.ac.in/nef
Adding https://gat.ac.in/nef-institution
Adding https://gat.ac.in/vision-mission
Adding https://gat.ac.in/chairman-message
Adding https://gat.ac.in/board-message
Adding https://gat.ac.in/principal-message
Adding https://gat.ac.in/dean-message
Adding https://gat.ac.in/governing-body
Adding https://gat.ac.in/gat-careers
Adding https://gat.ac.in/undergraduate-program
Adding https://gat.ac.in/postgraduate-program
Adding https://gat.ac.in/ug-pg-curriculum
Adding https://gat.a

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Adding https://gat.ac.in/ai-datascience-faculty
Adding https://gat.ac.in/ai-datascience-faculty#tab1
Adding https://gat.ac.in/ai-datascience-faculty#tab2
Adding https://gat.ac.in/ai-datascience-faculty#tab3
Adding https://gat.ac.in/ai-datascience-faculty#tab4
Adding https://gat.ac.in/ai-datascience-faculty#tab5
Adding https://gat.ac.in/ai-datascience-infrastructure
Adding https://gat.ac.in/ai-datascience-achievements
Adding https://gat.ac.in/ai-datascience-news
Adding https://gat.ac.in/ai-datascience-events
Adding https://gat.ac.in/ai-datascience-gallery
Adding https://gat.ac.in/ciisca/
Adding https://gat.ac.in/ciisca/index
Adding https://gat.ac.in/ciisca/gat-committee
Adding https://gat.ac.in/ciisca/key-note
Adding https://gat.ac.in/ciisca/call-for-paper
Adding https://gat.ac.in/ciisca/important-dates
Adding https://gat.ac.in/ciisca/registration
Adding https://gat.ac.in/ciisca/venue
Adding https://gat.ac.in/ciisca/ciisca-downloads
Adding https://gat.ac.in/ciisca/partners
Adding https:

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_text(url):
    '''
    Function to scrape text from a given URL.
    The function will scrape the main content of the URL and remove all the unwanted text like footer, header, sidebar, links, images, etc.

    Args:
    url: str: URL to scrape

    Returns:
    text: str: scraped text from the URL
    '''

    url = url.strip()
    print(f"Scraping {url}")
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to get URL. Status code: {response.status_code}")
        return
    
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')
    
    main_content = soup.find('div', class_='innerpage-container')
    if not main_content:
        return

    for script in main_content(["script", "style"]):
        script.extract()

    # remove all texts related to the footer, header, and sidebar, hrefs, and images
    try:
        footer = main_content.find('footer')
        if footer:
            footer.decompose()
        header = main_content.find('header')
        if header:
            header.decompose()

        sidebar = main_content.find('aside')
        if sidebar:
            sidebar.decompose()

        links = main_content.find_all('a')
        for link in links:
            link.decompose()

        images = main_content.find_all('img')
        for image in images:
            image.decompose()
    except Exception as e:
        return

    text = main_content.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # f.write(f"These are the details related to {url.split('/')[-1]}:\n`{text}`\nThis is the end of the details.\n\n")
    return f"These are the details related to:- {url.split('/')[-1]}:\n```{text}```\n\n"

In [8]:
# Scrape text from the URLs

from time import sleep

with open('gat_urls.txt', 'r') as f:
    urls = f.readlines()

with open('gat_raw.txt', 'w', encoding='utf-8') as f:
    for url in urls:
        text = scrape_text(url)
        if text:
            f.write(text)
        sleep(0.1)

print("Scraping completed!")

Scraping https://gat.ac.in/alumni
Scraping https://gat.ac.in/alumni-activities
Scraping https://gat.ac.in/alumni-testimonials
Scraping https://gat.ac.in/committees-iqac
Scraping https://gat.ac.in/gat-news
Scraping https://gat.ac.in/gat-events
Scraping https://gat.ac.in/gat-nirf
Scraping https://gat.ac.in/contactus
Scraping https://gat.ac.in/why-gat
Scraping https://gat.ac.in/green-campus-initiative
Scraping https://gat.ac.in/facilities-audiotorium
Scraping https://gat.ac.in/about-gat
Scraping https://gat.ac.in/index
Scraping https://gat.ac.in/nef
Scraping https://gat.ac.in/nef-institution
Scraping https://gat.ac.in/vision-mission
Scraping https://gat.ac.in/chairman-message
Scraping https://gat.ac.in/board-message
Scraping https://gat.ac.in/principal-message
Scraping https://gat.ac.in/dean-message
Scraping https://gat.ac.in/governing-body
Scraping https://gat.ac.in/gat-careers
Scraping https://gat.ac.in/undergraduate-program
Scraping https://gat.ac.in/postgraduate-program
Scraping https

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Scraping https://gat.ac.in/ai-datascience-faculty
Scraping https://gat.ac.in/ai-datascience-faculty#tab1
Scraping https://gat.ac.in/ai-datascience-faculty#tab2
Scraping https://gat.ac.in/ai-datascience-faculty#tab3
Scraping https://gat.ac.in/ai-datascience-faculty#tab4
Scraping https://gat.ac.in/ai-datascience-faculty#tab5
Scraping https://gat.ac.in/ai-datascience-infrastructure
Scraping https://gat.ac.in/ai-datascience-achievements
Scraping https://gat.ac.in/ai-datascience-news
Scraping https://gat.ac.in/ai-datascience-events
Scraping https://gat.ac.in/ai-datascience-gallery
Scraping https://gat.ac.in/ciisca/
Scraping https://gat.ac.in/ciisca/index
Scraping https://gat.ac.in/ciisca/gat-committee
Scraping https://gat.ac.in/ciisca/key-note
Scraping https://gat.ac.in/ciisca/call-for-paper
Scraping https://gat.ac.in/ciisca/important-dates
Scraping https://gat.ac.in/ciisca/registration
Scraping https://gat.ac.in/ciisca/venue
Scraping https://gat.ac.in/ciisca/ciisca-downloads
Scraping https

* Once the raw text is extracted from the website, it is saved as 'gat_raw.txt'.
* Later, the text is manually cleaned and formatted as per the requirements for 'Retrieval Augmented Generation' (RAG) task.
* The cleaned text is saved as 'gat_refined.txt'.