### Script to extract relevant information from a web page. The script is customized to how each page is designed, relying on website-specific tags and markers.

In [3]:
import requests
from bs4 import BeautifulSoup, Tag

In [4]:
# URL to scrape
# Only one target is active at a time. The other candidate pages are kept
# commented out so they can be re-enabled by hand; leaving them as live
# assignments was misleading because only the last one ever took effect.
#url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/provincial-nominees.html"  # Replace with your target URL
#url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/who-can-apply"
#url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/caregivers"
url = "https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/caregivers/home-care-worker-immigration-pilots"

In [5]:
# HTTP request headers sent with every page fetch: a minimal browser-like
# User-Agent string.
headers = {
    "User-Agent": "Mozilla/5.0",
}



In [6]:
# Load the crawled page URLs: one URL per line, with surrounding whitespace
# (including the trailing newline) stripped.
# The context manager closes the file as soon as the list is built; the
# original left the handle open for the lifetime of the session.
with open("../data/crawled_page_links.txt", encoding="utf-8") as f:
    url_list = [line.strip() for line in f]

In [7]:
# Preview the first ten crawled URLs
url_list[:10]

['https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit-account',
 'https://www.canada.ca/en/immigration-refugees-citizenship/campaigns/immigration-matters/growing-canada-future/business',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/application/application-forms-guides/imm0268',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/agri-food-pilot/work-permit',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/settle-canada/laws',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/application/application-forms-guides/cit0560',
 'https://www.canada.ca/en/immigration-refugees-citizenship/news/video/marriage-fraud-stories-victims',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/work-canada/permit/open-work-permit-hong-kong-recent-graduates/apply',
 'https://www.canada.ca/en/immigration-refugees-citizenship/services/application/applicati

In [None]:
# Scrape a slice of the crawled pages and save each page's main content as a
# plain-text document under ../data/docs/.
#
# For every page, text is collected only between the first element whose text
# starts with CONTENT_START_HEADER_TEXT and the first element whose text starts
# with CONTENT_ENDS_HEADER_TEXT. Paragraphs, definition terms and list items
# are grouped under the most recent heading.

# Headers we care about
valid_headers = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

CONTENT_START_HEADER_TEXT = "You are here"
CONTENT_ENDS_HEADER_TEXT = "Page details"

# Upper bound (in seconds) for a single request, so one unresponsive page
# cannot stall the whole batch.
REQUEST_TIMEOUT_SECONDS = 30


for url in url_list[25:35]:
    print ("Extracting information from page: ", url)

    # Fetch the content from the url. Network failures and HTTP error statuses
    # skip this page instead of aborting the run or saving an error page.
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Skipping page, request failed: ", err)
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # A document without <body> has nothing to walk.
    if soup.body is None:
        print("Skipping page, no <body> found: ", url)
        continue

    # Iterate through elements and organize them
    content = []            # list of {"header": str, "paragraphs": [str, ...]}
    current_header = None   # section currently receiving paragraphs
    start_scrape_flag = False

    for element in soup.body.descendants:
        # Only tags are inspected; bare text nodes are read through their
        # enclosing tag's get_text().
        if not isinstance(element, Tag):
            continue

        # Computed once and reused for both marker checks.
        element_text = element.get_text()

        # Scrape only after the "You are here" marker is found ...
        if element_text.startswith(CONTENT_START_HEADER_TEXT):
            start_scrape_flag = True

        # ... and stop reading the page once "Page details" is reached.
        if element_text.startswith(CONTENT_ENDS_HEADER_TEXT):
            break

        if not start_scrape_flag:
            continue

        if element.name in valid_headers:
            header_text = element.get_text(strip=True)

            # Top-level headings get extra blank lines so major sections
            # stand out in the saved text file.
            if element.name in ['h1', 'h2']:
                header_text = "\n\n\n" + header_text
            else:
                header_text = "\n" + header_text

            current_header = {
                "header": header_text,
                "paragraphs": [],
            }
            content.append(current_header)

        elif element.name == 'p' and current_header:
            paragraph_text = element.get_text()

            if paragraph_text:
                # Append the target of the first link in the paragraph.
                # .get() is used because an <a> tag may have no href
                # attribute, in which case indexing would raise KeyError.
                a_tag = element.find('a')
                href = a_tag.get('href') if a_tag else None
                if href:
                    paragraph_text += f" (Refer page: {href})"

                current_header["paragraphs"].append("\n" + paragraph_text)

        elif element.name == 'dt' and current_header:
            paragraph_text = element.get_text(strip=True)
            if paragraph_text:
                current_header["paragraphs"].append("\n" + paragraph_text + ": ")

        # extract all bullet points
        elif current_header and element.name in ["ul", "ol"]:
            # NOTE(review): find_all('li') also returns items of nested
            # lists, and a nested <ul>/<ol> is itself visited later by this
            # walk, so nested items can be recorded more than once.
            collected_data = [li.get_text() for li in element.find_all('li')]
            collected_data = "\n - ".join(collected_data)
            collected_data = "- " + collected_data  # For first bullet
            current_header["paragraphs"].append(collected_data)

    # Save clean extracted content in docs. The file name is the URL path
    # after the site prefix, with "/" replaced by "_".
    page_name = url.split("https://www.canada.ca/en/")[1]
    page_name = "_".join(page_name.split("/"))

    # Explicit UTF-8 so non-ASCII page text is written regardless of the
    # platform's default encoding; the context manager guarantees the file is
    # closed even if a write fails.
    with open(f"../data/docs/{page_name}.txt", "w", encoding="utf-8") as fw:
        fw.write(f"This is content for {url}")

        for section in content:
            fw.write(f"\n{section['header']}")
            for para in section['paragraphs']:
                fw.write(f"\n {para}")


Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/corporate/mandate
Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/services/study-canada/study-permit/fmc-student-pilot/eligibility/participating-dlis
Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/inadmissibility/reasons/medical-inadmissibility
Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/corporate/publications-manuals/annual-reports-parliament-immigration
Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/services/application/application-forms-guides/application-rehabilitation-inadmissible-persons-criminal-activity
Extracting information from page:  https://www.canada.ca/en/immigration-refugees-citizenship/services/application/application-forms-guides/imm5653
Extracting information from pag