In [9]:
import random
import re
from pathlib import Path
from pprint import pprint
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag

In [10]:
# Page whose <td> cells are scraped for links; it is also the base address
# against which the relative links are resolved later in the notebook.
url = 'http://hrlibrary.umn.edu/instree/ainstls1.htm'

In [11]:
def scrape_first_links(url: str, timeout: float = 10.0) -> list:
    """Collect the href of the first <a> inside each <td> of a webpage.

    The very first <td> on the page is skipped (presumably a header or
    navigation cell -- confirm against the page layout). Cells without a link,
    or whose first link has no href attribute, contribute nothing.

    Args:
        url (str): The URL of the webpage to scrape.
        timeout (float, optional): Timeout in seconds passed to
            ``requests.get`` so an unresponsive server cannot hang the
            notebook. Defaults to 10.0.

    Raises:
        Exception: Raised if the HTTP request to the URL is unsuccessful
            (status code other than 200).
        requests.exceptions.RequestException: Propagated from ``requests.get``
            when the connection fails or the timeout expires.

    Returns:
        list: The href values in page order, exactly as written in the HTML
            (they may be relative paths).
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve the page. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')
    cells = soup.find_all('td')

    hrefs: list = []

    # Skip the first cell, then take the first link of every remaining cell.
    for cell in cells[1:]:
        anchor = cell.find('a')
        if anchor is None:
            continue

        href = anchor.get('href')
        if href is not None:
            hrefs.append(href)

    return hrefs

In [19]:
# Fetch the page at 'url' (one HTTP request) and collect the raw href values.
links_list = scrape_first_links(url)

In [20]:
# Number of hrefs collected from the page.
len(links_list)

408

In [22]:
def filter_links(links: list, extensions: tuple = ('.html', '.htm')) -> list:
    """Remove, in place, every link that does not end with an allowed extension.

    Args:
        links (list): Link strings to filter. The list object itself is
            modified so that only links ending with one of 'extensions' remain.
        extensions (tuple, optional): Accepted suffixes (case-sensitive).
            Defaults to ('.html', '.htm').

    Returns:
        list: The links that were removed, in their original order.
    """
    removed = [link for link in links if not link.endswith(extensions)]
    # Slice assignment rewrites the caller's list object instead of rebinding a
    # local name, and avoids calling remove() on a list that is being iterated.
    links[:] = [link for link in links if link.endswith(extensions)]
    return removed


# The helper is defined in this cell so the notebook runs on a fresh kernel.
# 'links_list' is filtered in place: only links ending with '.html' or '.htm'
# are kept, and the discarded links are returned as 'links_removed'.
# Note: because of the in-place filtering, re-running this cell alone yields an
# empty 'links_removed'; re-run the scraping cell first.
links_removed = filter_links(links_list)

In [27]:
# (links kept in 'links_list', links moved to 'links_removed') after filtering.
len(links_list), len(links_removed)

(366, 42)

In [25]:
# Inspect the links that were filtered out of 'links_list'.
links_removed

['/instree/bejing-furtheractions.pdf',
 '/instree/UnicefGuidelines2004.doc',
 'http://www.who.int/mip/2003/other_documents/en/Ethical_Safety-GWH.pdf',
 'http://www.belgium.iom.int/STOPConference/Conference%20Papers/brudeclaration.pdf',
 'http://polis.osce.org/library/f/2686/499/CoE-FRA-RPT-2686-EN-Council%20of%20Europe%20Committee%20of%20Ministers%20Recommendation%20R(2000)%2011.pdf',
 '/instree/UnicefGuidelines2004.doc',
 'http://www.who.int/mip/2003/other_documents/en/Ethical_Safety-GWH.pdf',
 'A-HRC-13-42.pdf',
 '/instree/RobbenIslandGuidelines.pdf',
 'training8Rev1en.pdf',
 'A-HRC-13-42.pdf',
 'training8Rev1en.pdf',
 'http://transitionaljustice.ulster.ac.uk/documents/TheBelfastGuidelinesFINAL_000.pdf',
 'http://www.unicef.org/emerg/files/Cape_Town_Principles.pdf',
 '../Mill_dev_goals2015.pdf',
 'HREandTrainingSeptember2009.pdf',
 'indigenous%20issues.pdf',
 'http://www.unece.org/env/pp/',
 'http://www.biodiv.org/default.shtml',
 'http://www.unep.org/Documents.multilingual/Default.a

In [36]:
# Inspect the current contents of 'links_list'.
links_list

['principles1970.html',
 '/instree/b1udhr.htm',
 '/instree/b2esc.htm',
 'opt-prot08.html',
 '/instree/b3esc.html',
 '/instree/b3ccpr.htm',
 '/instree/b4ccprp1.htm',
 '/instree/b5ccprp2.htm',
 '/instree/siracusaprinciples.html',
 'c1dgiccp.htm',
 'c2psnr.htm',
 '/instree/d1cerd.htm',
 '/instree/d2drp.htm',
 '/instree/d3dfpmms.htm',
 '/instree/d4deidrb.htm',
 '/instree/d5drm.htm',
 '/instree/decl-tol.html',
 '/instree/apartheid-supp.html',
 '/instree/apartheid-sports.html',
 '/instree/wcarprogrammeofaction.html',
 '/instree/wcardeclaration.html',
 'COE_gen-rec-11.html',
 '/osce/basics/lund-1999.html',
 '/osce/basics/oslo-1998.html',
 '/osce/basics/hague-1996.html',
 '/instree/YogyakartaPrinciples.html',
 '/instree/e1cedaw.htm',
 '/instree/cedawopprot-2000.html',
 '/instree/e2cprw.htm',
 '/africa/protocol-women2003.html',
 '/instree/e3dpwcea.htm',
 '/instree/e4devw.htm',
 '/instree/bejingmnu.htm',
 '/instree/bejing-followup.html',
 '/instree/organizedcrime.html',
 '/instree/trafficking.ht

In [29]:
# Turn every entry of 'links_list' into a complete URL by resolving it against
# the address of the scraped page ('url'). 'urljoin' takes care of relative and
# root-relative paths, so the result is a list of fully formed URLs that can be
# requested directly in the following steps.
full_urls = []
for route in links_list:
    full_urls.append(urljoin(url, route))

In [30]:
# Inspect the URLs built by joining 'url' with each entry of 'links_list'.
full_urls

['http://hrlibrary.umn.edu/instree/principles1970.html',
 'http://hrlibrary.umn.edu/instree/b1udhr.htm',
 'http://hrlibrary.umn.edu/instree/b2esc.htm',
 'http://hrlibrary.umn.edu/instree/opt-prot08.html',
 'http://hrlibrary.umn.edu/instree/b3esc.html',
 'http://hrlibrary.umn.edu/instree/b3ccpr.htm',
 'http://hrlibrary.umn.edu/instree/b4ccprp1.htm',
 'http://hrlibrary.umn.edu/instree/b5ccprp2.htm',
 'http://hrlibrary.umn.edu/instree/siracusaprinciples.html',
 'http://hrlibrary.umn.edu/instree/c1dgiccp.htm',
 'http://hrlibrary.umn.edu/instree/c2psnr.htm',
 'http://hrlibrary.umn.edu/instree/d1cerd.htm',
 'http://hrlibrary.umn.edu/instree/d2drp.htm',
 'http://hrlibrary.umn.edu/instree/d3dfpmms.htm',
 'http://hrlibrary.umn.edu/instree/d4deidrb.htm',
 'http://hrlibrary.umn.edu/instree/d5drm.htm',
 'http://hrlibrary.umn.edu/instree/decl-tol.html',
 'http://hrlibrary.umn.edu/instree/apartheid-supp.html',
 'http://hrlibrary.umn.edu/instree/apartheid-sports.html',
 'http://hrlibrary.umn.edu/inst

In [31]:
# Send an HTTP GET request to every URL in 'full_urls' and keep only the ones
# that answer with status code 200 (OK). Each failing URL is printed together
# with its status code (or the name of the request error) and dropped.
#
# The surviving URLs are collected in a new list rather than removed from
# 'full_urls' inside the loop: removing an item from a list while iterating
# over it makes the iterator skip the element that follows each removal, so
# some URLs would never be checked.
reachable_urls = []
for link in full_urls:
    try:
        # The timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(link, timeout=10)
    except requests.RequestException as error:
        # Connection failures and timeouts count as unreachable instead of
        # aborting the loop.
        print(link, type(error).__name__)
        continue

    if response.status_code != 200:
        print(link, response.status_code)
        continue

    reachable_urls.append(link)

full_urls = reachable_urls

http://europa.eu/scadplus/leg/en/lvb/l33137.htm 404
http://hrlibrary.umn.edu/oastinstr/managua1996.html 404
http://hrlibrary.umn.edu/instree/iachrregulations.html 404
http://hrlibrary.umn.edu/instree/environment2003.html 404


In [32]:
# Number of URLs left after dropping the ones that did not return status 200.
len(full_urls)

362

In [57]:
# Save the full_urls to a file, one URL per line.
output_file_path = "data/full_urls.txt"

# Create the target directory first so the write also succeeds when 'data/'
# does not exist yet (for example on a fresh checkout).
Path(output_file_path).parent.mkdir(parents=True, exist_ok=True)

# An explicit encoding keeps the file identical regardless of the platform's
# default locale encoding.
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(full_url + '\n' for full_url in full_urls)