In [24]:
#import packages
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [25]:
# Empty seed frame with the four columns recorded per scraped document;
# it is handed to the scraper as the frame that scraped rows are appended to.
df_information = pd.DataFrame(
    columns=["title", "description", "department", "updated"],
)

In [27]:
#Function that retrieves the next page
def find_the_next_page(soup):
    """Return the absolute URL of the next result page, or ``None`` if there is none.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of the current result page.

    Returns
    -------
    str or None
        Absolute ``https://www.gov.uk/...`` URL of the next page, or ``None``
        when the page has no "next" pagination link (e.g. on the last page).
    """
    base_url = 'https://www.gov.uk/'

    # The "next" link lives inside a dedicated pagination container; it is
    # missing when there is no further page, so guard against None.
    next_container = soup.find('div', class_='govuk-pagination__next')
    if next_container is None:
        return None

    a_element = next_container.find('a')
    if a_element is None:
        return None

    # Keep reading the tracking label first (original behaviour), but fall back
    # to the link's own href when that attribute is not present.
    next_path = a_element.get('data-track-label') or a_element.get('href')
    if not next_path:
        return None

    # urljoin avoids the doubled slash that plain concatenation produces when
    # the path already starts with "/".
    return urljoin(base_url, next_path)

In [28]:
#Function that stores relevant information in a dataframe for easier access
def store_information(titles, description, attributes):
    """Combine the scraped elements of one result page into a DataFrame.

    Parameters
    ----------
    titles, description, attributes : iterable
        Elements exposing a ``.text`` attribute (e.g. BeautifulSoup tags).
        ``attributes`` is read as alternating entries: even positions become
        the ``department`` column, odd positions the ``updated`` column.
        NOTE(review): this assumes every document contributes exactly two
        attribute entries, in that order -- confirm against the page markup.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``description``, ``department``, ``updated``; one
        row per document. Rows are built with ``zip``, so the row count equals
        the length of the shortest of the four lists.
    """
    # .text keeps the whitespace/newlines of the surrounding markup; strip it
    # so the stored values are clean strings.
    list_titles = [item.text.strip() for item in titles]
    list_description = [item.text.strip() for item in description]
    list_attributes = [item.text.strip() for item in attributes]

    # Even positions -> department, odd positions -> last-updated entry.
    list_attr_department = list_attributes[0::2]
    list_attr_update = list_attributes[1::2]

    data = list(zip(list_titles, list_description, list_attr_department, list_attr_update))

    return pd.DataFrame(data, columns=['title', 'description', 'department', 'updated'])

In [30]:
#Webscraping function
def scrape_website(start_url, max_pages, df_store):
    """Scrape up to ``max_pages`` consecutive result pages starting at ``start_url``.

    For every page the document titles, descriptions and attributes are
    extracted, converted to a DataFrame by ``store_information`` and appended
    after the rows of ``df_store``. Scraping stops early when a page cannot be
    retrieved (non-200 status) or when no next-page link is found.

    Parameters
    ----------
    start_url : str
        URL of the first result page.
    max_pages : int
        Maximum number of pages to fetch.
    df_store : pandas.DataFrame
        Frame the scraped rows are appended to; it is not modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df_store`` followed by the rows of every page that was fetched, or
        ``df_store`` itself when no page was fetched.
    """
    current_url = start_url
    page_frames = []  # one DataFrame per page, concatenated once at the end

    for _ in range(max_pages):
        # Fetch the content of the current page
        response = requests.get(current_url)
        if response.status_code != 200:
            print(f"Failed to retrieve page {current_url}")
            break

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract relevant information from the data
        titles_documents = soup.find_all('div', class_='gem-c-document-list__item-title')  # titles of the listed documents
        item_description = soup.find_all('p', class_='gem-c-document-list__item-description')  # short description of each document
        item_attribute = soup.find_all('li', class_='gem-c-document-list__attribute')  # each document has two attributes: 1 Department 2 last updated

        page_frames.append(store_information(titles_documents, item_description, item_attribute))

        # Find the URL of the next page. A missing pagination link surfaces as
        # None, or as AttributeError/TypeError from the lookup; treat all of
        # these as "no more pages" so the rows collected so far are kept.
        try:
            next_page_url = find_the_next_page(soup)
        except (AttributeError, TypeError):
            next_page_url = None
        if not next_page_url:
            break

        # Advance to the next page; without this the same page would be
        # fetched on every iteration.
        current_url = next_page_url

    if not page_frames:
        return df_store

    # A single concat avoids re-copying the accumulated rows on every page.
    return pd.concat([df_store, *page_frames])

In [31]:
# Entry point of the scrape: first result page of the GOV.UK policy papers and
# consultations search (canonical single-slash path).
start_url = "https://www.gov.uk/search/policy-papers-and-consultations"
# Upper bound on the number of result pages fetched in one run.
MAX_PAGES = 10
df_complete_info = scrape_website(start_url, MAX_PAGES, df_information)
