In [9]:
# Import libraries
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
import json
import os
import numpy as np
from requests.models import MissingSchema
import trafilatura


In [2]:
# Download the full-time programs index page; its anchors are harvested in a later cell.
base_url = 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
page_source = requests.get(base_url, timeout=30)
# Fail fast on an HTTP error instead of silently parsing an error page.
page_source.raise_for_status()
soup = BeautifulSoup(page_source.content, 'html.parser', from_encoding="utf-8")

In [3]:
# Define Function for data extraction
def beautifulsoup_extract_text_fallback(response_content):
    '''
    Extract plain text from raw HTML using BeautifulSoup.

    This is the fallback used when Trafilatura yields no text for a page, so
    that a text value can still be produced for that URL.

    Parameters
    ----------
    response_content : bytes
        Raw HTML of the page (the caller passes ``requests.Response.content``).

    Returns
    -------
    str
        The page's text nodes joined by single spaces, with every character
        other than ASCII letters, digits, spaces, ``$`` and ``£`` removed
        (this also drops newlines, tabs and punctuation), then stripped.
    '''
    # Local import: the notebook's import cell only brings in BeautifulSoup itself.
    from bs4.element import Comment

    # Create the beautifulsoup object:
    soup = BeautifulSoup(response_content, 'html.parser', from_encoding="utf-8")

    # Text whose direct parent is one of these tags is not page content:
    blacklist = {
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head',
        'input',
        'script',
        'style',
    }

    # `string=True` selects every text node (`text=` is the deprecated alias).
    # HTML comments are text nodes too, and their parent is usually a regular
    # tag, so they must be filtered out explicitly.
    visible_text = [
        str(node)
        for node in soup.find_all(string=True)
        if node.parent.name not in blacklist and not isinstance(node, Comment)
    ]

    # Join once instead of growing a string inside the loop.
    cleaned_text = ' '.join(visible_text)

    # Keep only letters, digits, spaces and currency symbols:
    cleaned_text = re.sub(r"[^a-z A-Z 0-9$£]", "", cleaned_text)
    return cleaned_text.strip()
    

def extract_text_from_single_web_page(url):
    '''
    Return the main text of a web page, or NaN when nothing can be extracted.

    Trafilatura is tried first. If it produces no result, the page is fetched
    with ``requests`` and passed to ``beautifulsoup_extract_text_fallback``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    str or float
        The extracted text, or ``np.nan`` when the request fails or the
        response status is not 200.
    '''
    downloaded_url = trafilatura.fetch_url(url)

    a = None
    # Skip extraction when the download produced nothing, rather than handing
    # None to the extractor.
    if downloaded_url is not None:
        try:
            a = trafilatura.extract(downloaded_url, include_links=True, deduplicate=True, output_format='json', with_metadata=True, include_comments=False,
                                    date_extraction_params={'extensive_search': True, 'original_date': True})
        except AttributeError:
            # Retry without `include_comments`.
            # NOTE(review): presumably a workaround for a trafilatura version quirk - confirm.
            a = trafilatura.extract(downloaded_url, include_links=True, deduplicate=True, output_format='json', with_metadata=True,
                                    date_extraction_params={'extensive_search': True, 'original_date': True})

    if a:
        json_output = json.loads(a)
        return json_output['text']

    try:
        # The timeout stops one unresponsive host from stalling the whole scrape.
        resp = requests.get(url, timeout=30)
    except requests.exceptions.RequestException:
        # Covers URLs without a protocol (MissingSchema) as well as connection
        # errors, timeouts and otherwise invalid URLs.
        return np.nan

    # We will only extract the text from successful requests:
    if resp.status_code == 200:
        return beautifulsoup_extract_text_fallback(resp.content)

    # Neither Trafilatura nor the BeautifulSoup fallback produced text.
    return np.nan

In [4]:
# Gather the href attribute of every anchor on the index page
# (anchors without an href contribute None).
raw_urls = [anchor.get('href') for anchor in soup.find_all('a')]
raw_urls

['https://www.myloyalist.com',
 'https://www.loyalistcollege.com',
 '/future-students/book-a-visit/',
 '#',
 '#',
 '/programs-and-courses/full-time-programs',
 '/programs-and-courses/full-time-programs',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/',
 'https://www.loyalistcollege.com/programs-and-courses/january-program-starts/',
 'https://www.loyalistcollege.com/programs-and-courses/post-graduate-programs/',
 'https://www.loyalistcollege.com/focus/',
 'https://www.loyalistcollege.com/programs-and-courses/apprenticeships/',
 '/programs-and-courses/full-time-programs/career-and-college-prep/',
 'https://www.loyalistbanner.com/PROD/cewkcrss.P_IndexPage#new_tab',
 'https://www.loyalistcollege.com/programs-and-courses/elab/',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/general-education-electives/',
 'https://www.loyalistcollege.com/programs-and-courses/justice-studies/',
 '/programs-and-courses/training-and-certification',
 '/pro

In [5]:
# Keep only the relevant URLs: relative links to pages under the
# full-time-programs section, made absolute by prefixing the site root.
base = 'https://www.loyalistcollege.com'
# Raw string, so the regex is not subject to Python string-escape processing.
pattern = r'^/programs-and-courses/full-time-programs/.+'
# The set removes duplicate links; sorting gives a stable order.
sub_urls = sorted(
    {
        base + link
        for link in raw_urls
        if link is not None and re.search(pattern, link)
    }
)

In [6]:
# Show the de-duplicated, sorted list of program page URLs.
sub_urls

['https://www.loyalistcollege.com/programs-and-courses/full-time-programs/advanced-filmmaking-digital-content-creation',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/advertising-and-marketing-communications-creative-design',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/animation-and-game-development',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/architectural-technician',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/artificial-intelligence-and-data-science',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/automotive-service-technician-apprenticeship',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/biotechnology',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/business',
 'https://www.loyalistcollege.com/programs-and-courses/full-time-programs/business-accounting',
 'https://www.loyalistcollege

In [7]:
# Scrape each program page in turn; results keep the order of sub_urls.
text_content = list(map(extract_text_from_single_web_page, sub_urls))

In [8]:
# Preview the text extracted for the first URL in sub_urls.
text_content[0]

"Advanced Filmmaking – Digital Content Creation\n- Credential\nOne-year Ontario College Graduate Certificate\n- Start Date\nSeptember entry\n- Location\nBelleville Campus\nFind Your Career\nGain the knowledge and skills needed to find employment in the motion picture and video industry:\n- Film camera operator\n- Motion picture camera operator\n- Studio camera operator\n- Sound effects operator\n- Gaffer\n- Director\n- Assistant director\n- Production manager\n- Production department head\n- Business operator\nIs it for you?\nYou may enjoy filmmaking and digital content creation if you are the kind of person who:\n- Enjoys working on short-term projects\n- Likes to travel and work in a variety of locations\n- Keeps up-to-date with current industry equipment and new technologies\n- Recognizes the importance of marketing, networking, relationship building, interacting and collaborating with other industry professionals\n- Is a self-starter who enjoys being in control of their own earning

In [None]:
# Serialize the scraped texts to JSON.
# Pages that could not be extracted are stored as NaN; bare NaN is not valid
# JSON and strict parsers reject it, so those entries are written as null.
serializable_content = [
    None if isinstance(text, float) and np.isnan(text) else text
    for text in text_content
]
json_object = json.dumps(serializable_content, indent=2)

# Write to loyalist_colleges.json with an explicit encoding, so the result
# does not depend on the platform default.
with open("loyalist_colleges.json", "w", encoding="utf-8") as outfile:
    outfile.write(json_object)

In [None]:
# Extract the text of the programs index page itself.
# Pass the base_url variable, not the literal string "base_url", which is not a URL.
one_txt = extract_text_from_single_web_page(base_url)

In [None]:
# directory = "Scrapped_data"
# if not os.path.exists(directory):
#     os.mkdir(directory)
# with open(directory+"/"+"scrapped_loyalist_college.txt",'w') as file:
#     file.write(text_content)