## Testing notebook to fetch course data from Laurier

In [1]:
# import libraries
from pymongo import MongoClient
import os
from dotenv import load_dotenv

from bs4 import BeautifulSoup

import requests

import re

In [2]:
load_dotenv()

client = MongoClient(os.getenv('MONGO_URL'))

### Our objective
We need to fetch course descriptions data from the Laurier undergraduate calendar (for now, just business courses) and store them into our database.

[Link to calendar](https://academic-calendar.wlu.ca/department.php?cal=1&d=2453&s=1002&y=83)


In [3]:
# main course listing
url = "https://academic-calendar.wlu.ca/department.php?cal=1&d=2453&s=1002&y=83"

# function to get course link (to undergrad calendar)
def get_course_link(subjectCode, catalogNumber, termMonth, termYear):
    return f"https://loris.wlu.ca/ssb_prod/bwckctlg.p_disp_course_detail?cat_term_in={termYear}{termMonth}&subj_code_in={subjectCode}&crse_numb_in={catalogNumber}"

In [4]:
get_course_link('BUS', '111', '09', '2021')

'https://loris.wlu.ca/ssb_prod/bwckctlg.p_disp_course_detail?cat_term_in=202109&subj_code_in=BUS&crse_numb_in=111'

In [5]:
# get site html
req = requests.get(url)
main_soup = BeautifulSoup(req.text, 'html.parser')

In [6]:
# get course offerings table
course_offerings = main_soup.find_all('div', {'class': 'reqs'})[-1].find_all('tr')
# we query all rows after first row since that is the 'header' row
# then, get links for all the different course pages
course_list = course_offerings[1:]
course_list

[<tr class="zebra">
 <td><a href="course.php?c=60634&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU111</a></td>
 <td>Understanding the Business Environment</td>
 <td>0.5</td>
 </tr>,
 <tr>
 <td><a href="course.php?c=60635&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU121</a></td>
 <td>Functional Areas of the Organization</td>
 <td>0.5</td>
 </tr>,
 <tr class="zebra">
 <td><a href="course.php?c=60638&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU127</a></td>
 <td>Introduction to Financial Accounting</td>
 <td>0.5</td>
 </tr>,
 <tr>
 <td><a href="course.php?c=60636&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU208</a></td>
 <td>Interpersonal Communication <a class="senateRef" href="#se_1">[1]</a></td>
 <td>0.5</td>
 </tr>,
 <tr class="zebra">
 <td><a href="course.php?c=60637&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU223</a></td>
 <td>Fundamentals of Finance</td>
 <td>0.5</td>
 </tr>,
 <tr>
 <td><a href="course.php?c=60723&amp;cal=1&amp;d=2453&amp;s=1002&amp;y=83">BU225</a></td>
 <td>Fundamenta

In [7]:

course_codes = [x.find_all('a')[0].text[2:] for x in course_list]
# get course numbers 
course_numbers = [x[2:] for x in course_codes]
# next, for each link in course_links:
subjectCode = 'BU' # fixed for now
catalogNumber = '398' # testing
termMonth, termYear = '09', '2021' # Fall 2021

url = get_course_link(subjectCode, catalogNumber, termMonth, termYear)
req = requests.get(url)
course_soup = BeautifulSoup(req.text, 'html.parser')
    

In [8]:
# next, for each link in course_links:
subjectCode = 'BU' # fixed for now
catalogNumber = '398' # testing
termMonth, termYear = '09', '2021' # Fall 2021

# initialize list of courses
courses = []

for catalogNumber in course_codes:

    print(f"{subjectCode} {catalogNumber}")
    
    url = get_course_link(subjectCode, catalogNumber, termMonth, termYear)
    req = requests.get(url)
    course_soup = BeautifulSoup(req.text, 'html.parser')

    course_info = course_soup.find('table', {'class': 'datadisplaytable'}).find_all('tr')

    # initialize course object
    course = {}

    # initialize course subject code and catalog number
    course['subjectCode'] = 'BUS'
    course['catalogNumber'] = catalogNumber + 'W'

    # add associated academic career (ie undergrad or grad)
    course['associatedAcademicCareer'] = 'UG'

    # course title is stored in class 'nttitle'
    course_title = course_info[0].find('td', {'class': 'nttitle'}).text.split('-')[1].strip()
    course['title'] = course_title

    # course description is stored in class 'ntdefault'
    course_description = str(course_info[1].find('td', {'class': 'ntdefault'}))
    # get the 'text blocks' by splitting on <br/>
    course_description = course_description.replace('<td class="ntdefault">', '').replace('</td>', '')

    # split the html text based on <br/>
    text_blocks = re.split('(<br\/>)|(<br>)|(<\/br>)', course_description)
    # filter empty strings
    text_blocks = [x for x in text_blocks if x and not re.match('(<br\/>)|(<br>)|(<\/br>)', x)]
    # remove tabs, newlines and trailing spaces from each string in text blocks
    text_blocks = [x.replace('\n', '').replace('\t', '').strip() for x in text_blocks]
    # filter empty strings again
    text_blocks = [x for x in text_blocks if x]

    text_blocks

    # course
    # description of course is first element of text_blocks
    course['description'] = text_blocks[0]

    # the prerequisites are the element of text_blocks that contains the string 'Prerequisites:'
    try: 
        prereqs_string = [x for x in text_blocks if 'Prerequisites:' in x][0]
        prereq_soup = BeautifulSoup(prereqs_string, 'html.parser')
        prereq_text = prereq_soup.text.replace('Prerequisites:', '').replace('\n', '').replace('\t', '').strip()
    except: # if no prereqs
        prereq_text = None

    # similarly, the antirequisites are the element of text_blocks that contains the string 'Exclusions:'
    try:
        antireqs_string = [x for x in text_blocks if 'Exclusions:' in x][0]
        antireq_soup = BeautifulSoup(antireqs_string, 'html.parser')
        antireq_text = antireq_soup.text.replace('Exclusions:', '').replace('\n', '').replace('\t', '').strip()
    except: # if no antireqs
        antireq_text = None

    # make requirements description
    course['requirementsDescription'] = f'{f"Prereq: {prereq_text}" if prereq_text else ""} {f"Antireq: {antireq_text}" if antireq_text else ""}'

    # add course to courses list
    courses += [course]

BU 111
BU 121
BU 127
BU 208
BU 223
BU 225
BU 231
BU 233
BU 247
BU 275
BU 283
BU 288
BU 321
BU 352
BU 353
BU 354
BU 357
BU 362
BU 375
BU 387
BU 393
BU 397
BU 398
BU 400
BU 401
BU 403
BU 405
BU 408
BU 409
BU 410
BU 411
BU 412
BU 413
BU 414
BU 415
BU 416
BU 417
BU 418
BU 419
BU 421
BU 422
BU 423
BU 425
BU 429
BU 430
BU 431
BU 432
BU 433
BU 435
BU 442
BU 443
BU 445
BU 447
BU 448
BU 449
BU 451
BU 452
BU 453
BU 455
BU 459
BU 460
BU 461
BU 462
BU 463
BU 464
BU 466
BU 467
BU 468
BU 469
BU 470
BU 471
BU 472
BU 473
BU 474
BU 477
BU 479
BU 480
BU 481
BU 482
BU 483
BU 485
BU 486
BU 487
BU 488
BU 489
BU 490
BU 491
BU 492
BU 493
BU 495
BU 496
BU 497
BU 498
BU 499
BU TR100


AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
courses

[{'subjectCode': 'BUS',
  'catalogNumber': '111W',
  'associatedAcademicCareer': 'UG',
  'title': 'Understanding Bus. Environment',
  'description': 'This course highlights challenges in the external business environment with a focus on their impact on business decisions. Topics include competitive analysis, and an in-depth examination of political, economic, social, and technological factors. Students will apply their knowledge and strengthencritical thinking and analytical skills using cases, identifying and researching the feasibility of business solution proposals. The course includes a required weekly lab in which research, writing, teamwork, and individual and group presentation skills are also developed.',
  'requirementsDescription': ' Antireq: MB105.'},
 {'subjectCode': 'BUS',
  'catalogNumber': '121W',
  'associatedAcademicCareer': 'UG',
  'title': 'Functional Areas',
  'description': 'This course provides an overview to the functional areas of a business; specifically market

In [11]:
# add courses to mongodb database
# collection = client['waterloo']['courses-descriptions']
# collection.delete_many({'subjectCode': 'BUS'})
# res = collection.insert_many(courses)