# Courses and Teachers!

In [None]:
# first, import the re module to use python's regular expressions
import re

In [None]:
# Read the entire courses.txt file into a single string so the regular
# expressions in the following cells can scan it in one pass.
with open('datasets/ahs23-24/courses.txt', 'r') as fp:
    courses_upper = fp.read()
# Bare last expression: the notebook displays the raw text for inspection.
courses_upper

In [None]:
# first attempt (kept for reference, superseded by the pattern below):
# find a six-character run of capital/number surrounded by whitespace \s.
# Written as a raw string so that \s reaches the regex engine instead of being
# an invalid Python string escape.
pattern = r"\s([A-Z0-9]{6})\s"
# find credits, course, and symbol.
# The dots in the credit values are escaped so they match only a literal "."
# rather than any character.
pattern = r"(1\.0|0\.5|0\.25) ([\w\s\(\)\&\-\/]+(?:Req\.\))?) ([A-Z0-9]{5,6})"
# findall returns one (credits, course, symbol) tuple per match
courses = re.findall(pattern, courses_upper)
print(len(courses), "courses matched")
courses

In [None]:
# Read the entire courses-frosh.txt file into a single string; it is scanned
# with the same credits/course/symbol pattern in the next cell.
with open('datasets/ahs23-24/courses-frosh.txt', 'r') as fp:
    courses_frosh = fp.read()
# Bare last expression: the notebook displays the raw text for inspection.
courses_frosh

In [None]:
# first attempt (kept for reference, superseded by the pattern below):
# find a six-character run of capital/number surrounded by whitespace \s.
# Written as a raw string so that \s reaches the regex engine instead of being
# an invalid Python string escape.
pattern = r"\s([A-Z0-9]{6})\s"
# find credits, course, and symbol.
# The dots in the credit values are escaped so they match only a literal "."
# rather than any character.
pattern = r"(1\.0|0\.5|0\.25) ([\w\s\(\)\&\-\/]+(?:Req\.\))?) ([A-Z0-9]{5,6})"
# findall returns one (credits, course, symbol) tuple per match
courses = re.findall(pattern, courses_frosh)
print(len(courses), "courses matched")
courses

In [None]:
# Read the entire skyward-courses.txt file into a single string. The pattern
# in the next cell expects entries that span two lines (a tab-separated
# key/name line followed by a "<n> Credits,  Elective|Required" line).
with open('datasets/ahs23-24/skyward-courses.txt', 'r') as fp:
    data = fp.read()
# Bare last expression: the notebook displays the raw text for inspection.
data


In [None]:
# find a six-character run of capital/number followed by a \t (tab character),
# then the rest of that line as the course name, then on the next line the
# credit value and whether the course is Elective or Required.
# Captured groups, in order: symbol, course, credits, elective/required.
pattern = r"([A-Z0-9]{6})\t(.*)\n([\d\.]+) Credits,  (Elective|Required)"
# findall returns one 4-tuple of strings per matched entry
courses = re.findall(pattern, data)
print(len(courses), "courses matched")
courses

In [None]:
def parse_courses(filename='datasets/ahs23-24/skyward-courses.txt'):
    """Parse a Skyward-style course listing into a list of course dicts.

    Each entry in the file is expected to span two lines: a six-character
    key of capitals/digits, a tab and the course name, followed by a line of
    the form ``<credits> Credits,  Elective|Required``.

    Args:
        filename: path of the text file to parse. Defaults to the Skyward
            export used throughout this notebook.

    Returns:
        A list of dicts with the keys 'key', 'course', 'credits' (converted
        to float) and 'type', in the order the entries appear in the file.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a matched credits string (digits and dots) is not a
            valid float, e.g. "1.2.3".
    """
    # group name syntax: https://docs.python.org/3/library/re.html#re.Match.group
    # Raw string so \d and \. are passed to the regex engine untouched
    # (they are invalid escape sequences in a normal string literal);
    # \t and \n are still understood by the regex engine as tab and newline.
    entry_regex = re.compile(
        r'(?P<key>[A-Z0-9]{6})\t(?P<course>.+)\n(?P<credits>[\d\.]+) Credits,  (?P<type>Elective|Required)'
    )
    with open(filename, "r") as file:
        coursedata = file.read()

    courses = []
    for item in entry_regex.finditer(coursedata):
        course = item.groupdict()
        # credits is captured as text; store it as a number
        course['credits'] = float(course['credits'])
        courses.append(course)
    return courses

# Run the parser and report how many entries matched; each course is now a
# dict (with credits as a float) rather than a tuple of strings.
courses = parse_courses()
print(len(courses), "courses matched")
courses

In [None]:
import re
import csv

def parse_filename_with_regex(filename, regex):
    """Extract course records from a text file using a compiled pattern.

    Args:
        filename: path of the text file to read.
        regex: compiled regular expression with named groups; it must
            define a 'credits' group whose text can be converted to float.

    Returns:
        A list with one dict per match (the match's named groups), in file
        order, with 'credits' converted to float.
    """
    with open(filename, "r") as handle:
        text = handle.read()

    def to_record(match):
        # named groups -> dict, with the credits text turned into a number
        record = match.groupdict()
        record['credits'] = float(record['credits'])
        return record

    return [to_record(match) for match in regex.finditer(text)]

# Compiled patterns with named groups so every source yields dicts with the
# same field names ('key', 'course', 'credits', and for Skyward also 'type').
skyward_regex = re.compile(r"(?P<key>[A-Z0-9]{6})\t(?P<course>.+)\n(?P<credits>[\d\.]+) Credits,  (?P<type>Elective|Required)")
# The dots in the credit values are escaped so they match only a literal "."
ahs_regex = re.compile(r"(?P<credits>1\.0|0\.5|0\.25) (?P<course>[\w\s\(\)\&\-\/]+(?:Req\.\))?) (?P<key>[A-Z0-9]{5,6})")
skyware_file = 'datasets/ahs23-24/skyward-courses.txt'
courses_frosh_file = 'datasets/ahs23-24/courses-frosh.txt'
courses_file = 'datasets/ahs23-24/courses.txt'

# Concatenate the records from all three files; duplicates are NOT removed here.
all_courses = []
all_courses.extend(parse_filename_with_regex(skyware_file,skyward_regex))
all_courses.extend(parse_filename_with_regex(courses_file,ahs_regex))
all_courses.extend(parse_filename_with_regex(courses_frosh_file,ahs_regex))

outfile = 'datasets/courses.csv'
# newline='' is what the csv module requires for files it writes to; without
# it, platforms that translate "\n" get an extra blank line after every row.
with open(outfile, 'w', newline='') as csvfile:
    fieldnames = ['key','course','credits','type']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Records without a 'type' (those parsed with ahs_regex) get an empty cell.
    for course in sorted(all_courses, key=lambda course: course['key']):
        writer.writerow(course)

print("wrote",len(all_courses),"courses to",outfile)

What can we discover with this data?
- How many teachers does the school need to run this course load? How many periods do they teach?
- If we were going to schedule classes, what algorithm can we imagine?
- How do we de-dupe this data?

In [None]:
# Bucket every course under its key with a trailing "A" or "B" stripped, so
# that keys differing only in that suffix end up in the same group.
course_prefix = dict()
for course in all_courses:
    prefix = course['key']
    prefix = prefix[:-1] if prefix[-1] in "AB" else prefix
    course_prefix.setdefault(prefix, []).append(course)

# size of the largest group (0 when there are no courses at all)
max_for_prefix = max((len(group) for group in course_prefix.values()), default=0)

# maximum number of courses for prefix
print("max courses in a group:", max_for_prefix)
# how many courses now?
print("courses:", len(course_prefix))
course_prefix

In [None]:
# for a group of courses, remove duplicates
def dedupe_course(courses):
    """Collapse duplicate entries within one group of related courses.

    Args:
        courses: list of course dicts that share a key prefix. Each dict has
            'key', 'course' and 'credits', and optionally 'type'. The dicts
            passed in are not modified; the result is built from shallow
            copies so the cell calling this function can be re-run safely.

    Returns:
        A list with one dict per distinct key, in order of first appearance.
        For each key:
          * 'credits' is the minimum over all entries with that key,
          * 'alias' (only if needed) lists the differing course names of the
            later entries with that key,
          * 'type' is the single type seen anywhere in the group, if any,
          * 'related' (only if non-empty) lists the other keys of the group
            in sorted order.
        An empty input yields an empty list.

    Raises:
        AssertionError: if the group mixes more than one type, or contains
            more than three distinct keys.
    """
    # map courses by key, check types are the same
    merged = dict()
    types = set()
    for original in courses:
        entry = dict(original)  # shallow copy: leave the caller's dict intact
        if 'type' in entry:
            types.add(entry.pop('type'))
        merged.setdefault(entry['key'], []).append(entry)
    # courses should not be both required and elective
    assert len(types) <= 1, "group mixes course types: %r" % sorted(types)
    assert len(merged) <= 3, "more than 3 distinct keys in group"  # A, B, both
    course_type = None
    if len(types) == 1:
        course_type = types.pop()

    # consolidate same-key courses into one course with aliases and minimum credits
    # because 1.0 course is often coded with 1st semester key
    distinct_courses = []
    for entries in merged.values():
        course = entries[0]
        # fresh list so an 'alias' list carried over from the input is not shared
        aliases = list(course.get('alias', []))
        for duplicate in entries[1:]:
            course['credits'] = min(course['credits'], duplicate['credits'])
            name = duplicate['course']
            if name != course['course'] and name not in aliases:
                aliases.append(name)
        if aliases:
            course['alias'] = aliases
        if course_type:
            course['type'] = course_type
        distinct_courses.append(course)

    # link courses as related; sorted so the output does not depend on set/hash order
    related_keys = sorted(merged)
    for course in distinct_courses:
        related = [k for k in related_keys if k != course['key']]
        if related:
            course['related'] = related
        else:
            # a lone course has nothing to relate to; drop any stale value too
            course.pop('related', None)
    return distinct_courses

# De-duplicate every group in place; afterwards each prefix should hold at
# most 3 courses (dedupe_course asserts this).
for prefix in list(course_prefix):
    course_prefix[prefix] = dedupe_course(course_prefix[prefix])

# size of the largest remaining group (0 when there are no groups)
max_for_prefix = max((len(group) for group in course_prefix.values()), default=0)

max_for_prefix

In [None]:
from pprint import pprint
# or max 2, except for one exception:
# pretty-print every prefix group that still holds more than two courses
# after de-duplication, so the outlier(s) can be inspected by eye.
for prefix in course_prefix:
    if len(course_prefix[prefix]) > 2:
        pprint(course_prefix[prefix])

In [None]:
# output our cleaned data
outfile = 'datasets/courses2.csv'
# newline='' is what the csv module requires for files it writes to; without
# it, platforms that translate "\n" get an extra blank line after every row.
with open(outfile, 'w', newline='') as csvfile:
    fieldnames = ['key','course','alias','credits','type', 'related']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # Rows are ordered by prefix, then by position within each group.
    # Missing fields are written as empty cells; the 'alias' and 'related'
    # lists are written using their str() form (e.g. "['ABC12A']").
    for prefix in sorted(course_prefix):
        for course in course_prefix[prefix]:
            writer.writerow(course)

print("wrote courses to",outfile)