In [65]:
import os
import json

from rake_nltk import Rake
import nltk
import spacy
import pytextrank
from tqdm.notebook import tqdm

In [48]:
# Fetch the NLTK "stopwords" corpus into the local nltk_data directory (a no-op
# when it is already up to date, as the output below shows).
# NOTE(review): presumably needed by rake_nltk's Rake, which is imported but not
# used in the visible cells — confirm this download is still required.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewghannoum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
def read_file(filepath: str) -> str:
    """Return the entire contents of the text file at ``filepath``.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    default encoding, so the same file reads identically on every machine.

    Args:
        filepath: Path of the file to read.

    Returns:
        The full file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()

In [50]:
# Load one sample subject page per university to experiment on.
# (Plain string literals: the paths contain no placeholders, so no f-prefix.)
usyd_test_subject = read_file("./data/subjects/usyd/COMP2017.txt")
uts_test_subject = read_file("./data/subjects/uts/31242.txt")

In [51]:
def get_text_section(text: str, start_line: str, end_line: str) -> str:
  """Extract the lines that sit between two marker lines of ``text``.

  ``text`` is split on "\n". Collection begins on the line after the first
  line that is exactly equal to ``start_line`` and stops at the first
  subsequent non-blank line that is exactly equal to ``end_line`` (or at the
  end of the text if no such line exists). Neither marker line is included,
  and whitespace-only lines inside the section are dropped.

  Args:
    text: The full text to search.
    start_line: Exact content of the line that opens the section.
    end_line: Exact content of the line that closes the section.

  Returns:
    The kept lines, each terminated by "\n", concatenated into one string.
    An empty string when ``start_line`` never occurs.
  """
  remaining = iter(text.split("\n"))

  # Phase 1: consume lines up to and including the start marker.
  for candidate in remaining:
    if candidate == start_line:
      break
  else:
    # Marker never seen, so there is no section to return.
    return ""

  # Phase 2: gather non-blank lines until the end marker shows up.
  kept = []
  for candidate in remaining:
    if not candidate.strip():
      continue
    if candidate == end_line:
      break
    kept.append(candidate + "\n")

  return "".join(kept)

In [52]:
# Pull the description section out of each sample page; the heading lines that
# delimit it differ between the two universities' page layouts.
usyd_description = get_text_section(
  usyd_test_subject,
  start_line="2024 unit information",
  end_line="Unit details and rules",
)
uts_description = get_text_section(
  uts_test_subject,
  start_line="Description",
  end_line="Subject learning objectives (SLOs)",
)

In [53]:
# Pull the learning-outcomes section out of each sample page, again using the
# university-specific heading lines as delimiters.
usyd_slos = get_text_section(
  usyd_test_subject,
  start_line="At the completion of this unit, you should be able to:",
  end_line="Unit availability",
)
uts_slos = get_text_section(
  uts_test_subject,
  start_line="Subject learning objectives (SLOs)",
  end_line="Course intended learning outcomes (CILOs)",
)

In [54]:
# Replace each sample text with just its description and learning outcomes,
# separated by a blank line.
# NOTE(review): this rebinds the names that held the raw file text, so
# re-running the extraction cells above afterwards would operate on this
# trimmed text rather than on the original file contents.
uts_test_subject = f"{uts_description}\n\n{uts_slos}"
usyd_test_subject = f"{usyd_description}\n\n{usyd_slos}"

In [55]:
# Load the small English spaCy model; `nlp` is the shared pipeline object that
# every later cell calls to process text.
nlp = spacy.load("en_core_web_sm")
# Append the "textrank" component so processed docs expose `doc._.phrases`
# (used by the ranking cells below).
# NOTE(review): the `pytextrank` import at the top is presumably what makes the
# "textrank" factory name available — keep that import even though the name
# `pytextrank` is never referenced directly.
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x106ac0b50>

In [56]:
# Run the pipeline on the UTS sample and order its (phrase text, rank) pairs
# from highest rank to lowest.
doc = nlp(uts_test_subject)
phrase_rank = sorted(
  ((candidate.text, candidate.rank) for candidate in doc._.phrases),
  key=lambda pair: pair[1],
  reverse=True,
)

In [57]:
# Display the ten highest-ranked (phrase, rank) pairs.
phrase_rank[:10]

[('web applications', 0.12992024081174047),
 ('sophisticated web applications', 0.12437740086772171),
 ('competing web application architectures', 0.12326220518012279),
 ('web development stacks', 0.10477504250270292),
 ('application security', 0.10226389515474604),
 ('such applications', 0.09351745673744058),
 ('Students', 0.08368777241259946),
 ('students', 0.08368777241259946),
 ('a web based application system', 0.07970956794192767),
 ('tier and distributed web applications', 0.07804403219188612)]

In [58]:
# Run the pipeline on the USYD sample and order its (phrase text, rank) pairs
# from highest rank to lowest.
doc = nlp(usyd_test_subject)
phrase_rank = sorted(
  ((candidate.text, candidate.rank) for candidate in doc._.phrases),
  key=lambda pair: pair[1],
  reverse=True,
)

In [59]:
# Display the ten highest-ranked (phrase, rank) pairs.
phrase_rank[:10]

[('common Unix tools', 0.10219663484047473),
 ('such errors', 0.09228956572237379),
 ('common programming errors', 0.0868127086937746),
 ('memory usage patterns', 0.0780810399632048),
 ('debugging tools', 0.07758519454799759),
 ('code quality strategies', 0.07719768746746404),
 ('example code', 0.07648587377369842),
 ('tools', 0.07467431525841427),
 ('memory management', 0.07465430274100611),
 ('memory leaks', 0.0745406201608866)]

In [63]:
# Maps each subject code (the file name without its ".txt" suffix) to the text
# of its highest-ranked TextRank phrases, best first.
subject_code_to_keywords = {}

# Per-university (start_line, end_line) heading pairs delimiting the section of
# each subject file that is fed to the pipeline.
# NOTE(review): the exploratory cells above ranked description + learning
# outcomes, whereas only this single section is used here — confirm intended.
SECTION_MARKERS = {
  "uts": ("Description", "Subject learning objectives (SLOs)"),
  "usyd": ("2024 unit information", "Unit details and rules"),
}
SUBJECT_SUFFIX = ".txt"
TOP_K = 10  # number of phrases kept per subject

for uni, (start_line, end_line) in SECTION_MARKERS.items():
  subject_dir = f"./data/subjects/{uni}"
  # os.listdir returns entries in arbitrary order; sorting keeps the insertion
  # order of the dict (and of anything serialised from it) stable across runs.
  filenames = sorted(os.listdir(subject_dir))

  for filename in tqdm(filenames, desc=f"Processing {uni} subjects"):
    if not filename.endswith(SUBJECT_SUFFIX):
      continue

    text = read_file(f"{subject_dir}/{filename}")
    relevant_text = get_text_section(text, start_line, end_line)

    doc = nlp(relevant_text)
    phrase_rank = sorted(
      ((phrase.text, phrase.rank) for phrase in doc._.phrases),
      key=lambda pair: pair[1],
      reverse=True,
    )

    # Slice off only the trailing suffix; str.replace would also delete any
    # ".txt" that happens to occur earlier in the file name.
    subject_code = filename[: -len(SUBJECT_SUFFIX)]
    # NOTE(review): a code present under both universities is overwritten by
    # the later one — confirm codes are unique across "uts" and "usyd".
    subject_code_to_keywords[subject_code] = [
      phrase_text for phrase_text, _ in phrase_rank[:TOP_K]
    ]

Processing uts subjects:   0%|          | 0/265 [00:00<?, ?it/s]

Processing usyd subjects:   0%|          | 0/243 [00:00<?, ?it/s]

In [67]:
# Serialise the subject-code -> keywords mapping as indented JSON and persist
# it next to the subject folders.
keywords_json = json.dumps(subject_code_to_keywords, indent=4)
with open("./data/subjects/subject_keywords.json", "w") as out_file:
    out_file.write(keywords_json)