In [88]:

import os
import json
import re
import string

from rake_nltk import Rake
import nltk
import spacy
import pytextrank
from tqdm.notebook import tqdm

In [89]:
# Fetch the NLTK stop-word corpus that clean_string reads via
# nltk.corpus.stopwords; the log below shows it is skipped when already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewghannoum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
# Load the spaCy pipeline once; `nlp` is a notebook-wide global used both by
# clean_string (for lemmas) and by the phrase-ranking cells.
nlp = spacy.load("en_core_web_lg")
# Add the PyTextRank component to the pipeline; the later cells read its
# results from `doc._.phrases`.
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x3005f4070>

In [91]:
def clean_string(text) -> str:
    """Normalise free text into a space-separated string of lemmas.

    Steps, in order: lower-case the text, turn line breaks into spaces,
    delete ASCII punctuation, drop English stop words (plus 'hi' and 'im'),
    strip every alphanumeric run that contains a digit, and lemmatise what
    is left with the notebook-level spaCy pipeline ``nlp``.

    Args:
        text: Raw text to clean.

    Returns:
        The lemmas of the surviving words joined by single spaces.
    """
    # Make lower
    text = text.lower()

    # Remove line breaks
    # Note: this substitution can be extended to replace any other
    # characters with nothing or a space
    text = re.sub(r'\n', ' ', text)

    # Remove punctuation. Characters are deleted rather than replaced by a
    # space, so "i'm" becomes "im" (which is why 'im' is a stop word below).
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words; a set gives constant-time membership tests instead
    # of scanning the whole stop-word list for every word.
    useless_words = set(nltk.corpus.stopwords.words("english")) | {'hi', 'im'}
    words = [word for word in text.split() if word not in useless_words]

    # Remove numbers (any alphanumeric run that contains a digit)
    words = [re.sub(r'\w*\d\w*', '', word) for word in words]

    # The substitution above leaves empty strings behind. Drop them so the
    # joined text has no doubled spaces, which spaCy would otherwise emit as
    # whitespace tokens (and therefore as blank "lemmas").
    words = [word for word in words if word]

    # Lemmatise with the shared spaCy pipeline
    lemmas = [token.lemma_ for token in nlp(' '.join(words))]

    return ' '.join(lemmas)

In [92]:
def read_file(filepath: str, encoding: str = "utf-8") -> str:
    """Return the entire contents of a text file.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(filepath, "r", encoding=encoding) as f:
        return f.read()

In [93]:
# Load one sample subject page per university for the exploratory cells.
# (Plain string literals: the paths contain no placeholders, so no f-prefix.)
usyd_test_subject = read_file("./data/subjects/usyd/COMP2017.txt")
uts_test_subject = read_file("./data/subjects/uts/31242.txt")

In [94]:
def get_text_section(text: str, start_line: str, end_line: str) -> str:
    """Extract the lines that sit between two marker lines.

    The text is split on "\n". Everything up to and including the first line
    that equals ``start_line`` exactly is discarded. The lines after it are
    collected until a line equal to ``end_line`` is met (or the text ends).
    Whitespace-only lines are skipped, and that check runs before the end
    marker comparison.

    Args:
        text: Full text to search.
        start_line: Exact line that opens the section (not included).
        end_line: Exact line that closes the section (not included).

    Returns:
        The collected lines, each terminated by "\n"; an empty string when
        ``start_line`` never occurs.
    """
    remaining = iter(text.split("\n"))

    # Consume lines up to and including the opening marker.
    for candidate in remaining:
        if candidate == start_line:
            break
    else:
        return ""

    kept = []
    for candidate in remaining:
        if not candidate.strip():
            continue
        if candidate == end_line:
            break
        kept.append(candidate + "\n")

    return "".join(kept)

In [95]:
# Description section of each sample page, delimited by that site's heading lines.
usyd_description = get_text_section(
    text=usyd_test_subject,
    start_line="2024 unit information",
    end_line="Unit details and rules",
)
uts_description = get_text_section(
    text=uts_test_subject,
    start_line="Description",
    end_line="Subject learning objectives (SLOs)",
)

In [96]:
# Learning-outcome section of each sample page, delimited by that site's heading lines.
usyd_slos = get_text_section(
    text=usyd_test_subject,
    start_line="At the completion of this unit, you should be able to:",
    end_line="Unit availability",
)
uts_slos = get_text_section(
    text=uts_test_subject,
    start_line="Subject learning objectives (SLOs)",
    end_line="Course intended learning outcomes (CILOs)",
)

In [97]:
# Clean the description and learning outcomes of each sample subject together.
# NOTE: this rebinds the *_test_subject names from the raw page text to the
# cleaned text, so the section-extraction cells must not be re-run after this
# cell without reloading the raw files first.
uts_test_subject = clean_string("\n\n".join([uts_description, uts_slos]))
usyd_test_subject = clean_string("\n\n".join([usyd_description, usyd_slos]))

In [98]:
# Run the pipeline on the cleaned UTS sample and order its phrases by rank, highest first.
doc = nlp(uts_test_subject)
phrase_rank = sorted(
    ((phrase.text, phrase.rank) for phrase in doc._.phrases),
    key=lambda pair: pair[1],
    reverse=True,
)

In [99]:
# Show the ten highest-ranked (text, rank) pairs for the UTS sample.
phrase_rank[:10]

[('transaction use web application', 0.12317288565236939),
 ('sophisticated web application deployment production subject',
  0.11676620401850937),
 ('medium sized web application', 0.1156558144462636),
 ('feature web base application system', 0.11348758388087492),
 ('student practice internet programming', 0.11215215625442283),
 ('contrast compete web application architecture list advantage disadvantage',
  0.10650539421645219),
 ('conceptual level multitier distribute web application component technology use',
  0.10488159375068266),
 ('multiple datum source transaction integrity datum application security',
  0.10270438163402332),
 ('web development stack ntier architecture standard transaction security dependency injection layering webservice integration deployment subject run simulation technologybased startup house software development project student',
  0.10189592735763967),
 ('successful completion subject student', 0.10058137723993472)]

In [100]:
# Same ranking for the cleaned USYD sample, this time keeping each phrase's chunks as well.
doc = nlp(usyd_test_subject)
phrase_rank = sorted(
    ((phrase.text, phrase.rank, list(phrase.chunks)) for phrase in doc._.phrases),
    key=lambda entry: entry[1],
    reverse=True,
)

In [101]:
# Show the ten highest-ranked (text, rank, chunks) entries for the USYD sample.
phrase_rank[:10]

[('common unix tool manage aspect software construction process version control regression testing subject',
  0.0940700251393751,
  [common unix tool manage aspect software construction process version control regression testing subject]),
 ('standard link list datum structure high performance',
  0.0888826382464698,
  [standard link list datum structure high performance]),
 ('concurrent thread debugging tool technique',
  0.08799629729135139,
  [concurrent thread debugging tool technique]),
 ('error example code fixing use debugger',
  0.08631106180335711,
  [error example code fixing use debugger]),
 ('common programming error', 0.08616693776133381, [common programming error]),
 ('common memoryrelate error memory leak dangle pointer',
  0.08581231800785805,
  [common memoryrelate error memory leak dangle pointer]),
 ('high performance',
  0.08498104678511194,
  [high performance, high performance]),
 ('handle high performance', 0.08388062703009652, [handle high performance]),
 ('too

In [102]:
def get_top_n_keywords(doc, n):
    """Return the ``n`` words that occur most often across a doc's phrases.

    Each phrase in ``doc._.phrases`` has its ``text`` split on single spaces.
    Every resulting word is counted once per occurrence; empty strings
    (produced by consecutive spaces) are ignored.

    Args:
        doc: Object exposing ``doc._.phrases``, an iterable of items with a
            ``text`` attribute.
        n: Maximum number of words to return.

    Returns:
        Words ordered by descending count; words with equal counts keep the
        order in which they were first seen.
    """
    tally = {}

    words = (
        word
        for phrase in doc._.phrases
        for word in phrase.text.split(" ")
        if word != ""
    )
    for word in words:
        if word in tally:
            tally[word] += 1
        else:
            tally[word] = 1

    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked_words = sorted(tally, key=tally.get, reverse=True)
    return ranked_words[:n]

In [103]:
# Maps subject code (file name without ".txt") -> top keywords for that subject.
subject_code_to_keywords = {}

# Heading lines that delimit the description and learning-outcome sections on
# each university's subject page, as (start_line, end_line) pairs.
section_markers = {
    "uts": {
        "description": ("Description", "Subject learning objectives (SLOs)"),
        "slos": ("Subject learning objectives (SLOs)", "Course intended learning outcomes (CILOs)"),
    },
    "usyd": {
        "description": ("2024 unit information", "Unit details and rules"),
        "slos": ("At the completion of this unit, you should be able to:", "Unit availability"),
    },
}

for uni in ["uts", "usyd"]:
    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order (and the key order of the saved JSON) reproducible.
    filenames = sorted(os.listdir(f"./data/subjects/{uni}"))

    for filename in tqdm(filenames, desc=f"Processing {uni} subjects"):
        if not filename.endswith(".txt"):
            continue

        filepath = f"./data/subjects/{uni}/{filename}"
        text = read_file(filepath)

        description = get_text_section(text, *section_markers[uni]["description"])
        slos = get_text_section(text, *section_markers[uni]["slos"])

        relevant_text = clean_string(description + " " + slos)

        doc = nlp(relevant_text)
        keywords = get_top_n_keywords(doc, 20)

        # Strip only the trailing extension; str.replace would also remove
        # any ".txt" that appears earlier in the file name.
        subject_code = os.path.splitext(filename)[0]
        subject_code_to_keywords[subject_code] = keywords

Processing uts subjects:   0%|          | 0/265 [00:00<?, ?it/s]

Processing usyd subjects:   0%|          | 0/243 [00:00<?, ?it/s]

In [104]:
# Persist the subject-code -> keywords mapping as indented JSON.
with open("./data/subjects/subject_keywords.json", "w") as keywords_file:
    json.dump(subject_code_to_keywords, fp=keywords_file, indent=4)