In [1]:
import pandas as pd

# Load Data

In [2]:
df = pd.read_excel(
    '../data/CSC869 Term Project Dataset.xlsx', 
    sheet_name='Course Descriptions', 
    skiprows=3, 
    names=['School', 'MATH226 eqv', 'MATH226 mult eqv', 'MATH226 alt eqv', 'MATH226 alt mult eqv', 'CSC230 eqv', 'CSC230 alt eqv', 'CSC256 eqv', 'CSC256 multipleEqv'],
    index_col=None
)

## Fetch Data

In [3]:
def fetch_course_descriptions():
    math_226_columns = ['MATH226 eqv','MATH226 mult eqv', 'MATH226 alt eqv', 'MATH226 alt mult eqv']
    course_descriptions_math_226 = [df[column].tolist() for column in math_226_columns] 
    course_descriptions_math_226_string = " ".join(str(x) for x in course_descriptions_math_226)

    csc_230_columns = ['CSC230 eqv', 'CSC230 alt eqv']
    course_descriptions_csc_230 = [df[column].tolist() for column in csc_230_columns] 
    course_descriptions_csc_230_string = " ".join(str(x) for x in course_descriptions_csc_230)

    csc_256_columns = ['CSC256 eqv', 'CSC256 multipleEqv']
    course_descriptions_csc_256 = [df[column].tolist() for column in csc_256_columns] 
    course_descriptions_csc_256_string = " ".join(str(x) for x in course_descriptions_csc_256)
    
    return [course_descriptions_math_226_string, course_descriptions_csc_230_string, course_descriptions_csc_256_string]


## Trying with sklearn word extractor.

In [4]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

#Fetching data
[course_descriptions_math_226_string, course_descriptions_csc_230_string, course_descriptions_csc_256_string] = fetch_course_descriptions()

# Function to extract nouns and adjectives from a text
def extract_nouns_adjectives_verbs(text):
    doc = nlp(text)
    tokens = [token.text.lower() for token in doc if token.pos_ in ["NOUN","VERB", "ADJ"]]
    return " ".join(tokens)

def skills_extractor(course_description):
    # Preprocess the course description
    processed_description = extract_nouns_adjectives_verbs(course_description)

    # Use TF-IDF Vectorizer to convert the text into numerical features
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([processed_description])

    # Use KMeans clustering to find potential clusters (topics)
    kmeans = KMeans(n_clusters=1)
    kmeans.fit(X)

    # Get the top terms (features) for each cluster
    terms = vectorizer.get_feature_names_out()
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    # Extract skills from the top terms of the first cluster
    num_skills_to_extract = 30
    skills = [terms[ind] for ind in order_centroids[0, :num_skills_to_extract]]

    return skills
print("Extracted Skills Math 226:", skills_extractor(course_descriptions_math_226_string))
print("Extracted Skills CSC  230:", skills_extractor(course_descriptions_csc_230_string))
print("Extracted Skills CSC  256:", skills_extractor(course_descriptions_csc_256_string))

  super()._check_params_vs_input(X, default_n_init=10)


Extracted Skills Math 226: ['calculus', 'applications', 'functions', 'differentiation', 'limits', 'course', 'continuity', 'integration', 'grade', 'fundamental', 'integral', 'better', 'units', 'derivatives', 'hours', 'definite', 'trigonometric', 'first', 'n5', 'placement', 'credit', 'differential', 'derivative', 'including', 'math', 'integrals', 'students', 'area', 'problems', 'rule']


  super()._check_params_vs_input(X, default_n_init=10)


Extracted Skills CSC  230: ['discrete', 'logic', 'relations', 'theory', 'functions', 'computer', 'hours', 'science', 'mathematics', 'sets', 'proof', 'course', 'structures', 'topics', 'applications', 'probability', 'trees', 'include', 'mathematical', 'graphs', 'introduction', 'techniques', 'induction', 'units', 'grade', 'combinatorics', 'used', 'counting', 'lecture', 'recursion']
Extracted Skills CSC  256: ['language', 'assembly', 'computer', 'level', 'hours', 'data', 'programming', 'representation', 'architecture', 'machine', 'systems', 'organization', 'course', 'lecture', 'logic', 'high', 'internal', 'units', 'memory', 'system', 'introduction', 'interrupts', 'instructions', 'design', 'errors', 'operating', 'addressing', 'include', 'instruction', 'lab']


## Trying using Lightcast API

In [5]:
import requests

url = "https://auth.emsicloud.com/connect/token"

payload = "client_id=CLIENT_ID&client_secret=CLIENT_SECRET&grant_type=client_credentials&scope=emsi_open"
headers = {'Content-Type': 'application/x-www-form-urlencoded'}

response = requests.request("POST", url, data=payload, headers=headers)

print(response.text)

{"access_token":"eyJhbGciOiJSUzI1NiIsImtpZCI6IjNDNjZCRjIzMjBGNkY4RDQ2QzJERDhCMjI0MEVGMTFENTZEQkY3MUYiLCJ0eXAiOiJKV1QiLCJ4NXQiOiJQR2FfSXlEMi1OUnNMZGl5SkE3eEhWYmI5eDgifQ.eyJuYmYiOjE3MDI1NTg3NjksImV4cCI6MTcwMjU2MjM2OSwiaXNzIjoiaHR0cHM6Ly9hdXRoLmVtc2ljbG91ZC5jb20iLCJhdWQiOlsiZW1zaV9vcGVuIiwiaHR0cHM6Ly9hdXRoLmVtc2ljbG91ZC5jb20vcmVzb3VyY2VzIl0sImNsaWVudF9pZCI6InBtemh1c2N4bHkzcWJ5cmgiLCJlbWFpbCI6InBwYW5jaGFsQHNmc3UuZWR1IiwiY29tcGFueSI6IlNhbiBGcmFuY2lzY28gU3RhdGUgVW5pdmVyc2l0eSIsIm5hbWUiOiJQYXJ0aCBQYW5jaGFsIiwiaWF0IjoxNzAyNTU4NzY5LCJzY29wZSI6WyJlbXNpX29wZW4iXX0.gCvlrd-si-oJw54XMl154Uqxc1Zvc_XgCoTx36in2qYYSndMG_wmW5ZVA_onEYx5loo06RFZMOgXg4509ACF463DnfoWesGAj7fLo33k_1fZlmeuibPvsVHGKJpX5HWt_r4ohjGHCRY21euLt5MkDM0k0MfSXv8Vxns56u8jTKjo-ns5fx1xSurAv5fMYLLrxzSIVfv175lBhqD5ds0zQxWqf83YpGIljgYeBLX_fITEW5-b3BLboWJP9rJVVaUrtJWQxmJ_DGO3FZ9-TbnWFVgKaZrF1Kym51x63GO91cb_iayHyDQXGv6zMAyzDCECWG_mns7_B2qkJBeXWEIvvQ","expires_in":3600,"token_type":"Bearer","scope":"emsi_open"}


In [6]:
# Dropped this strategy since this only has 50 requests per month in free tier
# Link: https://docs.lightcast.dev/apis/skills#versions-version-extract
import requests
import json

#Fetching data
[course_descriptions_math_226_string, course_descriptions_csc_230_string, course_descriptions_csc_256_string] = fetch_course_descriptions()


#Sending request
url = "https://emsiservices.com/skills/versions/latest/extract"
querystring = {"language":"en"}
headers = {
    'Authorization': "Bearer BEARER_TOKEN",
    'Content-Type': "application/json"
}

math_226_payloadObject = {"text": course_descriptions_math_226_string, "confidenceThreshold": 0.6}
math_226_payload = json.dumps(math_226_payloadObject)
math_226_response = requests.request("POST", url, data=math_226_payload, headers=headers, params=querystring)

csc_230_payloadObject = {"text": course_descriptions_csc_230_string, "confidenceThreshold": 0.6}
csc_230_payload = json.dumps(csc_230_payloadObject)
csc_230_response = requests.request("POST", url, data=csc_230_payload, headers=headers, params=querystring)

csc_256_payloadObject = {"text": course_descriptions_csc_256_string, "confidenceThreshold": 0.6}
csc_256_payload = json.dumps(csc_256_payloadObject)
csc_256_response = requests.request("POST", url, data=csc_256_payload, headers=headers, params=querystring)

In [7]:
math_226_skills = [ x['skill']['name'] for x in json.loads(math_226_response.text)['data']]
print("Math 226")
print(math_226_skills)

csc_230_skills = [ x['skill']['name'] for x in json.loads(csc_230_response.text)['data']]
print("CSC 226")
print(csc_230_skills)

csc_256_skills = [ x['skill']['name'] for x in json.loads(csc_256_response.text)['data']]
print("CSC 226")
print(csc_256_skills)

Math 226
['Scientific Reasoning', 'Euclidean Geometry', 'Mathematics', 'Next Unit Of Computing (NUC)', 'Algebra', 'Numerical Analysis', 'Analytic Geometry', 'Nunit', 'Analytical Thinking', 'Geology', 'Mathematical Analysis', 'Data Science', 'Advanced Mathematics', 'Biology', 'CompTIA Network+', 'Trigonometry', 'Communication', 'Precalculus', 'Graphing Calculator', 'Assessment And Learning In Knowledge Spaces (ALEKS)', 'Logarithmic Functions', 'Physical Science', 'Physics', 'Registered Sleep Technologist', 'Calculus', 'Laboratory Experience', 'Mathematics Education', 'Geometry', 'Computer Science', 'Differential Equations', 'Natural Sciences', 'Differential Calculus', 'Integral Calculus', 'English Language', 'Economics', 'Derivatives', 'Nurse Practitioner (APRN-CNP)', 'Laurentz Contact Resonance', 'Differentials', 'Problem Solving', 'R (Programming Language)', 'Sketching']
CSC 226
['Scientific Reasoning', 'Digital Logic', 'Number Systems', 'Mathematics', 'Cryptography', 'Number Theory',

## Trying using Nesta

In [8]:
#Loading the Model

from ojd_daps_skills.pipeline.extract_skills.extract_skills import ExtractSkills #import the module

es = ExtractSkills(config_name="extract_skills_lightcast", local=True, multi_process=True) #instantiate with lightcast taxonomy configuration file
# es = ExtractSkills(config_name="extract_skills_toy", local=True) #instantiate with skills toy configuration file
# es = ExtractSkills(config_name="extract_skills_esco", local=True, multi_process=True) #instantiate with skills toy configuration file

es.load() #load necessary models


  from .autonotebook import tqdm as notebook_tqdm
[94;1;1m2023-12-14 05:01:05,800 - SkillsExtractor - INFO - Loading the model from a local location (ner_spacy.py:507)[0m
[94;1;1m2023-12-14 05:01:05,801 - SkillsExtractor - INFO - Loading the model from /home/parth/.local/lib/python3.10/site-packages/ojd_daps_skills_data/outputs/models/ner_model/20220825/ (ner_spacy.py:510)[0m
[94;1;1m2023-12-14 05:01:07,048 - SkillsExtractor - INFO - Loading 'lightcast' taxonomy information (extract_skills.py:151)[0m
[94;1;1m2023-12-14 05:01:07,576 - SkillsExtractor - INFO - Loaded 'lightcast' taxononmy skills (skill_ner_mapper.py:228)[0m
[94;1;1m2023-12-14 05:01:09,592 - SkillsExtractor - INFO - Preprocessed 'lightcast' taxononmy skills (skill_ner_mapper.py:241)[0m
[94;1;1m2023-12-14 05:01:09,593 - SkillsExtractor - INFO - Loading taxonomy embeddings from ojd_daps_skills_data/outputs/data/skill_ner_mapping/lightcast_embeddings.json (extract_skills.py:196)[0m
[94;1;1m2023-12-14 05:01:13,43

## Extracting skills from all colleges for a particular course

In [9]:
#Fetching Data
math_226_columns = ['MATH226 eqv','MATH226 mult eqv', 'MATH226 alt eqv', 'MATH226 alt mult eqv']
course_descriptions_math_226 = [df[column].tolist() for column in math_226_columns] 
course_descriptions_math_226_string = " ".join(str(x) for x in course_descriptions_math_226)

csc_230_columns = ['CSC230 eqv', 'CSC230 alt eqv']
course_descriptions_csc_230 = [df[column].tolist() for column in csc_230_columns] 
course_descriptions_csc_230_string = " ".join(str(x) for x in course_descriptions_csc_230)

csc_256_columns = ['CSC256 eqv', 'CSC256 multipleEqv']
course_descriptions_csc_256 = [df[column].tolist() for column in csc_256_columns] 
course_descriptions_csc_256_string = " ".join(str(x) for x in course_descriptions_csc_256)

#Getting skills
predicted_skills_math_226 = es.get_skills(course_descriptions_math_226_string) #extract skills from course description.
predicted_skills_csc_230 = es.get_skills(course_descriptions_csc_230_string) #extract skills from course description.
predicted_skills_csc_256 = es.get_skills(course_descriptions_csc_256_string) #extract skills from course description.

print("MATH 226")
print(predicted_skills_math_226[0]['SKILL'])
print("CSC 230")
print(predicted_skills_csc_230[0]['SKILL'])
print("CSC 256")
print(predicted_skills_csc_256[0]['SKILL'])

MATH 226
['Differentiation', 'Indefinite', 'substitution rule and applications', "', 'MATH 141", 'Trigonometry', 'Integration by substitution', 'Note', 'Students without recent credit in MATH 1030', 'curve sketching', 'Quantitative Reasoning', 'Satisfactory score of 78 or higher on Mathematics Placement Exam', 'Area under a curve.', 'anti-differentiation', 'indefinite integrals', 'Quantitative Reasoning', 'Quantitative Reasoning', 'MATH 1080', 'nintroduction', 'Calculus', 'analytic geometry', 'sketching', 'Primarily for science, technology, engineering and math majors', '', 'Evaluate the behavior of graphs', 'n.', 'solving applied real world problems', 'numerical approximation', 'graphing calculator', 'Data Science', 'Repeatable', 'analyzing nthe behavior', 'analyzing graphs', 'nindefinite integrals', 'C-ID# MATH', 'Indefinite', 'Calculus', 'MATH-108 with a minimum grade of C or appropriate placement', 'Total of 90 hours lecture', 'LCR', 'Letter', 'algebra', 'geometry', 'Proof of compl