In [1]:
import pandas as pd
import numpy as np
from numpy import empty
import random
import string
import spacy
from spacy.util import minibatch, compounding
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import gensim



In [2]:
# read in dictionary for mapping keywords to skills
course = pd.read_csv('./data/courses_textrank_labels.csv')
course['text'] = course['title'] + ' ' + course['description']
label_to_skill = dict(zip(course['label'], course['skill']))

In [3]:
# load trained NER model
model_dir='./models/train_textrank_labels'
nlp = spacy.load(model_dir)

In [4]:
# apply trained NER
def apply_model(test_text):
    skills = []
    doc = nlp(test_text)
    for ent in doc.ents:
        if label_to_skill.get(ent.text):
            skills.append(label_to_skill.get(ent.text))
        else:
            skills.append(ent.text)
    return skills

In [5]:
course['predicted'] = course['text'].map(apply_model)
# combine multiple skills to single string
course['predicted_str'] = course['predicted'].apply(lambda x: ' '.join(x))

In [6]:
# prepare for preprocessing
punctuations = string.punctuation
stopwords = stopwords.words('english')
nlp = spacy.load('en_core_web_sm')

In [7]:
# tokenize the strings
def tokenize(doc):
    doc = nlp(doc, disable=['parser', 'ner'])
    tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
    tokens = [tok for tok in tokens if tok not in stopwords and tok not in punctuations]
    tokens = [tok.translate(str.maketrans('', '', string.punctuation)) for tok in tokens] # remove remaining punctuations
    tokens = [''.join([i for i in tok if not i.isdigit()]) for tok in tokens]
    return tokens

In [8]:
course['predicted_tok'] = course['predicted_str'].map(tokenize)
course['skill_tok'] = course['skill'].map(tokenize)

In [9]:
# load pretrained word2vec embedding
model = gensim.models.KeyedVectors.load_word2vec_format('lexvec.enwiki+newscrawl.300d.W.pos.vectors.gz')

In [10]:
# vectorize by word2vec, adding up vectors of each word to get synthetic vector for each skill  
def vectorize(list):
    vec_tot = empty([300,])
    for tok in list:
        try:
            vec = model.get_vector(tok)
            vec_tot += vec
        except (KeyError):
            continue
    return vec_tot

In [11]:
course['predicted_vec'] = course['predicted_tok'].apply(lambda x: vectorize(x))
course['skill_vec'] = course['skill_tok'].apply(lambda x: vectorize(x))
course['predicted_vec'] = course['predicted_vec'].apply(lambda x: np.nan_to_num(x))
course['skill_vec'] = course['skill_vec'].apply(lambda x: np.nan_to_num(x))

In [12]:
# calculate cosine similarity between predicted skill and true skill tag
course['cos_sim'] = course.apply(lambda row: cosine_similarity([row['predicted_vec']], [row['skill_vec']]), axis=1)

In [13]:
course['cos_sim'].mean()

array([[0.90272676]])

In [14]:
course['cos_sim'].std()

0.21077153846421604