# General

In [1]:
import os
from google.colab import drive

In [2]:
# Mount Google Drive at /content/gdrive so the shared data folder is reachable.
# force_remount=True asks Colab to remount even if the drive is already mounted.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Shared-drive folder (under the mount point above) that holds the data files;
# the FastText binary cc.en.300.bin is loaded from here further down.
INPUT_FOLDER = '/content/gdrive/Shareddrives/CSE 144 DATA/DATA'

In [4]:
# Two sample sentences fed to every embedding model below. Both start with
# 'This is good' and then continue with different nonsense tokens.
sentence1 = 'This is good idsoakfjlsakd jflksaj dfklj asdlkf'
sentence2 = 'This is good iwrejodiasjkzl asdf '

# Models

## GloVe

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [6]:
import spacy

In [7]:
# Load the spaCy pipeline downloaded in the install cell above.
# NOTE(review): this section is titled "GloVe" — confirm that en_core_web_md's
# word vectors are actually GloVe-trained before reporting them as such.
glove_model = spacy.load('en_core_web_md')

In [8]:
def word_embedding_glove(sentence):
  """Embed `sentence` with the spaCy pipeline held in `glove_model`.

  Args:
    sentence: text to embed.

  Returns:
    The `.vector` attribute of the processed document, i.e. spaCy's
    document-level vector for the whole sentence.
  """
  doc = glove_model(sentence)
  return doc.vector

In [78]:
# Embed both sample sentences with the spaCy-backed model; the results are
# reused by every similarity metric further down.
glove_embedding1 = word_embedding_glove(sentence1)
glove_embedding2 = word_embedding_glove(sentence2)

## FastText

In [None]:
!pip install fasttext
!pip install numpy
!pip install gensim

In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import fasttext

In [12]:
# Load FastText word embeddings model for the English language.
# The binary is read from INPUT_FOLDER, so the Drive mount cell must have run first.
fasttext_model = fasttext.load_model(INPUT_FOLDER+'/cc.en.300.bin')



In [13]:
!pip install numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
import numpy as np

def word_embedding_fasttext(sentence):
    """Embed `sentence` as the mean of its FastText word vectors.

    The sentence is lower-cased and split on whitespace; each resulting word is
    looked up with `fasttext_model.get_word_vector` and the vectors are averaged
    element-wise.

    Args:
        sentence: text to embed.

    Returns:
        A 1-D numpy array with one entry per embedding dimension. For an empty
        or whitespace-only sentence a zero vector of the model's dimension is
        returned.
    """
    words = sentence.lower().split()
    if not words:
        # np.mean over an empty list yields a scalar NaN (plus a RuntimeWarning),
        # which then breaks every downstream similarity metric. Return a
        # well-shaped zero vector instead.
        return np.zeros(fasttext_model.get_dimension(), dtype=np.float32)
    embeddings = [fasttext_model.get_word_vector(word) for word in words]
    return np.mean(embeddings, axis=0)

In [15]:
# Mean-of-word-vectors embeddings for the two sample sentences.
fasttext_embedding1 = word_embedding_fasttext(sentence1)
fasttext_embedding2 = word_embedding_fasttext(sentence2)

## ELMo

In [None]:
!pip install tensorflow
!pip install tensorflow_hub
!pip install numpy
!pip install sklearn

In [64]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [65]:
# Load the ELMo v3 module from TensorFlow Hub.
elmo_model = hub.load("https://tfhub.dev/google/elmo/3")
# NOTE(review): under TensorFlow 2 eager execution is presumably already on,
# which would make this call redundant — confirm. If it is needed, it should
# probably run before the model is loaded rather than after.
tf.compat.v1.enable_eager_execution()

In [66]:
def word_embedding_elmo(sentence):
    """Embed `sentence` with the TF-Hub ELMo module held in `elmo_model`.

    The sentence is wrapped in a one-element string tensor and passed through
    the module's "default" signature; the "default" entry of the result is
    converted to numpy and squeezed so the batch axis of size one disappears.

    Args:
        sentence: text to embed.

    Returns:
        A numpy array holding the module's "default" output for the sentence
        (presumably a sentence-level pooled vector — confirm in the TF-Hub docs).
    """
    batch = tf.constant([sentence])
    run_default = elmo_model.signatures["default"]
    default_output = run_default(batch)["default"]
    as_numpy = default_output.numpy()
    return as_numpy.squeeze()

In [67]:
# ELMo embeddings for the two sample sentences.
elmo_embedding1 = word_embedding_elmo(sentence1)
elmo_embedding2 = word_embedding_elmo(sentence2)

## BERT

In [None]:
!pip install transformers 
!pip install torch

In [None]:
from transformers import BertModel, BertTokenizer

# Load BERT model and tokenizer.
# NOTE: `model_name` and `tokenizer` are plain globals that the GPT section
# further down rebinds to GPT-2. Once that section has run, word_embedding_bert
# will pick up the GPT-2 tokenizer unless this cell is executed again.
model_name = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = BertModel.from_pretrained(model_name)

In [42]:
import torch

def word_embedding_bert(sentence):
  """Embed `sentence` as the mean of BERT's last-layer token states.

  The sentence is tokenized with the global `tokenizer` (special tokens
  included), run through the global `bert_model` without gradient tracking,
  and the last hidden states are averaged over the token axis.

  NOTE: `tokenizer` is a shared global that the GPT section of this notebook
  rebinds; make sure it is the BERT tokenizer when this function is called.

  Args:
    sentence: text to embed.

  Returns:
    A torch tensor holding the token-averaged last hidden state, with the
    batch axis of size one squeezed away.
  """
  # Truncate to the number of positions the model actually has embeddings for.
  # The previous hard-coded 1024 exceeded that limit for BERT, so inputs longer
  # than the model's position table were not truncated and failed at inference.
  position_limit = bert_model.config.max_position_embeddings
  tokens = tokenizer.encode(sentence, add_special_tokens=True, max_length=position_limit, truncation=True)
  ids = torch.tensor(tokens).unsqueeze(0)  # add the batch axis the model expects
  with torch.no_grad():
    return bert_model(ids).last_hidden_state.mean(dim=1).squeeze()

In [44]:
# BERT embeddings for the two sample sentences (torch tensors, since
# word_embedding_bert does not convert its result to numpy).
bert_embedding1 = word_embedding_bert(sentence1)
bert_embedding2 = word_embedding_bert(sentence2)

## GPT

In [None]:
!pip install transformers
!pip install torch

In [85]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [86]:
# Load the pretrained GPT-2 tokenizer and model.
# NOTE: this rebinds the `model_name` and `tokenizer` globals that the BERT
# section also uses; re-run the BERT loading cell before calling
# word_embedding_bert again.
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
gpt_model = GPT2Model.from_pretrained(model_name)
# GPT-2 ships without a padding token. Reuse the end-of-text token rather than
# adding a brand-new '[PAD]' entry: a new entry receives an id beyond the
# model's embedding table (which was never resized), so any input that actually
# gets padded would fail with an out-of-range index.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

1

In [87]:
def word_embedding_gpt(sentence):
    """Embed `sentence` as the mean of GPT-2's last-layer token states.

    Uses the global `tokenizer` and `gpt_model`. The tokenizer output is passed
    to the model as keyword arguments with gradient tracking disabled, and the
    last hidden states are averaged over the token axis.

    Args:
        sentence: text to embed.

    Returns:
        A numpy array that still carries the leading batch axis, because the
        token-axis mean is not squeezed.
    """
    model_inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        hidden_states = gpt_model(**model_inputs).last_hidden_state
        token_mean = hidden_states.mean(dim=1)
        return token_mean.numpy()

In [88]:
# GPT-2 embeddings for the two sample sentences. Unlike the other models these
# keep a leading batch axis (word_embedding_gpt does not squeeze its result).
gpt_embedding1 = word_embedding_gpt(sentence1)
gpt_embedding2 = word_embedding_gpt(sentence2)

# Similarity Scores

In [22]:
# Model names accepted by the similarity helpers below; each helper raises when
# given a name that is not in this list.
# NOTE(review): 'word2vec' is listed, but no word2vec embedding function appears
# in the visible part of this notebook — confirm whether it should be added or removed.
models = ['word2vec', 'fasttext', 'glove', 'elmo', 'bert', 'gpt']

## Cosine Similarity

In [None]:
!pip install sklearn

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

# [-1, 1]. higher value -> higher similarity
def get_cosine_similarity(job_embedding, resume_embedding, model):
  """Cosine similarity between two embeddings produced by the same model.

  Both embeddings are converted to numpy and flattened into a single row, so
  1-D vectors, (1, d) arrays and CPU torch tensors are all handled the same
  way. This replaces the earlier per-model branching, under which 'word2vec'
  (a valid entry of `models`) fell through to the GPT branch and failed on
  1-D input.

  Args:
    job_embedding: embedding of the first text.
    resume_embedding: embedding of the second text (same number of elements).
    model: name of the model that produced the embeddings; must be in `models`.

  Returns:
    The cosine similarity as a scalar in [-1, 1]; higher means more similar.

  Raises:
    Exception: if `model` is not one of the names in `models`.
  """
  if model not in models:
    raise Exception('Model Not Found!')

  # sklearn expects 2-D (n_samples, n_features) input: one row per embedding.
  job_row = np.asarray(job_embedding).reshape(1, -1)
  resume_row = np.asarray(resume_embedding).reshape(1, -1)
  return cosine_similarity(job_row, resume_row)[0, 0]

In [90]:
# Cosine similarity of the two sample sentences under each embedding model.
# Each line needs the matching *_embedding1/2 variables from the model sections above.
fasttext_cos = get_cosine_similarity(fasttext_embedding1, fasttext_embedding2, 'fasttext')
glove_cos = get_cosine_similarity(glove_embedding1, glove_embedding2, 'glove')
bert_cos = get_cosine_similarity(bert_embedding1, bert_embedding2, 'bert')
gpt_cos = get_cosine_similarity(gpt_embedding1, gpt_embedding2, 'gpt')
elmo_cos = get_cosine_similarity(elmo_embedding1, elmo_embedding2, 'elmo')

In [91]:
# Report the cosine similarities computed in the previous cell, one model per line.
print('Fasttext Cosine Similarity   :  ', fasttext_cos)
print('Glove    Cosine Similarity   :  ', glove_cos)
print('Bert     Cosine Similarity   :  ', bert_cos)
print('GPT      Cosine Similarity   :  ', gpt_cos)
print('Elmo      Cosine Similarity  :  ', elmo_cos)

Fasttext Cosine Similarity   :   0.89969605
Glove    Cosine Similarity   :   0.9999999
Bert     Cosine Similarity   :   0.85077584
GPT      Cosine Similarity   :   0.9991702
Elmo      Cosine Similarity  :   0.8783331


## Euclidean Distance

In [37]:
!pip install sklearn

In [54]:
from scipy.spatial.distance import euclidean
from sklearn.metrics.pairwise import euclidean_distances

# non-negative value, lower the value -> higher the similarity
def get_eucledian_distance(job_embedding, resume_embedding, model):
  """Euclidean distance between two embeddings produced by the same model.

  Args:
    job_embedding: embedding of the first text.
    resume_embedding: embedding of the second text.
    model: name of the model that produced the embeddings; must be in `models`.

  Returns:
    A non-negative scalar; a lower value means the texts are more similar.
    FastText and ELMo embeddings go through scipy's `euclidean`; every other
    model uses the numpy norm of the element-wise difference.

  Raises:
    Exception: if `model` is not one of the names in `models`.
  """
  if model not in models:
    raise Exception('Model Not Found!')

  # GloVe, BERT, or GPT (and any other accepted name): norm of the difference.
  if model not in ['fasttext', 'elmo']:
    difference = job_embedding - resume_embedding
    return np.linalg.norm(difference)

  return euclidean(job_embedding, resume_embedding)

In [92]:
# Euclidean distance between the two sample sentences under each embedding model.
fasttext_euc = get_eucledian_distance(fasttext_embedding1, fasttext_embedding2, 'fasttext')
glove_euc = get_eucledian_distance(glove_embedding1, glove_embedding2, 'glove')
bert_euc = get_eucledian_distance(bert_embedding1, bert_embedding2, 'bert')
gpt_euc = get_eucledian_distance(gpt_embedding1, gpt_embedding2, 'gpt')
elmo_euc = get_eucledian_distance(elmo_embedding1, elmo_embedding2, 'elmo')

In [93]:
# Report the Euclidean distances computed in the previous cell. The values are
# not comparable across models because each embedding space has its own scale.
print('Fasttext Eucledian Distance   :  ', fasttext_euc)
print('Glove    Eucledian Distance   :  ', glove_euc)
print('Bert      Eucledian Distance  :  ', bert_euc)
print('GPT      Eucledian Distance   :  ', gpt_euc)
print('Elmo      Eucledian Distance  :  ', elmo_euc)

Fasttext Eucledian Distance   :   0.4699311852455139
Glove    Eucledian Distance   :   9.389667
Bert      Eucledian Distance  :   5.4965577
GPT      Eucledian Distance   :   12.943077
Elmo      Eucledian Distance  :   4.928329944610596


## Pearson Correlation Coefficient

In [None]:
!pip install scipy

In [57]:
from scipy.stats import pearsonr

# -1 to 1. -1 perfect negative linear relationship, 0 none, 1 perfect positive
def get_pearson_coefficient(job_embedding, resume_embedding, model):
  """Pearson correlation coefficient between two embeddings.

  Args:
    job_embedding: embedding of the first text.
    resume_embedding: embedding of the second text.
    model: name of the model that produced the embeddings; must be in `models`.

  Returns:
    The correlation coefficient from `scipy.stats.pearsonr`, between -1 and 1
    (the accompanying p-value is discarded).

  Raises:
    Exception: if `model` is not one of the names in `models`.
  """
  if model not in models:
    raise Exception('Model Not Found!')

  if model == 'gpt':
    # GPT embeddings carry a leading batch axis; correlate its first row.
    job_vector, resume_vector = job_embedding[0], resume_embedding[0]
  else:
    job_vector, resume_vector = job_embedding, resume_embedding

  coeff, _ = pearsonr(job_vector, resume_vector)
  return coeff
    

In [94]:
# Pearson correlation between the two sample sentences' embeddings, per model.
fasttext_pear = get_pearson_coefficient(fasttext_embedding1, fasttext_embedding2, 'fasttext')
glove_pear = get_pearson_coefficient(glove_embedding1, glove_embedding2, 'glove')
bert_pear = get_pearson_coefficient(bert_embedding1, bert_embedding2, 'bert')
gpt_pear = get_pearson_coefficient(gpt_embedding1, gpt_embedding2, 'gpt')
elmo_pear = get_pearson_coefficient(elmo_embedding1, elmo_embedding2, 'elmo')

In [95]:
# Report the Pearson coefficients computed in the previous cell, one model per line.
print('Fasttext Pearson Coefficient   :  ', fasttext_pear)
print('Glove    Pearson Coefficient   :  ', glove_pear)
print('Bert      Pearson Coefficient  :  ', bert_pear)
print('GPT       Pearson Coefficient  :  ', gpt_pear)
print('Elmo      Pearson Coefficient  :  ', elmo_pear)

Fasttext Pearson Coefficient   :   0.8994260778814983
Glove    Pearson Coefficient   :   0.9999999999999994
Bert      Pearson Coefficient  :   0.8507406101920222
GPT       Pearson Coefficient  :   0.9991680796707327
Elmo      Pearson Coefficient  :   0.8771596351549465


## Manhattan Distance

In [None]:
!pip install scipy

In [60]:
from scipy.spatial.distance import cityblock

# Kept so existing references to this global keep working; get_manhattan_distance
# no longer pads to this fixed length.
max_length = 10000

# non-negative. higher value -> less similar
def get_manhattan_distance(job_embedding, resume_embedding, model):
  """Manhattan (L1) distance between two embeddings produced by the same model.

  FastText, GloVe and ELMo embeddings go through scipy's `cityblock`. For every
  other model both embeddings are converted to numpy and flattened to 1-D before
  the L1 norm of their difference is taken.

  The previous implementation padded with `np.pad(x, (0, max_length - len(x)))`
  and called `np.linalg.norm(..., ord=1)` on the result. For the 2-D GPT
  embedding that padded both axes into a huge matrix, and ord=1 on a 2-D array
  is the maximum column sum rather than the sum of absolute differences, so the
  value was not a Manhattan distance. It also failed for vectors longer than
  `max_length`.

  Args:
    job_embedding: embedding of the first text.
    resume_embedding: embedding of the second text.
    model: name of the model that produced the embeddings; must be in `models`.

  Returns:
    A non-negative scalar; a higher value means the texts are less similar.

  Raises:
    Exception: if `model` is not one of the names in `models`.
  """
  if model not in models:
    raise Exception('Model Not Found!')

  if model in ['fasttext', 'glove', 'elmo']:
    return cityblock(job_embedding, resume_embedding)

  # BERT or GPT (and any other accepted name)
  job_vector = np.asarray(job_embedding).ravel()
  resume_vector = np.asarray(resume_embedding).ravel()

  # Zero-pad the shorter vector to the common length. This keeps the old
  # tolerance for unequal lengths without the fixed 10000-element cap.
  width = max(job_vector.size, resume_vector.size)
  job_vector = np.pad(job_vector, (0, width - job_vector.size))
  resume_vector = np.pad(resume_vector, (0, width - resume_vector.size))

  # On 1-D input ord=1 is the sum of absolute values, i.e. the L1 distance.
  return np.linalg.norm(job_vector - resume_vector, ord=1)

In [None]:
# Manhattan distance between the two sample sentences under each embedding model.
# This cell must run before the print cell below, which reads all five results.
fasttext_manh = get_manhattan_distance(fasttext_embedding1, fasttext_embedding2, 'fasttext')
glove_manh = get_manhattan_distance(glove_embedding1, glove_embedding2, 'glove')
bert_manh = get_manhattan_distance(bert_embedding1, bert_embedding2, 'bert')
gpt_manh = get_manhattan_distance(gpt_embedding1, gpt_embedding2, 'gpt')
elmo_manh = get_manhattan_distance(elmo_embedding1, elmo_embedding2, 'elmo')

In [96]:
# Report the Manhattan distances, one model per line.
# NOTE: the saved output stops after the BERT line with a NameError, i.e. the
# GPT result was not defined when this cell last ran; re-run the cell above first.
print('Fasttext Manhattan Distance    :  ', fasttext_manh)
print('Glove    Manhattan Distance    :  ', glove_manh)
print('Bert     Manhattan Distance    :  ', bert_manh)
print('GPT      Manhattan Distance    :  ', gpt_manh)
print('Elmo     Manhattan Distance    :  ', elmo_manh)

Fasttext Manhattan Distance    :   6.1201916
Glove    Manhattan Distance    :   128.31859
Bert     Manhattan Distance    :   140.0769


NameError: ignored

# TF-IDF

In [None]:
!pip install sklearn

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TFIDF score for a term in a document indicates how important or relevant 
# that term is to the specific document, considering its frequency in the 
# document and its rarity in the corpus.
def get_tfidf(resume_description_list):
  """Print the TF-IDF score of every term in every document.

  A fresh `TfidfVectorizer` is fitted on the given documents. For each
  document, one line is printed per term with a non-zero score, showing the
  term, its score to four decimals and the 1-based document number.

  Args:
    resume_description_list: the documents (strings) that form the corpus.

  Returns:
    None; the scores are only printed.
  """
  vectorizer = TfidfVectorizer()
  tfidf_matrix = vectorizer.fit_transform(resume_description_list)
  feature_names = vectorizer.get_feature_names_out()

  # Print the TF-IDF scores for each term in each document
  for doc_idx, _ in enumerate(resume_description_list):
    # Column indices of the terms that actually occur in this document.
    nonzero_columns = tfidf_matrix[doc_idx].nonzero()[1]
    for term_idx in nonzero_columns:
      score = tfidf_matrix[doc_idx, term_idx]
      print(f"Term: {feature_names[term_idx]}, TF-IDF score: {score:.4f}, Document: {doc_idx+1}")


In [39]:
# Small toy corpus used to sanity-check get_tfidf; the documents deliberately
# share most of their words.
resume_list = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]
get_tfidf(resume_list)

# TODO: experiment with this further later

Term: document, TF-IDF score: 0.4698, Document: 1
Term: first, TF-IDF score: 0.5803, Document: 1
Term: the, TF-IDF score: 0.3841, Document: 1
Term: is, TF-IDF score: 0.3841, Document: 1
Term: this, TF-IDF score: 0.3841, Document: 1
Term: second, TF-IDF score: 0.5386, Document: 2
Term: document, TF-IDF score: 0.6876, Document: 2
Term: the, TF-IDF score: 0.2811, Document: 2
Term: is, TF-IDF score: 0.2811, Document: 2
Term: this, TF-IDF score: 0.2811, Document: 2
Term: one, TF-IDF score: 0.5118, Document: 3
Term: third, TF-IDF score: 0.5118, Document: 3
Term: and, TF-IDF score: 0.5118, Document: 3
Term: the, TF-IDF score: 0.2671, Document: 3
Term: is, TF-IDF score: 0.2671, Document: 3
Term: this, TF-IDF score: 0.2671, Document: 3
Term: document, TF-IDF score: 0.4698, Document: 4
Term: first, TF-IDF score: 0.5803, Document: 4
Term: the, TF-IDF score: 0.3841, Document: 4
Term: is, TF-IDF score: 0.3841, Document: 4
Term: this, TF-IDF score: 0.3841, Document: 4
