In [None]:
import spacy
from spacy.language import Language


In [2]:
nlp = spacy.load('en_core_web_lg',disable = ['ner', 'parser'])
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(nlp.pipe_names)

['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer']


In [3]:
doc = """Supervised learning is the machine learning task of 
         learning a function that maps an input to an output based 
         on example input-output pairs [1] It infers a function 
         from labeled training data consisting of a set of 
         training examples [2] In supervised learning, each 
         example is a pair consisting of an input object 
         (typically a vector) and a desired output value (also 
         called the supervisory signal). A supervised learning 
         algorithm analyzes the training data and produces an 
         inferred function, which can be used for mapping new 
         examples. An optimal scenario will allow for the algorithm 
         to correctly determine the class labels for unseen 
         instances. This requires the learning algorithm to  
         generalize from the training data to unseen situations 
         in a 'reasonable' way (see inductive bias)?
      """

In [4]:
doc

"Supervised learning is the machine learning task of \n         learning a function that maps an input to an output based \n         on example input-output pairs [1] It infers a function \n         from labeled training data consisting of a set of \n         training examples [2] In supervised learning, each \n         example is a pair consisting of an input object \n         (typically a vector) and a desired output value (also \n         called the supervisory signal). A supervised learning \n         algorithm analyzes the training data and produces an \n         inferred function, which can be used for mapping new \n         examples. An optimal scenario will allow for the algorithm \n         to correctly determine the class labels for unseen \n         instances. This requires the learning algorithm to  \n         generalize from the training data to unseen situations \n         in a 'reasonable' way (see inductive bias)?\n      "

In [5]:
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', '', text.lower())
    return text


tokens = nlp(doc)
lemma_list = []
for token in tokens:
    if token.is_stop is False:
        token_preprocessed = preprocessor(token.lemma_)
        if token_preprocessed != '':
             lemma_list.append(nlp(token_preprocessed))
lemma_list
        
        

[supervised,
 learning,
 machine,
 learn,
 task,
 learn,
 function,
 map,
 input,
 output,
 base,
 example,
 input,
 output,
 pair,
 1,
 infer,
 function,
 label,
 training,
 datum,
 consist,
 set,
 training,
 example,
 2,
 supervised,
 learning,
 example,
 pair,
 consist,
 input,
 object,
 typically,
 vector,
 desire,
 output,
 value,
 call,
 supervisory,
 signal,
 supervised,
 learn,
 algorithm,
 analyze,
 training,
 datum,
 produce,
 infer,
 function,
 map,
 new,
 example,
 optimal,
 scenario,
 allow,
 algorithm,
 correctly,
 determine,
 class,
 label,
 unseen,
 instance,
 require,
 learning,
 algorithm,
 generalize,
 training,
 datum,
 unseen,
 situation,
 reasonable,
 way,
 inductive,
 bias]

### Trying with spacy models

In [9]:
key = nlp("supervised learning")

for i in lemma_list:
    s = key.similarity(i)
    
    if s > 0.5:
        print("Key:{} \nWord Found:{} \nSimilarity score:{}".format(key,i,s))
        print("----------------------------------------------------")  


Key:supervised learning 
Word Found:supervised 
Similarity score:0.8173653460799903
----------------------------------------------------
Key:supervised learning 
Word Found:learning 
Similarity score:0.8271284631559636
----------------------------------------------------
Key:supervised learning 
Word Found:learn 
Similarity score:0.5889739049717068
----------------------------------------------------
Key:supervised learning 
Word Found:learn 
Similarity score:0.5889739049717068
----------------------------------------------------
Key:supervised learning 
Word Found:training 
Similarity score:0.6670860332021503
----------------------------------------------------
Key:supervised learning 
Word Found:training 
Similarity score:0.6670860332021503
----------------------------------------------------
Key:supervised learning 
Word Found:supervised 
Similarity score:0.8173653460799903
----------------------------------------------------
Key:supervised learning 
Word Found:learning 
Similarity 

### Trying with transformer models 

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [11]:
model = SentenceTransformer('stsb-roberta-large')

Downloading:   0%|          | 0.00/748 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [13]:
sentence1 = "Word"
sentence2 = "Word"
# encode sentences to get their embeddings
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)
# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Similarity score:", cosine_scores.item())

Sentence 1: Word
Sentence 2: Word
Similarity score: 1.0000007152557373


In [None]:
key = "learn"

embedding1 = model.encode(key, convert_to_tensor=True)

for i in lemma_list:
    i = i.text
     
    embedding2 = model.encode(i, convert_to_tensor=True)
    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    
    if cosine_scores > 0.5:
        print("Key:{} \nWord Found:{} \nSimilarity score:{}".format(key,i,cosine_scores[0]))
        print("----------------------------------------------------")  

Key:learn 
Word Found:learning 
Similarity score:tensor([[0.9376]])
----------------------------------------------------
Key:learn 
Word Found:learn 
Similarity score:tensor([[1.0000]])
----------------------------------------------------
Key:learn 
Word Found:learn 
Similarity score:tensor([[1.0000]])
----------------------------------------------------
