In this and the previous notebooks (04_TFIDF_and...) we tried tf-idf, count-vectorizer and SciBERT models to identify named entities or concepts in the *Mathematics for Machine Learning* textbook.

## 1. ScispaCy

In [None]:
# 1. Create a dedicated conda environment for scispacy (run once, in a shell):
#    conda create -n scispacy python=3.9

# 2. Install a scispaCy model trained on scientific publications.
#    The two commented-out lines are alternative models (small and SciBERT-based).
# #! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
# ! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz
! pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz
# NOTE: the scibert model doesn't have vectors; the entity table built later in
# this notebook stores `entity.vector`, which is presumably why `md` is used.

# 3. Install spacy (uncomment if it is not already available)
#! pip install spacy


# Uncomment if pandas is not already available
#! pip install pandas
import pandas as pd
import spacy

In [2]:
# Load the parsed textbook. As displayed, the frame is indexed by chapter title
# and has a single `text` column holding the chapter content.
data = pd.read_json("../dat/parsed_books/mml.json")
data

Unnamed: 0,text
Analytic Geometry,"In Chapter 2, we studied vectors, vector space..."
Continuous Optimization,Since machine learning algorithms are implemen...
Introduction and Motivation,Machine learning is about designing algorithms...
Linear Algebra,"When formalizing intuitive concepts, a common ..."
Matrix Decompositions,"In Chapters 2 and 3, we studied ways to manipu..."
Probability and Distributions,"Probability, loosely speaking, concerns the st..."
Vector Calculus,Many algorithms in machine learning optimize a...


In [104]:
# Load the scispaCy pipeline and run it over the "Linear Algebra" chapter.
nlp = spacy.load("en_core_sci_md")
doc=nlp(data.text["Linear Algebra"])
# Entities detected in the chapter; they are turned into a table in the next cell.
entities_la = doc.ents

In [105]:
# Build the entity table in one call from a list of records.
# Growing a DataFrame row by row with `.loc[n, :] = ...` re-allocates on every
# insertion and leaves every column with dtype `object`.
entity_columns = ["entity", "lemma", "start_char", "end_char", "position in text", "embedding"]
entity_records = [
    {
        "entity": entity,                  # the entity object itself, as yielded by `doc.ents`
        "lemma": entity.lemma_.lower(),    # lower-cased lemma, used later as the grouping key
        "start_char": entity.start_char,
        "end_char": entity.end_char,
        "position in text": entity.start,
        "embedding": entity.vector,
    }
    for entity in entities_la
]
# Passing `columns` keeps the expected schema even when no entity was found.
df_entities = pd.DataFrame(entity_records, columns=entity_columns)
df_entities.head(5)

Unnamed: 0,entity,lemma,start_char,end_char,position in text,embedding
0,(intuitive),intuitive,17,26,2,"[-0.271802, 0.0637784, 0.0866259, -0.21584, -0..."
1,(concepts),concept,27,35,3,"[-0.22106, -0.00857793, -0.0511574, -0.147617,..."
2,(objects),object,80,87,15,"[0.166611, 0.103413, 0.0951793, -0.321065, 0.0..."
3,(symbols),symbol,89,96,17,"[0.218379, 0.264318, 0.223982, -0.680185, 0.36..."
4,(rules),rule,111,116,23,"[-0.046738, -0.0100906, 0.0975509, -0.171749, ..."


In [106]:
# Show every embedding recorded for entities whose lemma is "vector".
df_entities.loc[df_entities.lemma == "vector", "embedding"] # author's note: each embedding has 200 components

9       [0.199261, -0.0951729, 0.291055, -0.121187, -0...
10      [0.199261, -0.0951729, 0.291055, -0.121187, -0...
11      [0.199261, -0.0951729, 0.291055, -0.121187, -0...
17      [0.199261, -0.0951729, 0.291055, -0.121187, -0...
20      [0.199261, -0.0951729, 0.291055, -0.121187, -0...
                              ...                        
2417    [0.400419, -0.0290379, 0.289465, -0.116615, -0...
2420    [0.199261, -0.0951729, 0.291055, -0.121187, -0...
2459    [0.199261, -0.0951729, 0.291055, -0.121187, -0...
2463    [0.199261, -0.0951729, 0.291055, -0.121187, -0...
2482    [0.400419, -0.0290379, 0.289465, -0.116615, -0...
Name: embedding, Length: 148, dtype: object

In [108]:
from collections import defaultdict

# Tally how often each lemma occurs among the extracted entities.
lemma_counts = defaultdict(int)
for lemma in df_entities.lemma:
    lemma_counts[lemma] += 1

# Re-key into a plain dict ordered from most to least frequent. The sort is
# stable, so equally frequent lemmas keep their first-seen order.
ranked_lemmas = sorted(lemma_counts, key=lemma_counts.get, reverse=True)
freqs = {lemma: lemma_counts[lemma] for lemma in ranked_lemmas}
freqs

{'vector': 148,
 'matrix': 78,
 'solution': 53,
 'transformation': 39,
 'system': 37,
 'vector space': 35,
 'linear equation': 32,
 'column': 31,
 'subspace': 28,
 'deﬁne': 27,
 'mapping': 27,
 'inverse': 25,
 'linearly': 24,
 'coordinate': 24,
 'basis': 22,
 'row-echelon': 20,
 'deﬁnition': 18,
 'property': 17,
 'linear combination': 17,
 'group': 17,
 'independent': 17,
 'operation': 16,
 'space': 16,
 'variable': 16,
 'dimension': 16,
 'reduce': 15,
 'concept': 14,
 'multiplication': 14,
 'linear mapping': 14,
 'representation': 14,
 'section': 13,
 'kernel': 13,
 'scalar': 12,
 'geometric vector': 11,
 'equation': 11,
 'pivot column': 11,
 'image': 11,
 'element': 10,
 'plane': 10,
 'pivot': 10,
 'afﬁne subspace': 10,
 'matrix multiplication': 9,
 'general solution': 9,
 'object': 8,
 'polynomial': 8,
 'rn': 8,
 'row': 8,
 'gaussian': 8,
 'vector subspace': 8,
 'linear mapping \x08': 8,
 'case': 7,
 'unique': 7,
 'elimination': 7,
 'elementary': 7,
 'express': 7,
 'rank': 7,
 'neut

In [126]:
# calculating similarity
import numpy as np
from numpy.linalg import norm


def _mean_embedding(embeddings):
    """Average a collection of equally sized embedding vectors component-wise."""
    return np.mean(np.stack(list(embeddings)), axis=0)


def find_most_similar(word, df_entities, top_n=5):
    """Return the lemmas whose mean embedding is most similar to that of `word`.

    Every lemma is represented by the average of the embeddings of all its
    occurrences; candidates are ranked by cosine similarity to the average
    embedding of `word`.

    Parameters
    ----------
    word : str
        Lemma to look up; compared for equality against the `lemma` column.
    df_entities : pandas.DataFrame
        Table with a `lemma` column and an `embedding` column holding one
        numeric vector per row (all vectors must have the same length).
    top_n : int, default 5
        Maximum number of lemmas to return.

    Returns
    -------
    numpy.ndarray or None
        Object array of at most `top_n` lemmas, most similar first. `None`
        (after printing a message) when `word` does not occur in `df_entities`.
    """
    is_query = df_entities["lemma"] == word
    if not is_query.any():
        # Explicit membership check instead of a bare `except:`, so that
        # unrelated failures are no longer reported as a missing word.
        print("This word is not included in the dataset")
        return None

    word_embedding = _mean_embedding(df_entities.loc[is_query, "embedding"])

    # Average per lemma with numpy rather than `groupby(...).mean()`, which is
    # not reliable on an object column whose cells are arrays.
    # Iterating the groupby yields the lemmas in sorted order.
    rest_entities = []
    rest_embeddings = []
    others = df_entities.loc[~is_query, ["lemma", "embedding"]]
    for lemma, embeddings in others.groupby("lemma")["embedding"]:
        rest_entities.append(lemma)
        rest_embeddings.append(_mean_embedding(embeddings))

    if not rest_entities:
        return np.array([], dtype=object)

    rest_matrix = np.stack(rest_embeddings)
    # 10e-10 (= 1e-9) guards against division by zero for all-zero embeddings.
    cosine_similarity = (rest_matrix @ word_embedding) / (
        norm(rest_matrix, axis=1) * norm(word_embedding) + 10e-10
    )
    # Indices of the highest similarities, in descending order.
    best = cosine_similarity.argsort()[::-1][: max(top_n, 0)]
    return np.array(rest_entities, dtype=object)[best]
    

In [127]:
# Nearest lemmas to "matrix", the second most frequent lemma in the frequency table.
find_most_similar("matrix", df_entities)

array(['matrix a\x08with', 'matrix form',
       'r3!r3whose transformation matrix',
       'r3!r4whose transformation matrix', 'full-rank matrix'],
      dtype=object)

In [128]:
# Nearest lemmas to "vector", the most frequent lemma in the frequency table.
find_most_similar("vector", df_entities)

array(['rn\x021andr1\x02n(therow vector', 'vector vector',
       'ﬁnite-dimensional vector',
       '.finite-dimensional vector\nspacesvandware',
       '(v;+)is vector\nthe zero vector'], dtype=object)

In [142]:
# Nearest lemmas to "determinant".
find_most_similar("determinant", df_entities)

array(['consequence', 'context', 'function', 'essential',
       'principal component analysis'], dtype=object)

In [143]:
# Nearest lemmas to "inverse" (25 occurrences in the frequency table).
find_most_similar("inverse", df_entities)

array(['inverse element', 'linear\nalgebra', 'linear\nmapping',
       'linear spaces/mapping', 'linear equa-\ntionsa\x15'], dtype=object)

#### Conclusion: 

1. NER: the en_core_sci_md model identifies entities quite well, even though it was trained on biomedical data. However, some subsequent filtering is required.

2. Cosine similarity between word embeddings does not reliably find the most similar concepts, presumably for the following reasons:

- corpus is small

- noise in the data, such as '.finite-dimensional vector\nspacesvandware', etc.

- the "meaning" of the vectors still needs to be checked; I may have misused them

- averages of the vectors may not represent a "real" vector/embedding of a word in the vector space.

## SciBERT, using only the transformers library

In [42]:
from transformers import BertTokenizer, BertForTokenClassification, pipeline

# Load the SciBERT tokenizer and checkpoint, and wrap them in a token-classification model.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertForTokenClassification.from_pretrained('allenai/scibert_scivocab_uncased')
# NOTE(review): the load warning reports checkpoint weights that were not used and
# model weights that were not initialised from the checkpoint, so the
# LABEL_0/LABEL_1 predictions presumably come from an untrained classification
# head. Confirm before interpreting them.
# NOTE: this rebinds `nlp`, which previously held the spaCy pipeline.
nlp = pipeline('ner', model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initi

In [44]:
# Run the transformers NER pipeline on the first 1000 characters of the chapter only.
# NOTE: this rebinds `doc`, which previously held the spaCy result.
doc = nlp(data.text["Linear Algebra"][:1000])

In [47]:
# Inspect the first five token-level predictions.
doc[:5]

[{'entity': 'LABEL_1',
  'score': 0.5618813,
  'index': 1,
  'word': 'when',
  'start': None,
  'end': None},
 {'entity': 'LABEL_0',
  'score': 0.6189762,
  'index': 2,
  'word': 'formal',
  'start': None,
  'end': None},
 {'entity': 'LABEL_0',
  'score': 0.5179148,
  'index': 3,
  'word': '##izing',
  'start': None,
  'end': None},
 {'entity': 'LABEL_0',
  'score': 0.55187863,
  'index': 4,
  'word': 'intuitive',
  'start': None,
  'end': None},
 {'entity': 'LABEL_0',
  'score': 0.52490836,
  'index': 5,
  'word': 'concepts',
  'start': None,
  'end': None}]