In [1]:
import spacy

# Load the spaCy pipeline named "en_core_sci_md" (presumably a scispaCy
# biomedical model that must be installed separately — TODO confirm).
nlp = spacy.load("en_core_sci_md")



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [2]:
text: str = "Alterations in the hypocretin receptor 2 and preprohypocretin genes produce narcolepsy in people."
# Run the text through the loaded pipeline; the returned `doc` carries the
# NLP-based attributes and methods explored in the following cells.
doc = nlp(text)

In [3]:
# Every token in `doc` carries linguistic annotations. For each token that is
# not a stopword, show its text, lemma, part-of-speech (`pos_` and `tag_`),
# dependency label and shape.
for token in doc:
    if token.is_stop:
        continue
    print(
        token.text,
        "->",
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
    )

Alterations -> alteration NOUN NNS nsubj Xxxxx
hypocretin -> hypocretin NOUN NN compound xxxx
receptor -> receptor NOUN NN nmod xxxx
2 -> 2 NUM CD nummod d
preprohypocretin -> preprohypocretin NOUN NN conj xxxx
genes -> gene NOUN NNS dep xxxx
produce -> produce VERB VBP ROOT xxxx
narcolepsy -> narcolepsy NOUN NN dobj xxxx
people -> people NOUN NNS nmod xxxx
. -> . PUNCT . punct .


In [4]:
# Render the dependency parse (style='dep') of the first sentence in `doc`;
# jupyter=True explicitly enables displaCy's Jupyter display mode.
spacy.displacy.render(next(doc.sents), style='dep', jupyter=True)

## Word and text similarity

Text similarity has been fundamental to NLP since its beginnings, and even with the great advances in GenAI, the methods used to semantically compare words and sentences still rely mainly on vector operations and "distance" metrics. For example, we know that `shark` and `whale` are more closely related to each other than `shark` and `computer`. With modern language models (Word2Vec, transformers, etc.) we can represent this relationship mathematically, usually with a common metric such as Euclidean distance or cosine similarity. Let's give it a try.

In [5]:
# Compare single words: each one is parsed into a Doc and scored against another
text1, text2, text3 = "shark", "whale", "computer"

# `end="\n\n"` leaves one blank line between the two results
print("Similarity Shark and Whale:", nlp(text1).similarity(nlp(text2)), end="\n\n")
print("Similarity Shark and Computer:", nlp(text1).similarity(nlp(text3)))


Similarity Shark and Whale: 0.6170432650693893

Similarity Shark and Computer: -0.021838123446266868


In [6]:
# We can do the same with sentences (spaCy uses the average of the token
# vectors to compare sentences).
# NOTE: this rebinds `text1` and `text2`, which held single words in the
# previous cell.
text1 = "Tylenol is used to treat headaches"
text2 = "Ibuprofen is used to alleviate migraines"

nlp(text1).similarity(nlp(text2))

0.8859446651287843

In [7]:
# Get the vector representation of a single word
word: str = "melanoma"
# Look the word up in the vocabulary's string store to obtain its key ...
word_id = nlp.vocab.strings[word]
# ... then use that key to index the vocabulary's vectors table.
word_vector = nlp.vocab.vectors[word_id]
print(word_vector)

[-0.171301    0.269838   -0.292422    0.0969225   0.136634    0.0206791
  0.0343716   0.0241368  -0.230707   -0.142368   -0.23499    -0.434152
  0.40165     0.0404615   0.0620943   0.0411679  -0.358663    0.0899282
  0.0299107  -0.0770028  -0.211768   -0.217445   -0.203516   -0.0142091
 -0.0280481  -0.0351675  -0.0528551  -0.0698697   0.242176    0.152152
  0.129039   -0.035838   -0.0370982   0.102662   -0.0889819  -0.297025
  0.103937   -0.379597   -0.192684   -0.155296    0.0952904   0.0775943
 -0.133239    0.569135    0.193671   -0.270392   -0.0330474   0.156962
 -0.258426    0.0191769   0.193796   -0.0620198   0.242538   -0.0606925
 -0.0176757   0.063728    0.420112    0.217779    0.232284   -0.279989
  0.140984   -0.272254   -0.00408003 -0.233878    0.068528    0.351945
  0.210733   -0.198874    0.326534   -0.199128   -0.320544    0.329174
  0.252536    0.00751772 -0.0430318   0.339309    0.242522    0.15602
 -0.224425   -0.0316276   0.0804329   0.61901    -0.288737    0.270565
 -

## Spacy's Pipelines

However, I would say spaCy's greatest feature is its ability to chain multiple transformations into a single pipeline (Pipe). This lets you set up an elaborate pre-processing pipeline that efficiently cleans, tags and analyses your text input. For example:


In [23]:
import pandas as pd
from time import time  # Medir tiempo de ejecucion


# Relative path to the CSV file with the transcription samples
data_path: str = "../data/mtsamples.csv"
df = pd.read_csv(data_path)
# Keep the first 50 non-missing transcriptions as a small working example
transcriptions = df["transcription"].dropna().head(50)

In [20]:

def cleaning(doc) -> str:
    """Reduce a parsed document to a string of lemmas.

    Tokens flagged as stopwords (``is_stop``) and tokens whose ``is_alpha``
    flag is false are dropped; the lemmas of the remaining tokens are joined
    with single spaces, in document order.

    Args:
        doc (spacy.tokens.doc.Doc): Document processed by spacy's pipeline
    Returns:
        str: Space-separated lemmas of the kept tokens.
    """
    kept_lemmas = []
    for token in doc:
        # Skip stopwords and anything that is not an alphabetic token
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)


In [21]:
# Example on a short sentence: parse it with `nlp`, then apply `cleaning`
text: str = "Alterations in the Hypocretin receptor 2 and preprohypocretin genes produce narcolepsy in people."
cleaning(nlp(text))

'alteration hypocretin receptor preprohypocretin gene produce narcolepsy people'

In [27]:
t = time()  # start of the timed section

# Stream the transcriptions through `nlp.pipe` (batch_size=20, n_process=1)
# and clean each resulting Doc as it is produced.
txt = list(
    map(cleaning, nlp.pipe(transcriptions, batch_size=20, n_process=1))
)
# Elapsed time converted from seconds to minutes, rounded to 2 decimals
t_ = round((time() - t) / 60, 2)
print(f'Execution time: {t_} mins')

Execution time: 0.07 mins


In [24]:
t = time()  # start of the timed section

# Same cleaning, but calling `nlp` once per transcription instead of
# going through `nlp.pipe`.
txt = [cleaning(parsed) for parsed in map(nlp, transcriptions)]
# Elapsed time converted from seconds to minutes, rounded to 2 decimals
t_ = round((time() - t) / 60, 2)
print(f'Execution time: {t_} mins')

Execution time: 0.07 mins
