#### Install spaCy

In [None]:
!pip install -U spacy --quiet

In [None]:
# Download the large English model for spaCy
# NOTE(review): the download runs under `sudo python3` while the link step runs under plain `python`;
# confirm both resolve to the interpreter the notebook kernel uses.
!sudo python3 -m spacy download en_core_web_lg
# Register 'en' as a shortcut name for en_core_web_lg (--force overwrites an existing link).
# NOTE(review): the `link` command and shortcut names are a spaCy v2 feature that v3 removed — confirm the installed version.
!python -m  spacy link --force en_core_web_lg en

#### Removing Personal information

In [None]:
import spacy

In [None]:
# Load the large English NLP model by its package name.
# The 'en' shortcut only exists when `spacy link` has been run, and spaCy v3 removed
# shortcut links; the full package name loads on both v2 and v3.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Example sentence containing a one-word person name
text1 = 'My name is Rajeev.'

In [None]:
# Run the spaCy pipeline on the example text to get a parsed Doc
doc = nlp(text1)

In [None]:
# Named Entity Recognition check: print every token next to its entity label
# (the label is an empty string for tokens outside any entity)
for tok in doc:
    print(tok, tok.ent_type_)

Function to remove name

In [None]:
def remove_name(text, replacement_token='[NAME]'):
    """Replace every token tagged as a PERSON entity with a placeholder.

    Each PERSON token is replaced on its own, so a multi-word name yields
    one placeholder per word.

    Args:
        text: Raw text to anonymise.
        replacement_token: Placeholder written in place of each PERSON token.

    Returns:
        The text with PERSON tokens replaced; every other token and the
        original spacing are kept.
    """
    #Parse the text
    doc = nlp(text)

    #Updated document
    updated_doc = []

    #Check Entity type
    for token in doc:
        if token.ent_type_ == 'PERSON':
            # Keep the whitespace that followed the name so the next word
            # does not fuse with the placeholder ("[NAME]is ...").
            updated_doc.append(replacement_token + token.whitespace_)
        else:
            # text_with_ws exists on spaCy v2 and v3; Token.string was removed in v3.
            updated_doc.append(token.text_with_ws)

    return ''.join(updated_doc)

In [None]:
# Single-word name example
remove_name(text1)

In [None]:
# Two-word name example
remove_name('My name is Rajeev Kumar.')

##### A two-word name gets replaced once per word. How do we correct that?

In [None]:
# Let's check how many entities spaCy finds in the two-word-name sentence
doc = nlp('My name is Rajeev Kumar.')
for i, ent in enumerate(doc.ents, start=1):
    print(str(i) + '.', ent)

In [None]:
# How can we tell whether a token (i.e. a word) starts a multi-word entity?
# The IOB tag answers that: B = begins an entity, I = inside an entity, O = outside any entity.
for tok in doc:
    print(tok, tok.ent_iob_)

In [None]:
def remove_name2(text, replacement_token='[NAME]'):
    """Replace each PERSON entity, single- or multi-word, with one placeholder.

    The first token of an entity (IOB tag 'B') emits the placeholder and the
    remaining tokens of that entity are dropped. The whitespace that followed
    the entity's last token is kept so the surrounding words stay separated.

    Args:
        text: Raw text to anonymise.
        replacement_token: Placeholder written once per PERSON entity.

    Returns:
        The text with every PERSON entity collapsed to ``replacement_token``.
    """
    #Parse the text
    doc = nlp(text)

    #Updated document
    updated_doc = []

    for token in doc:
        if token.ent_type_ != 'PERSON':
            # text_with_ws exists on spaCy v2 and v3; Token.string was removed in v3.
            updated_doc.append(token.text_with_ws)
            continue

        if token.ent_iob_ == 'B':
            #Replace starting entity word; later words of the entity emit nothing
            updated_doc.append(replacement_token)

        # The entity ends here when no token follows or the next token does not
        # continue it; re-emit the whitespace after the name so that
        # "Rajeev Kumar still" becomes "[NAME] still", not "[NAME]still".
        next_index = token.i + 1
        if next_index == len(doc) or doc[next_index].ent_iob_ != 'I':
            updated_doc.append(token.whitespace_)

    return ''.join(updated_doc)

In [None]:
# Same two-word name, now passed through the IOB-aware version
remove_name2('My name is Rajeev Kumar.')

In [None]:
# Longer passage that mentions people by full name and by a single name
text = """Bhuvneshwar Kumar still hasn’t found the right words that would explain how 
he felt having dismissed Sachin Tendulkar for the batting legend’s first ever duck in first-class cricket.
The moment came in 2009, when representing Uttar Pradesh as a 19-year-old upcoming fast bowler, 
Bhuvneshwar bowled a cutter that took an inside edge from the bat of Tendulkar 
only to nestle into the safe hands of the fielder.
"""

# Print the passage after name removal
print(remove_name2(text))

#### Similarity

Word Similarity

In [None]:
doc = nlp('car bus computer laptop')

# Compare every token with every token, itself included
for left in doc:
    for right in doc:
        print(left, right, left.similarity(right))

Sentence Similarity

In [None]:
# Two short phrases that mean roughly the same thing
doc1 = nlp('great movie')
doc2 = nlp('awesome film')

similar_score = doc1.similarity(doc2)
print(similar_score)

In [None]:
# A phrase unrelated to doc1 ('great movie'), for contrast
doc3 = nlp('keep learning')

# Displayed as the cell's output value
doc1.similarity(doc3)

In [None]:
# Sample sentence for tagging; spelled correctly so the tagger sees in-vocabulary words
doc = nlp('Hyderabad is the capital of Telangana.')

# Print the part-of-speech tag for each word
for word in doc:
    print(word, word.pos_)

In [None]:
#Visualize dependency parsing
from spacy import displacy

In [None]:
# Draw the dependency parse (style="dep") of the sentence currently held in `doc`;
# jupyter=True asks displacy to render the figure in the notebook
displacy.render(doc, style="dep", jupyter=True)