In [1]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")  # load the small English spaCy pipeline once; later cells reuse `nlp`
import flair
from flair.data import Sentence
from flair.embeddings import WordEmbeddings
import torch
import nltk
# Fetch the NLTK data used by the lemmatization cells: WordNet for the
# lemmatizer, punkt for word_tokenize, and the perceptron tagger for pos_tag.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\makye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\makye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\makye\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


#### NLP steps

1. Tokenization

2. Embedding

## 1. Tokenize 

In [2]:
# Sample document used by the spaCy tokenization cells below
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")

### Reading text/tokens with spacy

In [3]:
# Run the spaCy pipeline over the sample text; the resulting Doc is
# left as the last expression so the notebook displays it.
docx = nlp(text)
docx

When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously. “I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.

### Tokenize text with spaCy

In [4]:
# Print each sentence spaCy detected in the text, prefixed with its index
for num, sentence in enumerate(docx.sents):
    print(f'{num}: {sentence}')
    

0: When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.
1: “
2: I can tell you very senior CEOs of major American car companies would shake my hand and turn away because I wasn’t worth talking to,” said Thrun, in an interview with Recode earlier this week.


In [6]:
# Print every token in the text, prefixed with its index
for num, token in enumerate(docx):
    print(f'{num}: {token.text}')

0: When
1: Sebastian
2: Thrun
3: started
4: working
5: on
6: self
7: -
8: driving
9: cars
10: at
11: Google
12: in
13: 2007
14: ,
15: few
16: people
17: outside
18: of
19: the
20: company
21: took
22: him
23: seriously
24: .
25: “
26: I
27: can
28: tell
29: you
30: very
31: senior
32: CEOs
33: of
34: major
35: American
36: car
37: companies
38: would
39: shake
40: my
41: hand
42: and
43: turn
44: away
45: because
46: I
47: was
48: n’t
49: worth
50: talking
51: to
52: ,
53: ”
54: said
55: Thrun
56: ,
57: in
58: an
59: interview
60: with
61: Recode
62: earlier
63: this
64: week
65: .


## 2. Embedding

In [7]:
# Embedding with spaCy
sentence = "I love learning"

# Running the pipeline attaches a vector to every token
doc = nlp(sentence)

# Vector of the 2nd token in the sentence ('love'); shown as the cell output
doc[1].vector

array([ 1.03492558e-01, -9.99394298e-01,  3.91286612e-02, -3.05882883e+00,
        4.75878239e+00, -8.13433647e-01, -2.48094654e+00, -3.05946755e+00,
        5.27173340e-01,  1.50878453e+00,  3.23129702e+00, -4.56019115e+00,
        4.10485446e-01,  8.85138392e-01, -3.51287150e+00, -2.27991605e+00,
       -4.35688162e+00,  1.83025956e+00, -4.41841698e+00,  3.75293231e+00,
        1.08742666e+00,  1.71857268e-01,  4.50415373e-01,  1.60834622e+00,
        2.08884954e+00, -1.73660016e+00,  2.14878607e+00,  2.64554095e+00,
        2.05439758e+00, -3.08938563e-01, -5.60664773e-01, -1.35718632e+00,
       -9.94471788e-01,  3.80324912e+00, -9.63790059e-01, -3.44084907e+00,
       -1.21351826e+00,  7.76718259e-02, -8.70164156e-01,  1.27800488e+00,
       -1.27331066e+00, -6.25064611e+00,  1.57126951e+00,  3.60463440e-01,
       -1.14918947e-02,  5.19760251e-01, -3.52944160e+00, -2.85665846e+00,
        4.44239378e+00, -2.23635650e+00,  2.76743078e+00, -2.28830814e-01,
       -1.03909874e+00,  

In [8]:
# Vector for the whole sentence: the mean of its token vectors
# (usable as a simple sentence feature, e.g. for sentence classification)
doc.vector

array([-6.20014250e-01, -5.18343031e-01,  1.65436268e+00, -5.91645658e-01,
        8.27838480e-01, -1.97387397e-01, -1.25076401e+00, -1.85162723e+00,
       -9.24894154e-01, -1.46129802e-01,  3.16667056e+00, -2.42977929e+00,
        1.27589047e+00,  7.44478226e-01, -2.49340653e+00, -5.45810103e-01,
       -2.23134208e+00, -7.43896186e-01, -1.81817368e-01,  1.08086574e+00,
        1.85745490e+00,  7.59144783e-01, -3.52969497e-01,  1.54789579e+00,
       -3.33573431e-01, -2.12107301e+00, -8.94382715e-01, -9.62785911e-03,
        4.64864403e-01, -1.22728503e+00, -5.27982175e-01, -2.25586090e-02,
       -3.81013781e-01,  1.33011246e+00, -7.90472209e-01, -1.61791146e+00,
        3.59495163e-01,  9.92690325e-01, -1.34316728e-01,  4.84985113e-01,
        3.48023921e-01, -2.11745262e+00,  1.95988929e+00, -9.58492279e-01,
       -1.14997303e+00,  9.09446716e-01, -1.77066469e+00,  6.91713572e-01,
        1.52567327e+00, -1.66518497e+00,  2.59987926e+00, -1.26903617e+00,
        5.68261802e-01, -

### Embedding using Flair

- This embeds the sentence with GloVe embeddings, then prints out the embedding of each word. Each word embedding in this case has the shape [100], i.e. a 100 dimensional vector.

- Then we give each word embedding a leading axis with unsqueeze(0) and use torch.cat to concatenate the embeddings of all words in the sentence into a tensor of shape [3, 100] (3 words, each 100 dimensions).

- If you want to have a batch of more than one sentence, you need to use torch.cat again to concatenate the tensors of each sentence.

In [9]:
# Tokenize the sentence with Flair; printing it shows the text and the token count
s= Sentence("I love learning")
print(s)

Sentence: "I love learning"   [− Tokens: 3]


In [10]:
# Load the pre-trained GloVe word embeddings
embeddings = WordEmbeddings('glove')

# Embed sentence 's': each of its tokens gets an embedding that the
# following cells read via token.embedding
embeddings.embed(s)

[Sentence: "I love learning"   [− Tokens: 3]]

In [11]:
# Go through each token in the embedded sentence
for token in s:

    # print the embedding (a torch tensor) of this token
    print(token.embedding)


    # print the shape of this token's embedding
    print(token.embedding.shape)
  

tensor([-0.0465,  0.6197,  0.5665, -0.4658, -1.1890,  0.4460,  0.0660,  0.3191,
         0.1468, -0.2212,  0.7924,  0.2991,  0.1607,  0.0253,  0.1868, -0.3100,
        -0.2811,  0.6051, -1.0654,  0.5248,  0.0642,  1.0358, -0.4078, -0.3801,
         0.3080,  0.5996, -0.2699, -0.7603,  0.9422, -0.4692, -0.1828,  0.9065,
         0.7967,  0.2482,  0.2571,  0.6232, -0.4477,  0.6536,  0.7690, -0.5123,
        -0.4433, -0.2187,  0.3837, -1.1483, -0.9440, -0.1506,  0.3001, -0.5781,
         0.2017, -1.6591, -0.0792,  0.0264,  0.2205,  0.9971, -0.5754, -2.7266,
         0.3145,  0.7052,  1.4381,  0.9913,  0.1398,  1.3474, -1.1753,  0.0040,
         1.0298,  0.0646,  0.9089,  0.8287, -0.4700, -0.1058,  0.5916, -0.4221,
         0.5733, -0.5411,  0.1077,  0.3978, -0.0487,  0.0646, -0.6144, -0.2860,
         0.5067, -0.4976, -0.8157,  0.1641, -1.9630, -0.2669, -0.3759, -0.9585,
        -0.8584, -0.7158, -0.3234, -0.4312,  0.4139,  0.2837, -0.7093,  0.1500,
        -0.2154, -0.3762, -0.0325,  0.80

In [12]:
# Print only the shape of each token's embedding (one line per token)
for token in s:
    print(token.embedding.shape)

torch.Size([100])
torch.Size([100])
torch.Size([100])


In [17]:
'''
A tensor is a multi-dimensional array. Here we build one for the
tokenized sentence (the sentence broken down into individual words),
with one row per word.
'''

# Give each word embedding a leading axis with unsqueeze(0), then concatenate
# them along dim 0 into a single tensor holding all word embeddings of sentence 's'
sentence_tensor = torch.cat([token.embedding.unsqueeze(0) for token in s], dim=0)

In [15]:
'''
The output has 3 rows, one for each of the 3 words in the sentence,
and 100 columns, the size of each word embedding.
'''

# print the shape of the sentence tensor (multi-dimensional array)
print(sentence_tensor.shape) 


torch.Size([3, 100])


## Lemmatization / finding POS tags for each token

In [5]:
# Lemmatization groups together the inflected or variant forms of the same word.


'''
Example of lemmatizing:
"walking" and "walked" both lemmatize to "walk".

Steps for lemmatizing a sentence:

1. find the POS (part-of-speech) tag for each token (word) with nltk
2. find the corresponding wordnet tag for each nltk POS tag
3. lemmatize the tokens using the wordnet tags

'''

# Single WordNetLemmatizer instance shared by the cells below
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an nltk POS tag to the matching WordNet POS constant.

    Only the first letter of the tag is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns the corresponding ``wordnet`` constant, or ``None`` when the
    tag starts with none of those letters.
    """
    # (tag prefix, name of the wordnet attribute), checked in this order
    prefix_to_attr = (
        ('J', 'ADJ'),   # adjective: describes a noun, e.g. "red" dress
        ('V', 'VERB'),  # verb: a doing word, e.g. "run"
        ('N', 'NOUN'),  # noun: a person, thing or place, e.g. "Uganda"
        ('R', 'ADV'),   # adverb: qualifies a verb, e.g. running "quickly"
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # look the constant up only once a prefix has matched
            return getattr(wordnet, attr)
    return None

In [6]:

def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` using its POS tag.

    The sentence is tokenized and POS-tagged with nltk. Each nltk tag is
    converted with ``nltk_tag_to_wordnet_tag``; tokens with a WordNet tag
    are lemmatized with it, and tokens without one are kept unchanged.

    Returns the resulting tokens joined by single spaces.
    """
    # list of (token, nltk POS tag) pairs
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, pos_tag in tagged_tokens:
        wn_tag = nltk_tag_to_wordnet_tag(pos_tag)
        # no WordNet tag available -> keep the token as it is
        lemma = token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag)
        lemmas.append(lemma)
    return " ".join(lemmas)

# Quick checks; the output recorded for each call is noted at the end of its line.
# A whole sentence handed to lemmatize() comes back unchanged.
print(lemmatizer.lemmatize("I am loving it")) #I am loving it
# Without a POS argument "loving" is returned as is ...
print(lemmatizer.lemmatize("loving")) #loving
# ... while passing the verb tag "v" reduces it to "love".
print(lemmatizer.lemmatize("loving", "v")) #love
# lemmatize_sentence() tags each token first, so the verbs are lemmatized.
print(lemmatize_sentence("I am loving it")) #I be love it

I am loving it
loving
love
I be love it


#### Lemmatizing with nltk and wordnet POS tags

In [13]:
my_sentence = "Those children are playing. this game, those games, I play he plays"

# Pass the raw sentence straight to lemmatizer.lemmatize(): no tokenizing or
# POS tagging is done in this call, and the recorded output is the sentence unchanged.
print(lemmatizer.lemmatize(my_sentence))


# Lemmatize the sentence with lemmatize_sentence(), which tags each token
# with nltk and maps that tag to a wordnet POS tag before lemmatizing.
print(lemmatize_sentence(my_sentence)) 

Those children are playing. this game, those games, I play he plays
Those child be play . this game , those game , I play he play


In [12]:
# Examples of lemmatizing single words (tokens).
# The cell previously printed lemmatize_sentence("I am loving it"), which
# matched neither this comment nor the recorded output ("rocks : rock",
# "corpora : corpus"); the single-word calls are restored here.
print("rocks :", lemmatizer.lemmatize("rocks"))
print("corpora :", lemmatizer.lemmatize("corpora"))

rocks : rock
corpora : corpus


## Chunking

In [7]:
# NOTE: this cell rebinds `text` and `doc`, which earlier cells also assign.
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus."""

# Run the spaCy pipeline over the text
doc = nlp(text)
# Collect the sentence spans into a list
sentence_spans = list(doc.sents)
# Render the displaCy dependency ("dep") visualization for the sentences.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: __init__() got an unexpected keyword argument 'encoding'".
# The traceback is not shown, so the failing call is unknown; possibly a
# version mismatch between spaCy and one of its dependencies (TODO confirm).
displacy.render(sentence_spans, style="dep")

TypeError: __init__() got an unexpected keyword argument 'encoding'