#### Testing spacy and nltk

In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("I like to read. I enjoy hiking.")
for sentence in doc.sents:
    print(sentence)
    for word in sentence:
        print(word)

I like to read.
I
like
to
read
.
I enjoy hiking.
I
enjoy
hiking
.


In [4]:
# !python -m spacy download en_core_web_sm

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pinkysitikhu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize

print(sent_tokenize("Dr. Pinky like to read. I enjoy hiking."))
word_tokenize("Dr. Pinky like to read. I enjoy hiking.")

['Dr. Pinky like to read.', 'I enjoy hiking.']


['Dr.', 'Pinky', 'like', 'to', 'read', '.', 'I', 'enjoy', 'hiking', '.']

#### Sentence Tokenization using spacy

In [5]:
nlp = spacy.blank("en")
doc = nlp("Dr. Pinky like to read. I enjoy hiking. Let's go to N.Y!")
for word in doc:
    print(word)

Dr.
Pinky
like
to
read
.
I
enjoy
hiking
.
Let
's
go
to
N.Y
!


#### Add special case in tokenization

In [10]:
from spacy.symbols import ORTH

text = "gimme that"
doc = nlp(text)
for token in doc:
    print(token.text)

special_case = [{ORTH:"gim"}, {ORTH:"me"}]
print(special_case)
nlp.tokenizer.add_special_case("gimme", special_case)
for token in doc:
    print(token.text)


gim
me
that
[{65: 'gim'}, {65: 'me'}]
gim
me
that


#### Extract all urls from the text below

In [27]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
doc = nlp(text)
urls = []
for word in doc:
    if word.like_url:
        urls.append(word.text)
print(urls)

['http://www.data.gov/', 'http://www.science', 'http://data.gov.uk/.', 'http://www3.norc.org/gss+website/', 'http://www.europeansocialsurvey.org/.']


#### Extract all money values 

In [26]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
for i, word in enumerate(doc):
    if word.is_currency:
        print(doc[i-1])

two
500


In [29]:
nlp.pipe_names

[]

#### If you initiate spacy as blank, tokenizer will be there already. Other packages can be added in the spacy pipeline.

`en_core_web_sm` --> one of the package

In [9]:
nlp = spacy.load('en_core_web_sm')
print(nlp.pipe_names)
doc = nlp("Dr. Pinky like to read. I enjoy hiking. Let's go to N.Y and San Francisco! I have $4 money.")
for token in doc:
    print(token, " | ", token.pos_, "|", token.lemma_)
    
for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Dr.  |  PROPN | Dr.
Pinky  |  PROPN | Pinky
like  |  VERB | like
to  |  PART | to
read  |  VERB | read
.  |  PUNCT | .
I  |  PRON | I
enjoy  |  VERB | enjoy
hiking  |  VERB | hike
.  |  PUNCT | .
Let  |  VERB | let
's  |  PRON | us
go  |  VERB | go
to  |  ADP | to
N.Y  |  PROPN | N.Y
and  |  CCONJ | and
San  |  PROPN | San
Francisco  |  PROPN | Francisco
!  |  PUNCT | !
I  |  PRON | I
have  |  VERB | have
$  |  SYM | $
4  |  NUM | 4
money  |  NOUN | money
.  |  PUNCT | .
Pinky | PERSON | People, including fictional
N.Y | GPE | Countries, cities, states
San Francisco | GPE | Countries, cities, states
4 | MONEY | Monetary values, including unit


In [10]:
# for nice visualization
from spacy import displacy

displacy.render(doc, style="ent")

In [13]:
t = "Bloomberg founded bloomberg in 1990"
doc = nlp(t)
displacy.render(doc, style="ent")

#### Using blank pipeline and add components

In [17]:
main_pipe = spacy.load("en_core_web_sm")
nlp = spacy.blank("en")
nlp.add_pipe("ner", source=main_pipe)
print(nlp.pipe_names)
print("-----------")
t = "Bloomberg founded bloomberg in 1990"
doc = nlp(t)
for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))
print("-----------")
displacy.render(doc, style="ent")

['ner']
-----------
Bloomberg | PERSON | People, including fictional
bloomberg | ORG | Companies, agencies, institutions, etc.
1990 | DATE | Absolute or relative dates or periods
-----------


#### Lemmatization in spacy

In [20]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. Pinky like reading. I enjoy hiking. Let's go to clubbing! I have $4 money.")
for token in doc:
    print(token, "->", token.lemma_)

Dr. -> Dr.
Pinky -> Pinky
like -> like
reading -> read
. -> .
I -> I
enjoy -> enjoy
hiking -> hike
. -> .
Let -> let
's -> us
go -> go
to -> to
clubbing -> club
! -> !
I -> I
have -> have
$ -> $
4 -> 4
money -> money
. -> .


In [21]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

#### add custom attribute/lemma

In [25]:
att_ruler = nlp.get_pipe("attribute_ruler")
att_ruler.add([[{"TEXT":"gimme"}]], {"LEMMA": "Give me"})
doc = nlp("gimme some sunshine. It is raining")
for token in doc:
    print(token.text, "->", token.lemma_)

gimme -> Give me
some -> some
sunshine -> sunshine
. -> .
It -> it
is -> be
raining -> rain


#### Part of Speech: PoS

In [33]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Dr. Pinky like reading. I enjoy hiking! Shall we go?")
for token in doc:
    print(token, "|", token.pos_, "|", spacy.explain(token.pos_), "|", token.tag_, "|", spacy.explain(token.tag_))

Dr. | PROPN | proper noun | NNP | noun, proper singular
Pinky | PROPN | proper noun | NNP | noun, proper singular
like | ADP | adposition | IN | conjunction, subordinating or preposition
reading | VERB | verb | VBG | verb, gerund or present participle
. | PUNCT | punctuation | . | punctuation mark, sentence closer
I | PRON | pronoun | PRP | pronoun, personal
enjoy | VERB | verb | VBP | verb, non-3rd person singular present
hiking | VERB | verb | VBG | verb, gerund or present participle
! | PUNCT | punctuation | . | punctuation mark, sentence closer
Shall | AUX | auxiliary | MD | verb, modal auxiliary
we | PRON | pronoun | PRP | pronoun, personal
go | VERB | verb | VB | verb, base form
? | PUNCT | punctuation | . | punctuation mark, sentence closer


#### Count number of POS tag in a doc

In [34]:
count = doc.count_by(spacy.attrs.POS)
count

{96: 2, 85: 1, 100: 4, 97: 3, 95: 2, 87: 1}

In [37]:
doc.vocab[96].text, doc.vocab[85].text

('PROPN', 'ADP')

In [39]:
for k, v in count.items():
    print(doc.vocab[k].text, "->", v)

PROPN -> 2
ADP -> 1
VERB -> 4
PUNCT -> 3
PRON -> 2
AUX -> 1


#### Extract all NOUN tokens and NUM POS type, and count all POS tags in the text

In [44]:
text = '''At its core, machine learning is not a difficult concept to grasp. In fact, the vast majority of machine learning 
algorithms are concerned with just one simple task: drawing lines. In particular, 
machine learning is all about drawing lines through data. What does that mean? Let’s look at a simple example. Inflation 
rose again in April, continuing a climb that has pushed consumers to the brink and is threatening the economic expansion, the 
Bureau of Labor Statistics reported Wednesday.\n\nThe consumer price index, a broad-based measure of prices for goods 
and services, increased 8.3% from a year ago, higher than the Dow Jones estimate for an 8.1% gain.'''

doc = nlp(text)

In [51]:
noun_token = []
num_token = []
for token in doc:
    if token.pos_ == "NOUN":
        noun_token.append(token)
    elif token.pos_ == "NUM":
        num_token.append(token)
        
print("NOUN", noun_token)
print("NUM", num_token)
count = doc.count_by(spacy.attrs.POS)
for k, v in count.items():
    print(doc[k], "-", doc.vocab[k].text, ":", v)

NOUN [core, machine, learning, concept, fact, majority, machine, learning, algorithms, task, lines, machine, learning, lines, data, ’s, example, Inflation, climb, consumers, brink, expansion, consumer, price, index, measure, prices, goods, services, %, year, estimate, %, gain]
NUM [one, 8.3, 8.1]
expansion - ADP : 16
. - PRON : 4
Statistics - NOUN : 34
The - PUNCT : 17
the - AUX : 6
Wednesday - PART : 2
of - DET : 12
economic - ADJ : 7
index - VERB : 14
broad - SPACE : 6
, - ADV : 5
reported - NUM : 3


 - PROPN : 7
Bureau - CCONJ : 2


#### Stopwords

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS

In [7]:
print(STOP_WORDS)

{'seeming', 'behind', 'moreover', 'and', 'get', 'everything', 'before', 'has', 'since', 'at', 'quite', 'noone', 'was', 'however', 'anyone', 'his', 'than', 'therefore', 'whom', 'such', 'thence', 'though', 'from', 'an', 'around', 'had', 'name', 'seems', 'ca', 'mine', 'she', 'whereas', 'by', "'d", 'he', 'otherwise', 'afterwards', 'beside', 'a', 'front', 'how', 'were', 'we', 'is', 'please', 'more', 'across', 'ours', 'per', 'among', 'her', 'whatever', 'six', 'toward', 'two', 'himself', 'nevertheless', 'could', 'to', 'whereby', 'ourselves', 'sometime', 'alone', 'always', 'after', 'must', 'one', 'why', 'both', '‘d', 'became', 'them', 'that', 'along', 'three', 'hence', 'neither', 'no', 'whence', 'none', 'anyhow', 'those', 'even', 'all', 'whenever', 'i', 'me', 'regarding', 'about', 'becoming', 'call', 'herein', 'been', '‘ve', 'enough', 'without', 'thus', 'against', 'some', 'four', 'are', 'under', 'empty', 'another', 'much', 'first', 'onto', 'almost', '’ll', 'over', '‘m', 'seem', 'really', 'hund

In [8]:
"hello" in STOP_WORDS

False

In [16]:
doc = nlp("Hello, I am going to McDonalds.")
for token in doc:
    if nlp.vocab[token.text].is_stop:
        print(token.text)
    # else:
    #     print("not stop-words: ", token.text)

I
am
to


#### Named 