In [None]:
'''
-- Language models: SpaCy and NLTK (Natural Language Toolkit) for sentence and word tokenizing. SpaCy is faster and better for production; NLTK is great for research and education.
-- SpaCy pipeline consists of different attributes: tagger(.pos_), parser, lemmatizer(.lemma_), ner(.ent) and dependency (.dep_) read more in https://spacy.io/usage/processing-pipelines#pipelines
-- Stemming uses simple rules to derive the base word, whereas lemmatization uses knowledge of the language to derive it. spaCy does not have stemming; NLTK has both stemming and lemmatization.
-- Difference of POS and Tag: coarse-grained vs. fine-grained category of a word
-- Named Entity Recognition (NER) for Person, Company, Product, Location, Money, and how to customize entities
-- Stop words carry little meaningful information and are ignored in text processing tasks, e.g. articles, prepositions, pronouns, conjunctions.
'''

In [105]:
# !pip install nltk    
# nltk.download('punkt')
# !pip install spacy
# !python3 -m spacy download en_core_web_sm

In [121]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import  sent_tokenize, word_tokenize  
import spacy
from spacy.symbols import ORTH
from spacy import displacy
from nltk.stem import PorterStemmer, SnowballStemmer 
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
# Load the English pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
doc = nlp("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")

# For every detected sentence: print it with its length, then its tokens one per line.
for sentence in doc.sents:
    print(sentence, len(sentence))
    for token in sentence:
        print(token)

An end to end NLP project consists of many steps. 11
An
end
to
end
NLP
project
consists
of
many
steps
.
These steps together forms an NLP pipeline. 8
These
steps
together
forms
an
NLP
pipeline
.


In [4]:
# Same sample text, tokenized with NLTK instead of spaCy.
sample_text = "An end to end NLP project consists of many steps. These steps together forms an NLP pipeline."

# Sentence-level split.
doc = sent_tokenize(sample_text)
print(doc)

# Word-level split.
doc = word_tokenize(sample_text)
print(doc)

['An end to end NLP project consists of many steps.', 'These steps together forms an NLP pipeline.']
['An', 'end', 'to', 'end', 'NLP', 'project', 'consists', 'of', 'many', 'steps', '.', 'These', 'steps', 'together', 'forms', 'an', 'NLP', 'pipeline', '.']


# Make a blank spaCy pipeline that has only a tokenizer

In [46]:
# A blank English pipeline contains only the tokenizer; other components must be added to it explicitly.
nlp=spacy.blank("en")
# Show the component names of the pipeline.
nlp.pipe_names

[]

In [20]:
# Tokenize a sentence containing an abbreviation and a currency symbol,
# then print one token per line.
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")
print(*doc, sep="\n")

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [21]:
# Inspect attributes of the first token. Only the final expression of the
# cell is displayed; the others are evaluated and discarded.
first_token = doc[0]
first_token.text         # the token's text
first_token.is_currency  # currency-symbol flag
first_token.is_stop      # stop-word flag
first_token.i            # token index
first_token.like_num     # number-like flag
first_token.is_punct     # punctuation flag
first_token.is_oov       # out-of-vocabulary flag
first_token.like_email   # email-like flag (this is the value shown)

False

In [23]:
text = '''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)
# Collect the text of every token flagged as URL-like.
url_list = [token.text for token in doc if token.like_url]

url_list


['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [24]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# Print every number-like token that is immediately followed by a currency symbol.
doc = nlp(transactions)
for token in doc:
    # Guard the look-ahead: a number-like token at the very end of the text
    # has no successor, and indexing doc[len(doc)] would raise IndexError.
    if token.like_num and token.i + 1 < len(doc):
        next_token = doc[token.i + 1]
        if next_token.is_currency:
            print(token.text, next_token)

two $
500 €


# Add components to the blank pipeline using add_pipe

In [48]:
# Copy the sentence-segmenter component from the trained English pipeline
# into the current `nlp` pipeline.
nlp_source = spacy.load("en_core_web_sm")

# add_pipe raises if a component with the same name is already present, so
# guard the call to keep this cell safe to re-run.
if "senter" not in nlp.pipe_names:
    nlp.add_pipe("senter", source=nlp_source)
nlp.pipe_names

['senter']

In [49]:
# Run the current `nlp` pipeline over the sample and walk its sentences.
doc = nlp("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")

# For every detected sentence: print it with its length, then its tokens one per line.
for sentence in doc.sents:
    print(sentence, len(sentence))
    for token in sentence:
        print(token)

An end to end NLP project consists of many steps. 11
An
end
to
end
NLP
project
consists
of
many
steps
.
These steps together forms an NLP pipeline. 8
These
steps
together
forms
an
NLP
pipeline
.


In [28]:
# Copy the named-entity recognizer from `nlp_source` into the current pipeline.
# add_pipe raises if a component with the same name is already present, so
# guard the call to keep this cell safe to re-run.
if "ner" not in nlp.pipe_names:
    nlp.add_pipe("ner", source=nlp_source)
nlp.pipe_names

['senter', 'ner']

In [64]:
# Slicing a Doc yields a Span (see the displayed type), here the first four tokens.
span=doc[:4]
type(span)

spacy.tokens.span.Span

# A spaCy pipeline contains: tagger (.pos_), parser (.dep_), lemmatizer (.lemma_), ner (doc.ents)

In [29]:
# Load the trained English pipeline and list its component names.
nlp=spacy.load('en_core_web_sm')
nlp.pipe_names
# Alternative left disabled by the author:
#nlp.pipeline

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [30]:
# For each token show: text | coarse POS tag with its explanation | lemma.
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
    pos_column = f"{token.pos_} {spacy.explain(token.pos_)}"
    print(token, pos_column, token.lemma_, sep="  |  ")

Captain  |  PROPN proper noun  |  Captain
america  |  PROPN proper noun  |  america
ate  |  VERB verb  |  eat
100  |  NUM numeral  |  100
$  |  NUM numeral  |  $
of  |  ADP adposition  |  of
samosa  |  PROPN proper noun  |  samosa
.  |  PUNCT punctuation  |  .
Then  |  ADV adverb  |  then
he  |  PRON pronoun  |  he
said  |  VERB verb  |  say
I  |  PRON pronoun  |  I
can  |  AUX auxiliary  |  can
do  |  VERB verb  |  do
this  |  PRON pronoun  |  this
all  |  DET determiner  |  all
day  |  NOUN noun  |  day
.  |  PUNCT punctuation  |  .


In [89]:
# List the entities found in the sentence as: entity text | entity label.
doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")

for entity in doc.ents:
    print(entity.text, entity.label_, sep="  |  ")

Tesla Inc  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [90]:
# Visualize `doc` with displaCy using the entity ("ent") style.
displacy.render(doc, style="ent")

In [33]:
# Visualize `doc` with displaCy using the dependency ("dep") style.
displacy.render(doc, style="dep")

# SpaCy for swedish lang

In [39]:
#!python3 -m spacy download sv_core_news_sm
#!python3 -m spacy download sv_core_news_lg

In [34]:
# Load the Swedish `sv_core_news_sm` pipeline and list its component names.
# Note: this rebinds `nlp`, so later cells use this pipeline until `nlp` is reassigned.
nlp = spacy.load("sv_core_news_sm")
nlp.pipe_names

['tok2vec',
 'tagger',
 'morphologizer',
 'parser',
 'lemmatizer',
 'attribute_ruler',
 'ner']

In [37]:
# For each token of the Swedish sentence show: text | coarse POS tag with its explanation | lemma.
doc = nlp("Tesla Inc kommer att förvärva twitter för 45 miljarder dollar")

for token in doc:
    pos_column = f"{token.pos_} {spacy.explain(token.pos_)}"
    print(token, pos_column, token.lemma_, sep="  |  ")

Tesla  |  ADJ adjective  |  tesla
Inc  |  NOUN noun  |  Inc
kommer  |  AUX auxiliary  |  komma
att  |  PART particle  |  att
förvärva  |  VERB verb  |  förvärva
twitter  |  NOUN noun  |  twitt
för  |  ADP adposition  |  för
45  |  NUM numeral  |  45
miljarder  |  NOUN noun  |  miljard
dollar  |  NOUN noun  |  doll


In [39]:
# Load the `sv_core_news_lg` Swedish pipeline (rebinding `nlp_source`) and
# list the entities it finds as: entity text | entity label.
nlp_source = spacy.load("sv_core_news_lg")

doc = nlp_source("Tesla Inc kommer att förvärva twitter för 45 miljarder dollar")
for entity in doc.ents:
    print(entity.text, entity.label_, sep="  |  ")

45 miljarder dollar  |  MSR


# Customize the NLP object

In [43]:
pizza_order = "gimme double cheese extra large healthy pizza"

# Tokens produced before the tokenizer is customized.
doc = nlp(pizza_order)
tokens = [token.text for token in doc]

# Teach the tokenizer to split "gimme" into the two pieces "gim" + "me".
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])

# A special case only affects text tokenized after it is registered, so the
# sentence has to be run through the pipeline again; reusing the earlier
# `doc` would still show the old tokenization.
doc = nlp(pizza_order)
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Stemming and Lemmatization

In [52]:
# Stem a handful of word forms with NLTK's Porter stemmer and print: word | stem.
stemmer = PorterStemmer()

words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]
for word in words:
    print(word, stemmer.stem(word), sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [53]:
# Same word list, stemmed with NLTK's Snowball stemmer for the chosen language.
language = 'english'
stemmer = SnowballStemmer(language)

words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]
for word in words:
    print(word, stemmer.stem(word), sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  ate
adjustable  |  adjust
rafting  |  raft
ability  |  abil
meeting  |  meet


In [59]:
# Lemmatize the same word forms with spaCy and print: token | lemma.
nlp = spacy.load("en_core_web_sm")

doc = nlp('eating eats eat ate adjustable rafting ability meeting')
for token in doc:
    print(token, token.lemma_, sep="  |  ")

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meeting


# POS vs. Tag
#### POS (Part-of-Speech): Refers to the coarse-grained category of a word, NOUN, VERB, ADJ, ADV 
#### Tag: Refers to the fine-grained category of a word, carrying specific information about its grammatical properties, e.g. NN (singular noun), NNS (plural noun), VBZ (verb, 3rd person singular present)

In [82]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("Elon flew to mars yesterday.")
for token in doc:
    # First row: coarse-grained POS (string, explanation, integer id).
    print(token.text, token.pos_, spacy.explain(token.pos_), token.pos, sep="  |  ")
    # Second row: fine-grained tag (string, explanation, integer id).
    print(token.text, token.tag_, spacy.explain(token.tag_), token.tag, sep="  |  ")

Elon  |  PROPN  |  proper noun  |  96
Elon  |  NNP  |  noun, proper singular  |  15794550382381185553
flew  |  VERB  |  verb  |  100
flew  |  VBD  |  verb, past tense  |  17109001835818727656
to  |  ADP  |  adposition  |  85
to  |  IN  |  conjunction, subordinating or preposition  |  1292078113972184607
mars  |  NOUN  |  noun  |  92
mars  |  NNS  |  noun, plural  |  783433942507015291
yesterday  |  NOUN  |  noun  |  92
yesterday  |  NN  |  noun, singular or mass  |  15308085513773655218
.  |  PUNCT  |  punctuation  |  97
.  |  .  |  punctuation mark, sentence closer  |  12646065887601541794


In [83]:
# Count tokens per POS id, then print: POS name | count.
count = doc.count_by(spacy.attrs.POS)
for pos_id, frequency in count.items():
    # The keys are integer ids; look the id up in the vocab to get its string form.
    pos_name = doc.vocab[pos_id].text
    print(pos_name, frequency, sep="  |  ")

PROPN  |  1
VERB  |  1
ADP  |  1
NOUN  |  2
PUNCT  |  1


## Named Entity Recognition (NER) for Person, Company, Product, Location, Money

In [97]:
# Load the English pipeline and show the labels of its 'ner' component.
nlp=spacy.load('en_core_web_sm')
#nlp.pipe_names
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [118]:
# List the entities as: entity text | label | explanation of the label.
doc = nlp("Tesla Inc is going to acquire Twitter for $45 billion")
for entity in doc.ents:
    label = entity.label_
    print(entity.text, label, spacy.explain(label), sep="  |  ")
    

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
Twitter  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
$45 billion  |  MONEY  |  Monetary values, including unit


# Stop Words

In [128]:
# Number of entries in spaCy's English stop-word set.
len(STOP_WORDS)

326

In [131]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("We just opened our wings, the flying part is coming soon")

# Show which tokens are treated as stop words.
for token in doc:
    if token.is_stop:
        print(token)

# Keep only the tokens that are neither stop words, punctuation, nor bare
# whitespace. The list is deliberately NOT named `filter`: in a notebook that
# would shadow the builtin filter() for every later cell.
content_tokens = [
    token.text
    for token in doc
    if not token.is_stop and not token.is_punct and token.text not in ("\n", " ")
]
final_doc = " ".join(content_tokens)
final_doc

We
just
our
the
part
is


'opened wings flying coming soon'

In [134]:
# Customize stop words: clear the stop-word flag on the vocabulary entry for
# 'part' in the currently loaded `nlp`.
nlp.vocab['part'].is_stop = False