In [105]:
# !pip install nltk      # NLTK (Natural Language Toolkit)
# nltk.download('punkt')

# !pip install spacy
# !python3 -m spacy download en      #spacy.load('en_core_web_sm')

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import  sent_tokenize, word_tokenize  
import spacy
from spacy.symbols import ORTH
from spacy import displacy

In [28]:
nlp=spacy.load('en_core_web_sm')
doc=nlp("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")

for sentense in doc.sents:
    print(sentense, len(sentense))
    for word in sentense:
        print(word)

An end to end NLP project consists of many steps. 11
An
end
to
end
NLP
project
consists
of
many
steps
.
These steps together forms an NLP pipeline. 8
These
steps
together
forms
an
NLP
pipeline
.


In [37]:
doc=sent_tokenize("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")
print(doc)

doc=word_tokenize("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")
print(doc)

['An end to end NLP project consists of many steps.', 'These steps together forms an NLP pipeline.']
['An', 'end', 'to', 'end', 'NLP', 'project', 'consists', 'of', 'many', 'steps', '.', 'These', 'steps', 'together', 'forms', 'an', 'NLP', 'pipeline', '.']


# Make a spacy blank object that just have tokenizer

In [62]:
nlp=spacy.blank("en")  #blank lang model just have tokenizer
type(nlp)
nlp.pipe_names  #It is a blank pipeline. we should add different features to it

[]

In [44]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [103]:
# spacy token attribute are discribed below, these (.lemma_, .pos_ , .dep_) are not available for blank spacy

In [104]:
doc[0].text
doc[0].is_currency
doc[0].is_stop
doc[0].i  #token index
doc[0].like_num
doc[0].is_punct
doc[0].is_oov 

True

In [64]:
span=doc[:4]
type(span)

spacy.tokens.span.Span

In [84]:
with open("students.txt") as f:
   text=f.readlines()
    
text_tot=''.join(text)    

In [95]:
doc=nlp(text_tot)
email_list=[]
for token in doc:
    if token.like_email:
        email_list.append(token)

In [96]:
email_list

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

# Customize the nlp object

In [121]:
doc = nlp("gimme double cheese extra large healthy pizza")
tokens=[token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [115]:
nlp.tokenizer.add_special_case("gimme",[{ORTH:'gim'},{ORTH:'me'}])

In [122]:
doc = nlp("gimme double cheese extra large healthy pizza")
tokens=[token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [123]:
doc=nlp("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")

for sentense in doc.sents:
    print(sentense, len(sentense))
    for word in sentense:
        print(word)

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [126]:
nlp.add_pipe('sentencizer')
nlp.pipe_names

ValueError: [E007] 'sentencizer' already exists in pipeline. Existing names: ['sentencizer']

In [128]:
doc=nlp("An end to end NLP project consists of many steps. These steps together forms an NLP pipeline.")

for sentense in doc.sents:
    print(sentense, len(sentense))
    for word in sentense:
        print(word)

An end to end NLP project consists of many steps. 11
An
end
to
end
NLP
project
consists
of
many
steps
.
These steps together forms an NLP pipeline. 8
These
steps
together
forms
an
NLP
pipeline
.


In [129]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [142]:
doc=nlp(text)
url_list=[]
for token in doc:
    if token.like_url:
        url_list.append(token.text)       

In [143]:
url_list

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [144]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

In [148]:
doc=nlp(transactions)
for token in doc:
    if token.like_num:
        if doc[(token.i)+1].is_currency:  
            print(token.text, doc[(token.i)+1])       

two $
500 €


# Make a spacy pipeline contains: tagger(.pos_), parser, lemmatizer(.lemma_),ner(.ent)

In [43]:
nlp=spacy.load('en_core_web_sm')
nlp.pipe_names
#nlp.pipeline

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [44]:
doc = nlp("Captain america ate 100$ of samosa. Then he said I can do this all day.")
for token in doc:
    print(token," | ",token.pos_, spacy.explain(token.pos_), " | ",token.lemma_)

Captain  |  PROPN proper noun  |  Captain
america  |  PROPN proper noun  |  america
ate  |  VERB verb  |  eat
100  |  NUM numeral  |  100
$  |  NUM numeral  |  $
of  |  ADP adposition  |  of
samosa  |  PROPN proper noun  |  samosa
.  |  PUNCT punctuation  |  .
Then  |  ADV adverb  |  then
he  |  PRON pronoun  |  he
said  |  VERB verb  |  say
I  |  PRON pronoun  |  I
can  |  AUX auxiliary  |  can
do  |  VERB verb  |  do
this  |  PRON pronoun  |  this
all  |  DET determiner  |  all
day  |  NOUN noun  |  day
.  |  PUNCT punctuation  |  .


In [162]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
    print(ent.text ," | ", ent.label_)

Tesla Inc  |  ORG
$45 billion  |  MONEY


In [165]:
displacy.render(doc, style="ent")

In [166]:
displacy.render(doc, style="dep")

# add_pipe to the blank pipeline

In [None]:
nlp_source=spacy.load('en_core_web_sm')
nlp=spacy.blank("en")
#nlp.pipeline
nlp.add_pipe("ner", source=nlp_source)
nlp.pipe_names

# spacy for swedish lang processing

In [39]:
#!python3 -m spacy download sv_core_news_sm
#!python3 -m spacy download sv_core_news_lg

In [46]:
nlp = spacy.load("sv_core_news_sm")
nlp.pipe_names

['tok2vec',
 'tagger',
 'morphologizer',
 'parser',
 'lemmatizer',
 'attribute_ruler',
 'ner']

In [47]:
doc = nlp("Tesla Inc kommer att förvärva twitter för 45 miljarder dollar")

for token in doc:
    print(token," | ",token.pos_ , spacy.explain(token.pos_)," | ", token.lemma_)

Tesla  |  ADJ adjective  |  tesla
Inc  |  NOUN noun  |  Inc
kommer  |  AUX auxiliary  |  komma
att  |  PART particle  |  att
förvärva  |  VERB verb  |  förvärva
twitter  |  NOUN noun  |  twitt
för  |  ADP adposition  |  för
45  |  NUM numeral  |  45
miljarder  |  NOUN noun  |  miljard
dollar  |  NOUN noun  |  doll


In [32]:
nlp_source=spacy.load('sv_core_news_lg')

doc = nlp_source("Tesla Inc kommer att förvärva twitter för 45 miljarder dollar")
for ent in doc.ents:
    print(ent.text," | ",ent.label_)

45 miljarder dollar  |  MSR


In [41]:
#read more in https://spacy.io/usage/processing-pipelines#pipelines