In [2]:
import spacy

In [3]:
# Load the small English pipeline and run it over a two-sentence sample.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Dr. Strange loves pizza of 14th street. Hulk loves KFC burger"
)

In [14]:
# Sentence segmentation: emit each detected sentence on its own line.
for sent in doc.sents:
    print(sent.text)

Dr. Strange loves pizza of 14th street.
Hulk loves KFC burger


In [4]:
# Word tokenization: walk the sentences in order and print every token,
# one per line.
for tok in (tok for sent in doc.sents for tok in sent):
    print(tok)

Dr.
Strange
loves
pizza
of
14th
street
.
Hulk
loves
KFC
burger


In [7]:
import nltk
# The NLTK tokenizers used in the following cells need the 'punkt' data
# package; it is a separate download, and without it those cells raise an error.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\talha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
from nltk.tokenize import sent_tokenize

# Same sample through NLTK's sentence splitter. Compare with the spaCy result:
# here the abbreviation "Dr." is treated as a sentence of its own.
sent_tokenize(
    "Dr. Strange loves pizza of 14th street. Hulk loves KFC burger"
)

['Dr.', 'Strange loves pizza of 14th street.', 'Hulk loves KFC burger']

In [9]:
from nltk.tokenize import word_tokenize

# Same sample through NLTK's word tokenizer. Note that it separates "Dr" from
# its trailing period, whereas spaCy keeps "Dr." as a single token.
word_tokenize(
    "Dr. Strange loves pizza of 14th street. Hulk loves KFC burger"
)

['Dr',
 '.',
 'Strange',
 'loves',
 'pizza',
 'of',
 '14th',
 'street',
 '.',
 'Hulk',
 'loves',
 'KFC',
 'burger']

In [29]:
# Switch to a blank English pipeline and tokenize a sentence that contains
# an ordinal, a plain number, and a currency symbol.
nlp = spacy.blank("en")
doc = nlp(
    "Dr. Strange loves pizza of 14th street and price of large pizza is 15 $."
)

In [44]:
# One row per token: text, then the is_alpha / like_num / is_currency flags.
for tok in doc:
    flags = (tok.is_alpha, tok.like_num, tok.is_currency)
    print(tok, *flags)

Dr. False False False
Strange True False False
loves True False False
pizza True False False
of True False False
14th False True False
street True False False
and True False False
price True False False
of True False False
large True False False
pizza True False False
is True False False
15 False True False
$ False False True
. False False False


In [31]:
# Doc objects support integer indexing; take the token at index 1 and
# display it as the cell result.
token = doc[1]
token

Strange

In [32]:
# Show the class of the pipeline object.
type(nlp)

spacy.lang.en.English

In [33]:
# Show the class of the processed document.
type(doc)

spacy.tokens.doc.Doc

In [34]:
# List every attribute and method exposed by the token object
# (is_alpha, like_num, is_currency, ... are all discoverable here).
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [38]:
# Check the is_alpha flag of the token selected earlier.
token.is_alpha

True

In [39]:
# Index 13 is the "15" token of the price sentence; like_num is True for
# tokens that look like numbers.
# (A bare `token1` line in the middle of the cell was dropped: only the last
# expression of a cell is displayed, so it had no effect.)
token1 = doc[13]
token1.like_num

True

In [42]:
# Index 14 is the "$" token of the price sentence; is_currency is True for
# currency symbols.
# (A bare `token2` line in the middle of the cell was dropped: only the last
# expression of a cell is displayed, so it had no effect.)
token2 = doc[14]
token2.is_currency

True

In [43]:
# Detailed per-token report: position in the doc plus four lexical flags.
# The double space after "index:" is intentional; it reproduces the spacing
# of the report format.
for tok in doc:
    print(
        f"{tok.text} ==> index:  {tok.i} "
        f"is_alpha: {tok.is_alpha} "
        f"is_punct: {tok.is_punct} "
        f"like_num: {tok.like_num} "
        f"is_currency: {tok.is_currency}"
    )

Dr. ==> index:  0 is_alpha: False is_punct: False like_num: False is_currency: False
Strange ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
loves ==> index:  2 is_alpha: True is_punct: False like_num: False is_currency: False
pizza ==> index:  3 is_alpha: True is_punct: False like_num: False is_currency: False
of ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
14th ==> index:  5 is_alpha: False is_punct: False like_num: True is_currency: False
street ==> index:  6 is_alpha: True is_punct: False like_num: False is_currency: False
and ==> index:  7 is_alpha: True is_punct: False like_num: False is_currency: False
price ==> index:  8 is_alpha: True is_punct: False like_num: False is_currency: False
of ==> index:  9 is_alpha: True is_punct: False like_num: False is_currency: False
large ==> index:  10 is_alpha: True is_punct: False like_num: False is_currency: False
pizza ==> index:  11 is_alpha: True is_punct: False like_num

In [45]:
# Baseline tokenization: "gimme" comes out as a single token by default.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [46]:
from spacy.symbols import ORTH

# Register a tokenizer exception so that "gimme" is emitted as "gim" + "me".
# A special case can only split the text into pieces; it cannot change the
# underlying characters.
gimme_pieces = [{ORTH: piece} for piece in ("gim", "me")]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

# Exercise

In [47]:
# Exercise input: a paragraph containing several URLs, to be extracted with
# the token-level like_url flag.
# NOTE(review): the hard line break inside "http://www.science.\ngov/" splits
# that URL, so only "http://www.science" is reported as URL-like in the
# results of the following cells. Join the two lines if the full URL is wanted.
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)

In [51]:
# Collect every token flagged as URL-like, using an explicit loop.
# The result list gets its own descriptive name instead of reusing `token`,
# and the loop variable is named for what it holds (a Token, not an index).
url_tokens = []
for tok in doc:
    if tok.like_url:
        url_tokens.append(tok)
url_tokens

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [53]:
# Same URL extraction as a list comprehension. The result is bound to a
# descriptive name rather than `token`, which was also the comprehension
# variable and made the line hard to read.
url_tokens = [tok for tok in doc if tok.like_url]
url_tokens

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [54]:
# Exercise input: amounts written both as a word ("two") and as digits
# ("500"), each followed by a currency symbol.
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)

In [57]:
# Print every "<number> <currency>" pair: a number-like token immediately
# followed by a currency symbol.
# The bounds check matters: without it, a text that ends in a number-like
# token would index one past the end of the doc and raise IndexError.
for token in doc:
    next_index = token.i + 1
    if (
        token.like_num
        and next_index < len(doc)
        and doc[next_index].is_currency
    ):
        print(token.text, doc[next_index].text)

two $
500 €


In [4]:
# Reload the trained English pipeline (this replaces the blank pipeline and
# its custom "gimme" rule), then show part-of-speech tag and lemma per token.
nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme double cheese extra large healthy pizza")
for tok in doc:
    print("  |  ".join((tok.text, tok.pos_, tok.lemma_)))

gimme  |  ADJ  |  gimme
double  |  ADJ  |  double
cheese  |  NOUN  |  cheese
extra  |  ADJ  |  extra
large  |  ADJ  |  large
healthy  |  ADJ  |  healthy
pizza  |  NOUN  |  pizza


In [5]:
set = nlp("BMW is generation 4.5 billion $ revenue")
for ent in set.ents:
    print(ent , " | ", ent.label_, " | ", spacy.explain(ent.label_))

BMW  |  ORG  |  Companies, agencies, institutions, etc.
4.5 billion $  |  MONEY  |  Monetary values, including unit


In [8]:
from spacy import displacy

displacy.render(set, style='ent')

In [10]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']