## Spacy
spaCy is an open-source software library for advanced Natural Language Processing, written in the programming languages Python and Cython.Unlike NLTK, which is widely used for teaching and research, spaCy focuses on providing software for production usage

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

### TOKENIZATION
One common task in NLP (Natural Language Processing) is tokenization. "Tokens" are usually individual words (at least in languages like English) and "tokenization" is taking a text or set of text and breaking it up into its individual words.

In [7]:
nlp

example1 = nlp("HI My name is Nikita.I will teach you NLP ")
for element in example1 :
    print(element)

HI
My
name
is
Nikita
.
I
will
teach
you
NLP


In [9]:
example2 = nlp("The quick brown fox jumps over the lazy dog")

for token in example2:
    print(token.text)

The
quick
brown
fox
jumps
over
the
lazy
dog


### STEMMING
Stemming is the process of reducing a word to its word stem 
that affixes to suffixes and prefixes or to the roots of words known
as a lemma. 

In [38]:
## from nltk import stem
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()


In [39]:
example = "Cats Running Was"
[stemmer.stem(token) for token in example.split(' ')]

['cat', 'run', 'wa']

##### above example it has splited the data in its core form

In [40]:
example2 = "You better lose yourself in the music, the moment "\
+ "You own it, you better never let it go "\
+ "You only get one shot, do not miss your chance to blow "\
+ "This opportunity comes once in a lifetime "
[stemmer.stem(token) for token in example2.split(' ')]

['you',
 'better',
 'lose',
 'yourself',
 'in',
 'the',
 'music,',
 'the',
 'moment',
 'you',
 'own',
 'it,',
 'you',
 'better',
 'never',
 'let',
 'it',
 'go',
 'you',
 'onli',
 'get',
 'one',
 'shot,',
 'do',
 'not',
 'miss',
 'your',
 'chanc',
 'to',
 'blow',
 'thi',
 'opportun',
 'come',
 'onc',
 'in',
 'a',
 'lifetim',
 '']

In [41]:
print(" ".join(example2))

Y o u   b e t t e r   l o s e   y o u r s e l f   i n   t h e   m u s i c ,   t h e   m o m e n t   Y o u   o w n   i t ,   y o u   b e t t e r   n e v e r   l e t   i t   g o   Y o u   o n l y   g e t   o n e   s h o t ,   d o   n o t   m i s s   y o u r   c h a n c e   t o   b l o w   T h i s   o p p o r t u n i t y   c o m e s   o n c e   i n   a   l i f e t i m e  


### Lemmatization

In [46]:

nlp = spacy.load('en_core_web_sm')

In [47]:
example = nlp("is am are")
for token in example:
    print(token.lemma_)

be
be
be


In [48]:
example2 = "You better lose yourself in the music, the moment "\
+ "You own it, you better never let it go " \
+ "You only get one shot, do not miss your chance to blow "\
+ "This opportunity comes once in a lifetime"

example3 = nlp(example2)

for token in example3:
    print(token.lemma_)


-PRON-
better
lose
-PRON-
in
the
music
,
the
moment
-PRON-
own
-PRON-
,
-PRON-
better
never
let
-PRON-
go
-PRON-
only
get
one
shot
,
do
not
miss
-PRON-
chance
to
blow
this
opportunity
come
once
in
a
lifetime


### VECTORIZATION

The process of converting NLP text into numbers is called vectorization in ML

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, token_pattern=r'\b[^\d\W]+\b')

In [50]:
corpus = ["The dog is on the table", "the cats now are on the table"]
vectorizer.fit(corpus)
print(vectorizer.transform(["The dog is on the table"]).toarray())

[[0 0 1 1 0 1 1 1]]


In [51]:
vocab = vectorizer.vocabulary_

for key in sorted(vocab.keys()):
    print("{}: {}".format(key, vocab[key]))


are: 0
cats: 1
dog: 2
is: 3
now: 4
on: 5
table: 6
the: 7


In [52]:
corpus2 = ["I am jack", "You are john", "I am john"]
vectorizer.fit(corpus2)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\b[^\\d\\W]+\\b',
        tokenizer=None, vocabulary=None)

In [53]:
print(vectorizer.transform(corpus2).toarray())

[[1 0 1 1 0 0]
 [0 1 0 0 1 1]
 [1 0 1 0 1 0]]


In [54]:
vocab = vectorizer.vocabulary_

for key in sorted(vocab.keys()):
    print("{}: {}".format(key, vocab[key]))

am: 0
are: 1
i: 2
jack: 3
john: 4
you: 5


### EMBEDDING

In [55]:
### Embedding (word2vec) references
nlp = spacy.load('en_core_web_lg')

In [56]:
example1 = "man woman king queen"
tokens = nlp(example1)
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))


man man 1.0
man woman 0.7401744
man king 0.40884617
man queen 0.27109137
woman man 0.7401744
woman woman 1.0
woman king 0.26556596
woman queen 0.4066065
king man 0.40884617
king woman 0.26556596
king king 1.0
king queen 0.72526103
queen man 0.27109137
queen woman 0.4066065
queen king 0.72526103
queen queen 1.0


In [57]:
example2 ='table chair book pen pencil';
example2 = nlp(example2)
for token3 in example2:
    for token4 in example2:
        print(token3.text , token4.text ,token1.similarity(token4))

table table 0.29582644
table chair 0.3401051
table book 0.16623081
table pen 0.12119902
table pencil 0.13445242
chair table 0.29582644
chair chair 0.3401051
chair book 0.16623081
chair pen 0.12119902
chair pencil 0.13445242
book table 0.29582644
book chair 0.3401051
book book 0.16623081
book pen 0.12119902
book pencil 0.13445242
pen table 0.29582644
pen chair 0.3401051
pen book 0.16623081
pen pen 0.12119902
pen pencil 0.13445242
pencil table 0.29582644
pencil chair 0.3401051
pencil book 0.16623081
pencil pen 0.12119902
pencil pencil 0.13445242


In [58]:
example1 = "spain russia madrid moscow"
tokens = nlp(example1)
for token1 in tokens:
    for token2 in tokens:
        if(token1.text == token2.text):
            continue
        print(token1.text, token2.text, token1.similarity(token2))


spain russia 0.57819444
spain madrid 0.71929747
spain moscow 0.5162205
russia spain 0.57819444
russia madrid 0.43594515
russia moscow 0.7492537
madrid spain 0.71929747
madrid russia 0.43594515
madrid moscow 0.5473875
moscow spain 0.5162205
moscow russia 0.7492537
moscow madrid 0.5473875


In [59]:
example1 = "cat dog"
tokens = nlp(example1)
for token1 in tokens:
    for token2 in tokens:
        if(token1.text == token2.text):
            continue        
        print(token1.text, token2.text, token1.similarity(token2))


cat dog 0.80168545
dog cat 0.80168545


In [60]:

##Named Entity Recognition (NER)

In [61]:
import spacy

nlp = spacy.load('en_core_web_sm')
example = "Google, a company founded by Larry Page and Sergey Brin in the United States of America "\
+ "has one of the world’s most advanced search engines."

doc = nlp(example)

for ent in doc.ents:
    print(ent.text, ent.label_)

Google ORG
Larry Page PERSON
Sergey Brin ORG
the United States of America GPE
one CARDINAL


In [62]:
example = "U.S. officials are meeting with former Taliban members "\
+ "amid intensifying efforts to wind down America's longest war, three of the "\
+ "militant group's commanders told NBC News."

doc = nlp(example)

for ent in doc.ents:
    print(ent.text, ent.label_)

U.S. GPE
Taliban ORG
America GPE
three CARDINAL
NBC News ORG
