### Understanding Basic NLP Topics
- Tokenization
- Stemming
- Lemmatization
- Stop Words

##### spaCy Setup

In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline; `nlp` is then called on raw text to build a Doc
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a sample sentence (the u'' prefix is redundant on Python 3)
doc = nlp("My name is Kaushal and I am a student.")

In [8]:
# One line per token, three space-separated fields:
#   token.text - the token itself
#   token.pos_ - its part-of-speech tag
#   token.dep_ - its dependency relation
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

My PRON poss
name NOUN nsubj
is AUX ROOT
Kaushal PROPN attr
and CCONJ cc
I PRON nsubj
am AUX conj
a DET det
student NOUN attr
. PUNCT punct


In [9]:
nlp.pipeline # List of (name, component object) tuples, one per pipeline component

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x21b8ded9360>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x21b8ded96c0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x21b8dc45540>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x21b8df6ccc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x21b8dd94fc0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x21b8dc45700>)]

In [10]:
nlp.pipe_names # Names of the pipeline components only, as a list of strings

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [13]:
# The run of extra spaces after "isn't" is kept on purpose: it shows how the
# tokenizer treats whitespace and contractions.
doc2 = nlp("Tesla isn't      looking into startups anymore.")

In [14]:
# Text, part-of-speech tag and dependency relation for every token in doc2.
# In the output, "isn't" is split into "is" + "n't" and the run of spaces
# shows up as its own SPACE token.
for token in doc2:
    print(f"{token.text} {token.pos_} {token.dep_}")

Tesla NOUN nsubj
is AUX ROOT
n't PART neg
      SPACE dep
looking VERB advcl
into ADP prep
startups NOUN pobj
anymore ADV advmod
. PUNCT punct


In [15]:
doc2[0].pos_ # A Doc can be indexed like a list; token 0 is "Tesla", this is its part-of-speech tag

'NOUN'

In [16]:
doc2[1].dep_ # Dependency relation of token 1 ("is")

'ROOT'

In [17]:
doc2[2].is_alpha # Check if the token is alphabetic; token 2 is "n't", for which this is False

False

In [18]:
# A longer sentence with quoted phrases, used below to demonstrate slicing a Doc.
# Adjacent string literals are concatenated, so the text is exactly the
# original single-line string (including its trailing space).
doc3 = nlp(
    'Although commmonly attributed to John Lennon from his song "Beautiful Day" '
    'the phrase "Life is a song" was first used in the movie '
    '"Life Is what happens to us while we are making others plans" '
)

In [19]:
# Slice the Doc by token index (tokens 16 through 29), not by character position.
# NOTE(review): the printed result runs past the closing quote of "Life is a song"
# and into the next quoted phrase - confirm these bounds are the intended ones.
life_quote = doc3[16:30]

In [20]:
print(life_quote) # Printing the slice shows the text it covers

Life is a song" was first used in the movie "Life Is


In [21]:
type(life_quote) # Slicing a Doc yields a Span, not a str or a list of tokens

spacy.tokens.span.Span

In [22]:
type(doc3) # The object returned by nlp(...) is a Doc

spacy.tokens.doc.Doc

In [23]:
# Three short sentences in one string, used to demonstrate sentence segmentation
doc4 = nlp("This is the first sentence. This is the second sentence. This is the third sentence.")

In [24]:
# doc4.sents yields the detected sentences one at a time; print each on its own line
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is the second sentence.
This is the third sentence.


In [27]:
# A notebook cell only displays the value of its LAST expression, so two bare
# expressions on separate lines would silently drop the first result.
# Pair them in a tuple to show both checks:
#   doc4[6] ("This") - is it the start of a sentence?
#   doc4[8] ("the")  - is it the end of a sentence?
doc4[6].is_sent_start, doc4[8].is_sent_end

(True, False)