# NLP with spaCy

- Installation: https://spacy.io/usage

~~~
$ pip install -U spacy
$ python -m spacy download en_core_web_sm
~~~

In [8]:
import spacy

## name of the English pipeline package to load
MODEL_NAME = "en_core_web_sm"

## build the pipeline object; calling it on a string processes that text
nlp = spacy.load(MODEL_NAME)

# Tokenization and lemmatization

https://spacy.io/usage/linguistic-features#lemmatization

In [9]:
## sample input: two sentences containing contractions ("it's", "I've", "that's")
text = (
    "I got no car and it's breaking my heart. "
    "But I've found a driver and that's a start"
)

## run the pipeline on the raw string to get the processed document
doc = nlp(text)

In [10]:
## iterating over doc yields one token at a time; show each token's text
for tok in doc:
    print(tok.text)

I
got
no
car
and
it
's
breaking
my
heart
.
But
I
've
found
a
driver
and
that
's
a
start


In [11]:
## token.lemma_ is the lemmatized (base) form; print it next to the token, tab-separated
for tok in doc:
    print(f"{tok.text}\t{tok.lemma_}")

I	I
got	get
no	no
car	car
and	and
it	it
's	be
breaking	break
my	my
heart	heart
.	.
But	but
I	I
've	've
found	find
a	a
driver	driver
and	and
that	that
's	be
a	a
start	start


In [12]:
## collect the lemma of every token in doc into a list
tokens = [tok.lemma_ for tok in doc]
tokens

['I',
 'get',
 'no',
 'car',
 'and',
 'it',
 'be',
 'break',
 'my',
 'heart',
 '.',
 'but',
 'I',
 "'ve",
 'find',
 'a',
 'driver',
 'and',
 'that',
 'be',
 'a',
 'start']

# POS tagging

https://spacy.io/usage/linguistic-features#pos-tagging

In [7]:
## token.pos_ gives the coarse-grained part-of-speech tag (e.g. NOUN, VERB)
for token in doc:
    print(token, token.pos_, sep='\t')

I	PRON
got	VERB
no	DET
car	NOUN
and	CCONJ
it	PRON
's	AUX
breaking	VERB
my	PRON
heart	NOUN
.	PUNCT
But	CCONJ
I	PRON
've	AUX
found	VERB
a	DET
driver	NOUN
and	CCONJ
that	PRON
's	AUX
a	DET
start	NOUN


# Stop words

In [13]:
## token.is_stop is a boolean flag: True when the token is a stop word
for tok in doc:
    print(f"{tok.text}\t{tok.is_stop}")

I	True
got	False
no	True
car	False
and	True
it	True
's	True
breaking	False
my	True
heart	False
.	False
But	True
I	True
've	True
found	False
a	True
driver	False
and	True
that	True
's	True
a	True
start	False


In [14]:
## make lemmatized token list without stopwords
## note: only stop words are dropped; punctuation such as '.' is not a stop
## word, so it stays in the list
tokens = [token.lemma_ for token in doc if not token.is_stop]
tokens

['get', 'car', 'break', 'heart', '.', 'find', 'driver', 'start']