In [1]:
!pip install spacy



In [2]:
!python -m spacy download en

[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
symbolic link created for C:\Users\Nikkitha\Anaconda3\lib\site-packages\spacy\data\en <<===>> C:\Users\Nikkitha\Anaconda3\lib\site-packages\en_core_web_sm
[+] Linking successful
C:\Users\Nikkitha\Anaconda3\lib\site-packages\en_core_web_sm -->
C:\Users\Nikkitha\Anaconda3\lib\site-packages\spacy\data\en
You can now load the model via spacy.load('en')


Tokenizing the text - breaking text into pieces, and ignoring punctuations

In [18]:
from spacy.lang.en import English
nlp = English()
text = "This series, loosely based Philip K. Dick's novel of the same name, takes a look at what the world might look like had the outcome of World War II turned out differently."
doc = nlp(text)

#create a list of words
token_list = []
for token in doc:
    token_list.append(token.text)
print(token_list)

['This', 'series', ',', 'loosely', 'based', 'Philip', 'K.', 'Dick', "'s", 'novel', 'of', 'the', 'same', 'name', ',', 'takes', 'a', 'look', 'at', 'what', 'the', 'world', 'might', 'look', 'like', 'had', 'the', 'outcome', 'of', 'World', 'War', 'II', 'turned', 'out', 'differently', '.']


Stop words - extremely common words which would appear to be of little value in helping select documents matching a user word.

In [19]:
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

print("List of stop words:%d" %len(spacy_stopwords))
print("Stop words: %s" %list(spacy_stopwords))

List of stop words:326
Stop words: ['yourself', 'never', 'own', 'namely', 'with', 'in', 'can', 'this', 'still', 'again', 'them', 'latter', 'any', 'up', 'but', 'same', 'yours', '‘re', 'his', 'the', "'d", 'another', 'upon', 'twenty', 'take', 'i', 'whom', 'nothing', 'me', 'these', 'part', 'five', 'done', 'just', 'besides', 'became', 'nevertheless', 'together', 'even', '’d', 'always', 'empty', 'hereby', 'sometime', 'ten', 'to', 'becomes', 'could', 'within', '‘ll', 'although', 'from', 'back', 'put', '‘ve', 'everything', 'someone', 'full', 'whither', 'everyone', 'or', 'whereby', 'keep', 'unless', 'elsewhere', 'moreover', 'whether', 'down', 'third', 'would', 'amount', 'a', 'must', 'hundred', 'have', 'sometimes', 'last', 'only', 'give', 'here', 'everywhere', 'was', 'every', 'off', 'whereas', 'above', 'along', "'s", 'am', 'whence', 'whenever', 'thereby', 'due', 'wherever', 'either', 'did', 'fifteen', 'make', "'ve", 'call', 'many', 'often', 'of', 'indeed', 'ourselves', 'whereafter', 'whole', 'wh

In [20]:
from spacy.lang.en.stop_words import STOP_WORDS

filtered_sent = []
doc1 = nlp(text)

for word in doc1:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence: ",filtered_sent)

Filtered Sentence:  [series, ,, loosely, based, Philip, K., Dick, novel, ,, takes, look, world, look, like, outcome, World, War, II, turned, differently, .]


Lemmatization - process of converting a word to its base form. The difference between stemming and lemmatization is, lemmatization considers the context and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.

In [21]:
lem = nlp("run running runs runner")

for word1 in lem:
    print(word1.text,word1.lemma_)

run run
running run
runs run
runner runner


Parts of speech tagging

In [25]:
import en_core_web_sm

nlp = en_core_web_sm.load()

doc2 = nlp("takes a look at what the world might look like had the outcome of World War II turned out differently.")

for word2 in doc2:
    print(word2.text, word2.pos_)

takes VERB
a DET
look NOUN
at ADP
what PRON
the DET
world NOUN
might VERB
look VERB
like ADP
had VERB
the DET
outcome NOUN
of ADP
World PROPN
War PROPN
II PROPN
turned VERB
out PART
differently ADV
. PUNCT


Entity Detection - Identifies important elements like places, people, organizations, and languages within an input string of text.

In [32]:
from spacy import displacy
import spacy

texts = nlp("The Man in the High Castle is an American alternate history television series depicting a parallel universe where the Axis powers win World War II. It was created by Frank Spotnitz and is produced by Amazon Studios, Scott Free Productions, Headline Pictures, Electric Shepherd Productions, and Big Light Productions. The series is based on Philip K. Dick's 1962 novel of the same name.")
for i in texts.ents:
    print(i.text,i.label_)

American NORP
Axis ORG
World War II EVENT
Frank Spotnitz PERSON
Amazon Studios ORG
Scott Free Productions ORG
Headline Pictures ORG
Electric Shepherd Productions ORG
Big Light Productions ORG
Philip K. Dick's PERSON
1962 DATE


In [35]:
displacy.render(texts, style="ent", jupyter=True)

Dependency Parsing - The task of extracting a dependency parse of a sentence that represents its grammatical structure and defines the relationships between “head” words and words, which modify those heads.

In [37]:
document = nlp("People say nothing is impossible, but I do nothing every day")
for chunk in document.noun_chunks:
    print(chunk.text,"#", chunk.root.text,"#", chunk.root.dep_,"#", chunk.root.head.text)

People # People # nsubj # say
nothing # nothing # nsubj # is
I # I # nsubj # do
nothing # nothing # dobj # do


In [38]:
displacy.render(document, style="dep", jupyter=True)