In [23]:
import spacy
from spacy.lang.en.examples import sentences
import pandas as pd
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
#from nltk.stem import PorterStemmer


def parser(text, nlp, columns = ['text','pos','tag','shape', 'lemma', 'alpha','stop']):
    doc = nlp(text)
    features = []
    for token in doc:
        features.append (
            {
                'text':token.text,
                #'lemma': token.lemma_,
                'pos': token.pos_,
                'tag': token.tag_,
                'shape': token.shape_,
                'lemma':token.lemma_, 
                'alpha':token.is_alpha,
                'stop' : token.is_stop

                #'dep': token.dep_
            }
        )
    return pd.DataFrame(features, columns = columns)


In [24]:
text = """Caution prevails over Trump's legal woes and U.S.A-China trade talks""" # Tokenization
parser(text, nlp)




Unnamed: 0,text,pos,tag,shape,lemma,alpha,stop
0,Caution,NOUN,NN,Xxxxx,caution,True,False
1,prevails,VERB,VBZ,xxxx,prevail,True,False
2,over,ADP,IN,xxxx,over,True,True
3,Trump,PROPN,NNP,Xxxxx,trump,True,False
4,'s,PART,POS,'x,'s,False,False
5,legal,ADJ,JJ,xxxx,legal,True,False
6,woes,NOUN,NNS,xxxx,woe,True,False
7,and,CCONJ,CC,xxx,and,True,True
8,U.S.A,PROPN,NNP,X.X.X,u.s.a,False,False
9,-,PUNCT,HYPH,-,-,False,False


In [8]:
#Step1: sentence Segmentation
doc = nlp(u"London is the capital and most populous city of England and United Kingdom. \
          Standing on the river thames in the south east corner of the island of Great Britain, \
          London has been a major settlement for two millennia. It was founded by Romans who named it Londinium.\
          London's ancient core, the city of London, largely retains its 1.12-square-mile medieval boundaries")
for sent in doc.sents:
    print(sent.text)
    

London is the capital and most populous city of England and United Kingdom.           
Standing on the river thames in the south east corner of the island of Great Britain,           London has been a major settlement for two millennia.
It was founded by Romans who named it Londinium.          
London's ancient core, the city of London, largely retains its 1.12-square-mile medieval boundaries


In [7]:
#Sentence Segmentation with custom biundaries...
text = u"this is a sentence...hello...and another sentence."
doc = nlp(text)

print('Before:', [sent.text for sent in doc.sents])

def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == '...':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')
doc = nlp(text)
print('After:', [sent.text for sent in doc.sents])

Before: ['this is a sentence...', 'hello...and another sentence.']
After: ['this is a sentence...', 'hello...', 'and another sentence.']


In [11]:
#Step2 - Tokenization
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)
    

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [26]:
#Step3 POS Tagging
text= "Apple is looking at buying U.K. startup for $1 billion"
parser(text, nlp, columns= ['text', 'pos','tag', 'shape', 'alpha'])
    #print(token.text, token.pos_, token.tag_ )

Unnamed: 0,text,pos,tag,shape,alpha
0,Apple,PROPN,NNP,Xxxxx,True
1,is,VERB,VBZ,xx,True
2,looking,VERB,VBG,xxxx,True
3,at,ADP,IN,xx,True
4,buying,VERB,VBG,xxxx,True
5,U.K.,PROPN,NNP,X.X.,False
6,startup,NOUN,NN,xxxx,True
7,for,ADP,IN,xxx,True
8,$,SYM,$,$,False
9,1,NUM,CD,d,False


In [27]:
#step4 lemmatization

text= "Apple is looking at buying U.K. startup for $1 billion"
parser(text, nlp, columns= ['text', 'pos','tag', 'shape', 'alpha', 'lemma'])


Unnamed: 0,text,pos,tag,shape,alpha,lemma
0,Apple,PROPN,NNP,Xxxxx,True,apple
1,is,VERB,VBZ,xx,True,be
2,looking,VERB,VBG,xxxx,True,look
3,at,ADP,IN,xx,True,at
4,buying,VERB,VBG,xxxx,True,buy
5,U.K.,PROPN,NNP,X.X.,False,u.k.
6,startup,NOUN,NN,xxxx,True,startup
7,for,ADP,IN,xxx,True,for
8,$,SYM,$,$,False,$
9,1,NUM,CD,d,False,1


In [28]:
#Step 5: Identifying Stop Words
text= "Apple is looking at buying U.K. startup for $1 billion"
parser(text, nlp)


Unnamed: 0,text,pos,tag,shape,lemma,alpha,stop
0,Apple,PROPN,NNP,Xxxxx,apple,True,False
1,is,VERB,VBZ,xx,be,True,True
2,looking,VERB,VBG,xxxx,look,True,False
3,at,ADP,IN,xx,at,True,True
4,buying,VERB,VBG,xxxx,buy,True,False
5,U.K.,PROPN,NNP,X.X.,u.k.,False,False
6,startup,NOUN,NN,xxxx,startup,True,False
7,for,ADP,IN,xxx,for,True,True
8,$,SYM,$,$,$,False,False
9,1,NUM,CD,d,1,False,False


In [30]:
#Step6 Dependency Parsing
doc = nlp(u'Rats are various medium-sized, long-tailed rodents.')
displacy.render(doc, style='dep')

ValueError: buffer source array is read-only