In [70]:
import spacy
from spacy.lang.en.examples import sentences
import pandas as pd
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
#from nltk.stem import PorterStemmer


def parser(text, nlp, columns = ['text','pos','tag','shape', 'lemma', 'alpha','stop']):
    """Run ``nlp`` over ``text`` and tabulate per-token attributes.

    Parameters
    ----------
    text
        Input handed to ``nlp`` unchanged.
    nlp
        Callable returning an iterable of tokens that expose ``text``,
        ``pos_``, ``tag_``, ``shape_``, ``lemma_``, ``is_alpha`` and
        ``is_stop``.
    columns
        Names (and order) of the collected fields to keep in the result.
        The default list is shared between calls; it is only read here.

    Returns
    -------
    pandas.DataFrame
        One row per token, restricted to ``columns``.
    """
    def describe(token):
        # Map each token attribute to the column name used in the table.
        return {
            'text': token.text,
            'pos': token.pos_,
            'tag': token.tag_,
            'shape': token.shape_,
            'lemma': token.lemma_,
            'alpha': token.is_alpha,
            'stop': token.is_stop,
        }

    rows = [describe(token) for token in nlp(text)]
    return pd.DataFrame(rows, columns = columns)


In [65]:
# Step 1: sentence segmentation.
# Adjacent string literals are concatenated verbatim, so every fragment that is
# followed by another one ends with a space; without it the sentences are glued
# together in the input text (e.g. "Kingdom.Standing").
doc = nlp(u"London is the capital and most populous city of England and United Kingdom. "
          u"Standing on the river thames in the south east corner of the island of Great Britain, London has been a major settlement for two millennia. It was founded by Romans who named it Londinium. "
          u"London's ancient core, the city of London, largely retains its 1.12-square-mile medieval boundaries.")
for sent in doc.sents:
    print(sent.text)
    # print('\n') emits two newlines, leaving two blank lines between sentences.
    print('\n')
    

London is the capital and most populous city of England and United Kingdom.


Standing on the river thames in the south east corner of the island of Great Britain, London has been a major settlement for two millennia.


It was founded by Romans who named it Londinium.


London's ancient core, the city of London, largely retains its 1.12-square-mile medieval boundaries.




In [45]:
def set_custom_boundaries(doc):
    """Flag the token following every '...' token as a sentence start.

    The last token is skipped when searching because nothing follows it.
    ``doc`` is modified in place and also returned.
    """
    ellipsis_tokens = [tok for tok in doc[:-1] if tok.text == '...']
    for tok in ellipsis_tokens:
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [60]:
# Sentence segmentation with custom boundaries -- "before" (baseline) run.
text = u"This is a sentence.hello...this is another sentence."
doc = nlp(text)

print('Before:')
for sentence_span in doc.sents:
    print(sentence_span.text)

    

Before:
This is a sentence.hello...this is another sentence.


In [61]:
# Sentence segmentation with custom boundaries -- "after" run.

# Register the component only when it is not already in the pipeline, so the
# cell can be re-run without add_pipe failing on a duplicate component name.
if not nlp.has_pipe('custom_set_boundaries'):
    nlp.add_pipe(set_custom_boundaries, name='custom_set_boundaries', before='parser')

try:
    doc = nlp(text)
    print('After:')
    for sent in doc.sents:
        print(sent.text)
finally:
    # Always take the component back out, even if the run above fails, so the
    # shared `nlp` pipeline is left as it was for the cells that follow.
    if nlp.has_pipe('custom_set_boundaries'):
        nlp.remove_pipe('custom_set_boundaries')

After:
This is a sentence.hello...this is another sentence.


In [11]:
# Step 2: tokenization -- print every token of the sentence on its own line.
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for tok in doc:
    print(tok.text)
    

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [26]:
# Step 3: part-of-speech tagging -- show the POS-related token attributes.
text = "Apple is looking at buying U.K. startup for $1 billion"
pos_columns = ['text', 'pos', 'tag', 'shape', 'alpha']
parser(text, nlp, columns=pos_columns)

Unnamed: 0,text,pos,tag,shape,alpha
0,Apple,PROPN,NNP,Xxxxx,True
1,is,VERB,VBZ,xx,True
2,looking,VERB,VBG,xxxx,True
3,at,ADP,IN,xx,True
4,buying,VERB,VBG,xxxx,True
5,U.K.,PROPN,NNP,X.X.,False
6,startup,NOUN,NN,xxxx,True
7,for,ADP,IN,xxx,True
8,$,SYM,$,$,False
9,1,NUM,CD,d,False


In [27]:
# Step 4: lemmatization -- token attributes plus the 'lemma' column.
text = "Apple is looking at buying U.K. startup for $1 billion"
lemma_columns = ['text', 'pos', 'tag', 'shape', 'alpha', 'lemma']
parser(text, nlp, columns=lemma_columns)


Unnamed: 0,text,pos,tag,shape,alpha,lemma
0,Apple,PROPN,NNP,Xxxxx,True,apple
1,is,VERB,VBZ,xx,True,be
2,looking,VERB,VBG,xxxx,True,look
3,at,ADP,IN,xx,True,at
4,buying,VERB,VBG,xxxx,True,buy
5,U.K.,PROPN,NNP,X.X.,False,u.k.
6,startup,NOUN,NN,xxxx,True,startup
7,for,ADP,IN,xxx,True,for
8,$,SYM,$,$,False,$
9,1,NUM,CD,d,False,1


In [28]:
# Step 5: identifying stop words -- no column list is passed, so parser's
# default columns are used.
text = "Apple is looking at buying U.K. startup for $1 billion"
parser(text=text, nlp=nlp)


Unnamed: 0,text,pos,tag,shape,lemma,alpha,stop
0,Apple,PROPN,NNP,Xxxxx,apple,True,False
1,is,VERB,VBZ,xx,be,True,True
2,looking,VERB,VBG,xxxx,look,True,False
3,at,ADP,IN,xx,at,True,True
4,buying,VERB,VBG,xxxx,buy,True,False
5,U.K.,PROPN,NNP,X.X.,u.k.,False,False
6,startup,NOUN,NN,xxxx,startup,True,False
7,for,ADP,IN,xxx,for,True,True
8,$,SYM,$,$,$,False,False
9,1,NUM,CD,d,1,False,False


In [63]:
#Step6 Dependency Parsing
def parseTree(text, nlp):
    """Tabulate the dependency relation of every token ``nlp`` finds in ``text``.

    Parameters
    ----------
    text
        Input handed to ``nlp`` unchanged.
    nlp
        Callable returning an iterable of tokens that expose ``text``,
        ``dep_``, ``head`` (with ``text`` and ``pos_``) and ``children``.

    Returns
    -------
    pandas.DataFrame
        One row per token with the columns ``text``, ``dep``, ``head``,
        ``head.pos`` and ``children`` (a list of the token's children),
        in that order. The columns are present even when there are no tokens.
    """
    ordered_columns = ['text', 'dep', 'head', 'head.pos', 'children']
    rows = [
        {
            'text': token.text,
            'dep': token.dep_,
            'head': token.head.text,
            'head.pos': token.head.pos_,
            'children': list(token.children),
        }
        for token in nlp(text)
    ]
    # Give the column list to the constructor instead of indexing afterwards:
    # selecting these columns from a frame built from zero rows raises
    # KeyError because that frame has no columns at all.
    return pd.DataFrame(rows, columns=ordered_columns)

# Example sentence whose dependency relations are tabulated.
text = "Apple opens new store in U.A.E Dubai city Saturday"
parseTree(text=text, nlp=nlp)

Unnamed: 0,text,dep,head,head.pos,children
0,Apple,nsubj,opens,VERB,[]
1,opens,ROOT,opens,VERB,"[Apple, store, Saturday]"
2,new,amod,store,NOUN,[]
3,store,dobj,opens,VERB,"[new, in]"
4,in,prep,store,NOUN,[city]
5,U.A.E,compound,Dubai,PROPN,[]
6,Dubai,compound,city,NOUN,[U.A.E]
7,city,pobj,in,ADP,[Dubai]
8,Saturday,npadvmod,opens,VERB,[]


In [86]:
#Step - Word Vectors and semantic similarity

def semanticParser(text, nlp, columns):
    """Tabulate the pairwise similarity of all tokens ``nlp`` finds in ``text``.

    Every ordered pair of tokens is listed, including each token paired with
    itself.

    Parameters
    ----------
    text
        Input handed to ``nlp`` unchanged.
    nlp
        Callable returning a re-iterable sequence of tokens that expose
        ``text`` and a ``similarity(other)`` method.
    columns
        Accepted for call compatibility but not used; the output columns are
        fixed (see Returns).

    Returns
    -------
    pandas.DataFrame
        Columns ``token1.text``, ``token2.text`` and ``tokens.similarity%``
        (the value returned by ``token1.similarity(token2)``), one row per
        ordered token pair. The columns are present even when there are no
        tokens.
    """
    doc = nlp(text)
    rows = [
        {
            'token1.text': token1.text,
            'token2.text': token2.text,
            'tokens.similarity%': token1.similarity(token2),
        }
        for token1 in doc
        for token2 in doc
    ]
    # An explicit column list keeps the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=['token1.text', 'token2.text', 'tokens.similarity%'])

text = """dog cat banana"""
# The column labels mirror the keys semanticParser actually produces
# (the similarity column is named 'tokens.similarity%').
semanticParser(text, nlp, columns=['token1.text', 'token2.text', 'tokens.similarity%'])


Unnamed: 0,token1.text,token2.text,tokens.similarity%
0,dog,dog,1.0
1,dog,cat,0.53907
2,dog,banana,0.28761
3,cat,dog,0.53907
4,cat,cat,1.0
5,cat,banana,0.487522
6,banana,dog,0.28761
7,banana,cat,0.487522
8,banana,banana,1.0


In [98]:
def findVectorNorm(text, nlp, columns):
    """Tabulate word-vector information for every token ``nlp`` finds in ``text``.

    Parameters
    ----------
    text
        Input handed to ``nlp`` unchanged.
    nlp
        Callable returning an iterable of tokens that expose ``text``,
        ``has_vector``, ``vector_norm`` and ``is_oov``.
    columns
        Accepted for call compatibility but not used; the output columns are
        fixed (see Returns).

    Returns
    -------
    pandas.DataFrame
        Columns ``text``, ``token_has_vector``, ``token_isoov`` and
        ``token_vector_norm``, in that order, one row per token. The columns
        are present even when there are no tokens.
    """
    rows = [
        {
            'text': token.text,
            'token_has_vector': token.has_vector,
            'token_isoov': token.is_oov,
            'token_vector_norm': token.vector_norm,
        }
        for token in nlp(text)
    ]
    # Pin the column order to the one shown in the notebook's recorded output
    # instead of relying on how the DataFrame constructor orders dict keys;
    # this also keeps the schema when there are no rows.
    ordered_columns = ['text', 'token_has_vector', 'token_isoov', 'token_vector_norm']
    return pd.DataFrame(rows, columns=ordered_columns)



text = """dog cat banana gggddd"""
# The column labels mirror the keys findVectorNorm actually produces.
findVectorNorm(text, nlp, columns=['text', 'token_vector_norm', 'token_isoov', 'token_has_vector'])


Unnamed: 0,text,token_has_vector,token_isoov,token_vector_norm
0,dog,True,True,23.863817
1,cat,True,True,24.438457
2,banana,True,True,24.115486
3,gggddd,True,True,22.816465


In [110]:
#step7 Named Entity Recognition


def findNER(doc, nlp, columns):
    """Tabulate the named entities found on an already-processed document.

    Parameters
    ----------
    doc
        Object whose ``ents`` attribute is an iterable of entities exposing
        ``text`` and ``label_``.
    nlp
        Not used by this function; kept so existing calls keep working.
    columns
        Not used by this function; the output columns are fixed (see Returns).

    Returns
    -------
    pandas.DataFrame
        Columns ``ent.text`` and ``ent.type`` (the entity's ``label_``), one
        row per entity. The columns are present even when the document has
        no entities.
    """
    rows = [
        {
            'ent.text': ent.text,
            'ent.type': ent.label_,
        }
        for ent in doc.ents
    ]
    # An explicit column list keeps the schema stable when no entity is found.
    return pd.DataFrame(rows, columns=['ent.text', 'ent.type'])

doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

# Pass the column labels as strings. The previous `ent.text` / `ent.label_`
# referred to a variable `ent` that no cell defines, which raises NameError
# on a fresh kernel.
findNER(doc, nlp, columns=['ent.text', 'ent.type'])




Unnamed: 0,ent.text,ent.type
0,Apple,ORG
1,U.K.,GPE
2,$1 billion,MONEY


In [111]:
# Step 8 (originally labelled "co reference resolution"): this cell only lists
# the named entities of the longer passage via findNER; it performs no
# coreference resolution itself.
# Adjacent string literals are concatenated verbatim, so every fragment that is
# followed by another one ends with a space to keep the sentences separated.
doc = nlp(u"London is the capital and most populous city of England and United Kingdom. "
          u"Standing on the river thames in the south east corner of the island of Great Britain, London has been a major settlement for two millennia. It was founded by Romans who named it Londinium. "
          u"London's ancient core, the city of London, largely retains its 1.12-square-mile medieval boundaries.")

# Column labels are plain strings; the previous `ent.text` / `ent.label_`
# referred to a variable `ent` that no cell defines (NameError on a fresh kernel).
findNER(doc, nlp, columns=['ent.text', 'ent.type'])

Unnamed: 0,ent.text,ent.type
0,London,GPE
1,England,GPE
2,United Kingdom,GPE
3,Great Britain,GPE
4,London,GPE
5,two millennia,DATE
6,Romans,NORP
7,Londinium,PERSON
8,London,GPE
9,London,GPE
