In [27]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import spacy
from spacy import displacy

In [2]:
class Article:
    def __init__(self, id, title, content, link):
        self.id = id
        self.title = title
        self.content = content
        self.link = link

In [10]:
def fetchArticles():
    files = ["xaa","xab","xac","xad","xae"]
    articles = {}
    for filename in files:
        with open(filename) as data_file:
            fileData = data_file.readlines()
        for articleData in fileData:
            articleComponents = articleData.split("\t")
            articles[articleComponents[0]] = (Article(articleComponents[0], articleComponents[1], articleComponents[2], articleComponents[3]))
            
    return articles

In [24]:
nlp = spacy.load('en_core_web_sm')
SPACY_DEP_ROOT = "ROOT"
SPACY_DEP_NSUBJ = "nsubj"
SPACY_DEP_NSUBJ_PASS = "nsubjpass"
SPACY_DEP_DOBJ = "dobj"
SPACY_DEP_PREP = "prep"
SPACY_DEP_PREP_OBJ = "pobj"
SPACY_DEP_CCOMP = "ccomp"
SPACY_DEP_IND_OBJ_1 = "dative"
SPACY_DEP_IND_OBJ_2 = "iobj"

SPACY_TAG_PREP = "IN"
SPACY_TAG_NN = "NN"
SPACY_TAG_NNS = "NNS"
SPACY_TAG_NNP = "NNP"
SPACY_TAG_NNPS = "NNPS"
SPACY_TAG_JJ = "JJ"
SPACY_TAG_JJR = "JJR"
SPACY_TAG_JJS = "JJS"
SPACY_TAG_VB = "VB"
SPACY_TAG_VBD = "VBD"
SPACY_TAG_VBG = "VBG"
SPACY_TAG_VBN = "VBN"
SPACY_TAG_VBP = "VBP"
SPACY_TAG_VBZ = "VBZ"

SPACY_NER_MONEY = "MONEY"
SPACY_NER_CARDINAL = "CARDINAL"

PREP_FOR = "for"
PREP_TO = "to"
PREP_FROM = "from"
PREP_OVER = "over"
PREP_UNTIL = "until"
PREP_AFTER = "after"
PREP_AS = "as"

def getSubTreeString(token):
    return ' '.join([str(token1.text) for token1 in list(token.subtree)])

def getHeadOfSentence(sentence):

    doc = nlp(str(sentence))
    head = None
    for token in doc:
        if token.dep_ == SPACY_DEP_ROOT:
            #print ("head",sentence, token.text)
            head = token
            
    return head  

def printTree(sentence):
    doc = nlp(str(sentence))
 
    for token in doc:
        print("{5}: {0}/{1} <--{2}-- {3}/{4} > {6};  type {7}".format(
           token.text, token.tag_, token.dep_, token.head.text, token.head.tag_,token.i,token.head.i, token.ent_id_))
        
        print(list(token.subtree))
        print ('**********')
    
def printNamedEntities(sentence):
    doc = nlp(sentence)
    print ("Named entities\n")
    for ent in list(doc.ents):
        print(ent.text, ent.label_)
        
def displayTree(sentence):
    doc = nlp(str(sentence))
    displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

In [25]:
def pipeline(records):
    wordnet_lemmatizer = WordNetLemmatizer() 
    for record in records:
        sentences = nltk.sent_tokenize(record)
        for sentence in sentences:
            words = nltk.word_tokenize(sentence)
            print(sentence)
            print(nltk.pos_tag(sentence))
            displayTree(sentence)
            for word in words:
                print("Word: ", word)
                print("Lemma: ", wordnet_lemmatizer.lemmatize(word))
                for synset in wordnet.synsets(word):
                    print("\nHYPERNYMS\n")
                    print (synset, synset.hypernyms())
                    print("\nHYPONYMS\n")
                    print (synset, synset.hyponyms())
                    print("\MERONYMS\n")
                    print (synset, synset.part_meronyms())
                    print (synset, synset.substance_meronyms())
                    print("\nHOLONYMS\n")
                    print (synset, synset.part_holonyms())
                    print (synset, synset.substance_holonyms())
          
    print("*********")

In [28]:
# Printing Sentences and words
articles = fetchArticles()
allTitles = [article.title for article in articles.values()]
titles = ["Sachin was paid 1000$"]
pipeline(titles)


Sachin was paid 1000$
[('S', 'VB'), ('a', 'DT'), ('c', 'NN'), ('h', 'NN'), ('i', 'JJ'), ('n', 'VBP'), (' ', 'JJ'), ('w', 'VBP'), ('a', 'DT'), ('s', 'NN'), (' ', 'NNP'), ('p', 'VBZ'), ('a', 'DT'), ('i', 'JJ'), ('d', 'NN'), (' ', 'VBD'), ('1', 'CD'), ('0', 'CD'), ('0', 'CD'), ('0', 'CD'), ('$', '$')]


Word:  Sachin
Lemma:  Sachin
Word:  was
Lemma:  wa

HYPERNYMS

Synset('washington.n.02') []

HYPONYMS

Synset('washington.n.02') []
\MERONYMS

Synset('washington.n.02') [Synset('aberdeen.n.01'), Synset('adams.n.04'), Synset('bellingham.n.01'), Synset('cape_flattery.n.01'), Synset('columbia.n.01'), Synset('inland_passage.n.01'), Synset('kennewick.n.01'), Synset('lake_chelan.n.01'), Synset('mount_ranier_national_park.n.01'), Synset('mount_saint_helens.n.01'), Synset('north_cascades_national_park.n.01'), Synset('olympia.n.01'), Synset('olympic_national_park.n.01'), Synset('pacific_northwest.n.01'), Synset('puget_sound.n.01'), Synset('ranier.n.01'), Synset('scablands.n.01'), Synset('seattle.n.01'), Synset('snake.n.03'), Synset('spokane.n.01'), Synset('tacoma.n.01'), Synset('vancouver.n.02'), Synset('walla_walla.n.01'), Synset('yakima.n.01')]
Synset('washington.n.02') []

HOLONYMS

Synset('washington.n.02') [Synset('united_states.n.01')]
Synset('washington.n.02') []

HYPERNYMS

Synset('be.v

In [12]:
fd = open('allTitles', 'w')
for sentence in allTitles:
    fd.write(sentence)
    fd.write('\n')
fd.close()

In [None]:
#wc -w allTitles