In [1]:
from stanfordcorenlp import StanfordCoreNLP
from load_glove_matrix import Glove

In [2]:
# Load the pre-trained 300-d GloVe vectors through the project-local Glove helper.
# The recorded output of this cell shows load_model() reading
# ./data/glove.840B.300d.txt and reporting 2196017 vectors loaded.
# glove_model is used later by the similarity() helper.
glove_model = Glove()
glove_model.load_model()

Loading pretrained Glove vectors from file ./data/glove.840B.300d.txt
  --9.11%  loaded.
  --18.21%  loaded.
  --27.32%  loaded.
  --36.43%  loaded.
  --45.54%  loaded.
  --54.64%  loaded.
  --63.75%  loaded.
  --72.86%  loaded.
  --81.97%  loaded.
  --91.07%  loaded.
Finished loading Glove model. 2196017 vectors loaded


In [3]:
# helpers: select the unique (word, part-of-speech) pairs for given tags, and split pairs into words or tags
def find_words_pos(words, tags):
    """Return the unique (word, POS) pairs whose tag appears in ``tags``.

    Args:
        words: Iterable of ``(word, pos_tag)`` pairs. A dict mapping a sentence
            index to such a list (the shape of the notebook's ``pos`` variable)
            is also accepted; its per-sentence lists are concatenated in the
            dict's iteration order before de-duplication.
        tags: Iterable of POS tags to keep, e.g. ``['NN', 'NNS']``.

    Returns:
        A list of pairs grouped by tag in the order given by ``tags``; within
        one tag, pairs keep the order of their first occurrence in ``words``.
        A tag listed twice in ``tags`` contributes its pairs twice.
    """
    if isinstance(words, dict):
        # Iterating a dict yields its keys (sentence indices), which cannot be
        # indexed like a pair, so flatten the per-sentence lists instead.
        pairs = [pair for sentence in words.values() for pair in sentence]
    else:
        pairs = list(words)

    try:
        # dict.fromkeys removes duplicates in O(n) and keeps first-seen order.
        unique_words = list(dict.fromkeys(pairs))
    except TypeError:
        # Unhashable pairs (e.g. [word, tag] lists): fall back to a linear scan.
        unique_words = []
        for pair in pairs:
            if pair not in unique_words:
                unique_words.append(pair)

    return [pair for tag in tags for pair in unique_words if pair[1] == tag]

def find_words(words, trigger):
    """Project every (word, POS) pair onto one of its two fields.

    ``trigger`` picks the field: ``'word'`` keeps element 0 of each pair and
    ``'pos'`` keeps element 1. Any other trigger prints a warning (only when
    ``words`` yields at least one item) and produces an empty list.
    """
    if trigger not in ('word', 'pos'):
        # Warn at most once, and only if there is something to iterate over.
        for _ in words:
            print("Invalid trigger should be 'word' or 'pos'")
            break
        return []

    field = 0 if trigger == 'word' else 1
    return [pair[field] for pair in words]

In [6]:
# Annotate the first sentences of ./data/sentence.txt with Stanford CoreNLP
# (POS tags, dependency parse, sentiment) and save the raw XML responses.
# Download CoreNLP 3.9.1 from https://stanfordnlp.github.io/CoreNLP/history.html

# Per-sentence results keyed by line index. NOTE: the assignments that fill
# these dicts are currently commented out in the loop below, so they stay
# empty after this cell runs.
dep = dict()        # dependency-parsing results
pos = dict()        # part-of-speech tagging results
sentiment = dict()

# NOTE(review): machine-specific absolute path; adjust to the local CoreNLP install.
CORENLP_DIR = r'C:\Users\sdzar\Documents\GitHub\CNIT581-NLT\data\stanford-corenlp-full-2018-02-27'
MAX_SENTENCES = 2         # stop after this many input lines
EXPECTED_SENTENCES = 200  # denominator used for the progress percentage

# The request properties do not change between sentences, so build them once.
props = {'annotators': 'pos,depparse,sentiment', 'pipelineLanguage': 'en', 'outputFormat': 'xml'}

print("Start preparing words...")
nlp = StanfordCoreNLP(CORENLP_DIR, memory='8g')
try:
    # The input corpus can be swapped for any other one-sentence-per-line file.
    # The XML responses declare encoding="UTF-8", so the output is written as
    # UTF-8 instead of the platform default encoding.
    # NOTE: every response is a complete XML document, so the output file holds
    # several documents back to back.
    with open("./data/parsing_results.xml", "w+", encoding="utf-8") as f, \
            open("./data/sentence.txt", "r") as file:
        for i, line in enumerate(file):
            if i == MAX_SENTENCES:
                break
            xml = nlp.annotate(line, properties=props)
            print(xml)
            f.write(xml)
            #dep[i] = nlp.dependency_parse(line)
            #pos[i] = nlp.pos_tag(line)
            # sentiment[i] = nlp.annotate(line, properties=sentiment)
            if (i + 1) % 10 == 0:
                print("  --{}%  loaded.".format(round(i / EXPECTED_SENTENCES * 100, 2)))
finally:
    # Always shut the backend server down, even if annotation fails part-way:
    # it consumes a lot of memory while running.
    nlp.close()
print("Done!")

Start preparing words...
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet href="CoreNLP-to-HTML.xsl" type="text/xsl"?>
<root><document><sentences><sentence id="1" sentimentValue="3" sentiment="Positive"><tokens><token id="1"><word>I</word><CharacterOffsetBegin>0</CharacterOffsetBegin><CharacterOffsetEnd>1</CharacterOffsetEnd><POS>PRP</POS><sentiment>Neutral</sentiment></token><token id="2"><word>love</word><CharacterOffsetBegin>2</CharacterOffsetBegin><CharacterOffsetEnd>6</CharacterOffsetEnd><POS>VBP</POS><sentiment>Very positive</sentiment></token><token id="3"><word>dogs</word><CharacterOffsetBegin>7</CharacterOffsetBegin><CharacterOffsetEnd>11</CharacterOffsetEnd><POS>NNS</POS><sentiment>Positive</sentiment></token><token id="4"><word>.</word><CharacterOffsetBegin>11</CharacterOffsetBegin><CharacterOffsetEnd>12</CharacterOffsetEnd><POS>.</POS><sentiment>Neutral</sentiment></token></tokens><parse>(ROOT&#x0D;
  (S&#x0D;
    (NP (PRP I))&#x0D;
    (VP (VBP love)&#x0D;
      (NP

In [5]:
# Show the stored dependency and POS results for every parsed sentence.
for sentence_no in range(1, len(dep) + 1):
    key = sentence_no - 1  # results are stored under 0-based keys
    print("Dep in sentences {}: {}\n".format(sentence_no, dep[key]))
    print("Pos in sentences {}: {}\n".format(sentence_no, pos[key]))

Dep in sentences 1: [('ROOT', 0, 2), ('nsubj', 2, 1), ('dobj', 2, 3), ('punct', 2, 4)]

Pos in sentences 1: [('I', 'PRP'), ('love', 'VBP'), ('dogs', 'NNS'), ('.', '.')]

Dep in sentences 2: [('ROOT', 0, 2), ('nsubj', 2, 1), ('dobj', 2, 3), ('punct', 2, 4)]

Pos in sentences 2: [('I', 'PRP'), ('hate', 'VBP'), ('puppies', 'NNS'), ('.', '.')]



In [4]:
# Pair two words with their GloVe similarity score.
def similarity(w1, w2):
    """Return ``[w1, w2, score]`` with ``score = glove_model.similarity(w1, w2)``.

    Relies on the module-level ``glove_model`` having been created beforehand.
    """
    return [w1, w2, glove_model.similarity(w1, w2)]

In [11]:
# Spot-check word similarity on a few hand-picked pairs.
word_pairs = [("love", "hate"), ("love", "have"), ("hate", "have"), ("dogs", "puppies")]
list_test = [similarity(first, second) for first, second in word_pairs]
for first, second, score in list_test:
    print("Similarity between '{}' and '{}' is {}".format(first, second, score))

Similarity between 'love' and 'hate' is 0.6393098823113967
Similarity between 'love' and 'have' is 0.5241775846462622
Similarity between 'hate' and 'have' is 0.4914875401567831
Similarity between 'dogs' and 'puppies' is 0.7978993201086827


In [None]:
# This part is for the CGT project: group the unique (word, POS) pairs by word class.
# `pos` is a dict of per-sentence (word, tag) lists, so flatten it into one list of
# pairs first; iterating the dict directly would yield its sentence keys instead.
all_pos = [pair for sentence_pos in pos.values() for pair in sentence_pos]

tags_noun = ['NN', 'NNP', 'NNS', 'NNPS']
data_noun = find_words_pos(all_pos, tags_noun)
tags_verb = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
data_verb = find_words_pos(all_pos, tags_verb)
tags_JJRB = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
data_JJRB = find_words_pos(all_pos, tags_JJRB)

print("Noun: \n {}".format(data_noun))
print("Verb: \n {}".format(data_verb))
print("Adjective & adverb: \n {}".format(data_JJRB))