In [25]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [26]:
from openie import StanfordOpenIE

In [27]:
# import os
# os.environ["CORENLP_HOME"] = r'D:\\Learning Material\\IR\\stanford-corenlp-full-2018-10-05\\stanford-corenlp-full-2018-10-05'

In [28]:
import string
# Translation table that deletes every ASCII punctuation mark except the
# apostrophe, so contractions such as "don't" survive cleaning.
chars = [symbol for symbol in string.punctuation if symbol != "'"]
punc = "".join(chars)
table = str.maketrans(dict.fromkeys(punc))

In [29]:
# Read the cleaned Emergent url-versions table into a DataFrame.
roughDat = pd.read_csv(
    filepath_or_buffer="../../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean.csv"
)

In [30]:
# Discard the columns this notebook never reads.
roughDat = roughDat.drop(
    ['Unnamed: 0', 'claimId', 'articleHeadlineStance'],
    axis="columns",
)

In [31]:
# Row-aligned Python lists of the claim headline and the article headline.
claims, headlines = (
    roughDat[column].tolist() for column in ("claimHeadline", "articleHeadline")
)
# Containers for the per-row OpenIE results (filled by the annotation cells).
claimSVO, headSVO = [], []

In [33]:
# Run OpenIE over every claim headline; one annotate() result per row.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot leave duplicate entries that would silently misalign
# claimSVO with `claims` (zip() downstream would just truncate).
with StanfordOpenIE() as client:
    claimSVO = [client.annotate(claim_text) for claim_text in claims]

Starting server with command: java -Xmx8G -cp /Users/nikilsaldanaha/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-aa80561be52545a8.props -preload openie


In [34]:
# Run OpenIE over every article headline; one annotate() result per row.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell cannot leave duplicate entries that would silently misalign
# headSVO with `headlines` (zip() downstream would just truncate).
with StanfordOpenIE() as client:
    headSVO = [client.annotate(headline_text) for headline_text in headlines]

Starting server with command: java -Xmx8G -cp /Users/nikilsaldanaha/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-c847611fb3924949.props -preload openie


In [35]:
# For each claim, gather the distinct subject / object / relation strings
# across all triples extracted for that claim (an empty set when the claim
# produced no triples).
# assumes each claimSVO entry is a re-iterable sequence of dicts — TODO confirm
claim_subjects = [{triple["subject"] for triple in triples} for triples in claimSVO]
claim_objects = [{triple["object"] for triple in triples} for triples in claimSVO]
claim_relations = [{triple["relation"] for triple in triples} for triples in claimSVO]

In [36]:
# For each article headline, gather the distinct subject / object / relation
# strings across all triples extracted for that headline (an empty set when
# the headline produced no triples).
# assumes each headSVO entry is a re-iterable sequence of dicts — TODO confirm
headline_subjects = [{triple["subject"] for triple in triples} for triples in headSVO]
headline_objects = [{triple["object"] for triple in triples} for triples in headSVO]
headline_relations = [{triple["relation"] for triple in triples} for triples in headSVO]

In [37]:
def _share_a_token(claim_phrases, headline_phrases):
    """Return True when the two phrase collections have a token in common.

    Args:
        claim_phrases: iterable of strings extracted from the claim.
        headline_phrases: iterable of strings extracted from the headline.

    Returns:
        bool: True if at least one lower-cased ``word_tokenize`` token occurs
        on both sides; False otherwise, including when either side is empty.
    """
    claim_tokens = {
        token for phrase in claim_phrases for token in word_tokenize(phrase.lower())
    }
    headline_tokens = {
        token for phrase in headline_phrases for token in word_tokenize(phrase.lower())
    }
    # "Equivalent" means the token sets overlap. Testing for an empty
    # intersection would flag exactly the pairs that have nothing in common.
    return not claim_tokens.isdisjoint(headline_tokens)


# One boolean per claim/headline row, for each SVO slot.
subject_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_subjects, headline_subjects)
]
object_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_objects, headline_objects)
]
relation_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_relations, headline_relations)
]

In [38]:
import pickle

In [39]:
# Load the pickled PPDB lookup used by the entailment cells.
# NOTE: pickle.load can execute arbitrary code — only point this at a file
# produced by a trusted pipeline.
with open("../../data/processed/ppdb/ppdb-small-all.pkl", mode="rb") as ppdb_file:
    ppdb_dict = pickle.load(ppdb_file)

In [40]:
# For every claim/headline row, record the PPDB relation of the first
# (claim-subject token, headline-subject token) pair found in ppdb_dict,
# or "noRelation" when no such pair exists.
subject_entailments = []
for cs, hs in zip(claim_subjects, headline_subjects):
    # One paraphrase table per token of the claim's subject phrases.
    # Sets of strings iterate in an order that can change between interpreter
    # sessions (string hash randomisation), so iterate in sorted order to make
    # the "first match" reproducible from run to run.
    paraphrase_tables = [
        ppdb_dict.get(token, {})
        for phrase in sorted(cs)
        for token in word_tokenize(phrase.lower().translate(table))
    ]
    # Tokenise each headline phrase once, not once per paraphrase table.
    headline_token_lists = [
        word_tokenize(phrase.lower().translate(table)) for phrase in sorted(hs)
    ]
    # Search order: headline phrase -> claim-token table -> headline token.
    # assumes each ppdb_dict entry is indexable with the relation label at
    # position 1 — TODO confirm against the code that builds the pickle
    relation_labels = (
        para[token][1]
        for tokens in headline_token_lists
        for para in paraphrase_tables
        for token in tokens
        if para.get(token, "noRelation") != "noRelation"
    )
    subject_entailments.append(next(relation_labels, "noRelation"))

In [41]:
# For every claim/headline row, record the PPDB relation of the first
# (claim-relation token, headline-relation token) pair found in ppdb_dict,
# or "noRelation" when no such pair exists.
relation_entailments = []
for cr, hr in zip(claim_relations, headline_relations):
    # One paraphrase table per token of the claim's relation phrases.
    # Sets of strings iterate in an order that can change between interpreter
    # sessions (string hash randomisation), so iterate in sorted order to make
    # the "first match" reproducible from run to run.
    paraphrase_tables = [
        ppdb_dict.get(token, {})
        for phrase in sorted(cr)
        for token in word_tokenize(phrase.lower().translate(table))
    ]
    # Tokenise each headline phrase once, not once per paraphrase table.
    headline_token_lists = [
        word_tokenize(phrase.lower().translate(table)) for phrase in sorted(hr)
    ]
    # Search order: headline phrase -> claim-token table -> headline token.
    # assumes each ppdb_dict entry is indexable with the relation label at
    # position 1 — TODO confirm against the code that builds the pickle
    relation_labels = (
        para[token][1]
        for tokens in headline_token_lists
        for para in paraphrase_tables
        for token in tokens
        if para.get(token, "noRelation") != "noRelation"
    )
    relation_entailments.append(next(relation_labels, "noRelation"))
        

In [42]:
# For every claim/headline row, record the PPDB relation of the first
# (claim-object token, headline-object token) pair found in ppdb_dict,
# or "noRelation" when no such pair exists.
object_entailments = []
for co, ho in zip(claim_objects, headline_objects):
    # One paraphrase table per token of the claim's object phrases.
    # Sets of strings iterate in an order that can change between interpreter
    # sessions (string hash randomisation), so iterate in sorted order to make
    # the "first match" reproducible from run to run.
    paraphrase_tables = [
        ppdb_dict.get(token, {})
        for phrase in sorted(co)
        for token in word_tokenize(phrase.lower().translate(table))
    ]
    # Tokenise each headline phrase once, not once per paraphrase table.
    headline_token_lists = [
        word_tokenize(phrase.lower().translate(table)) for phrase in sorted(ho)
    ]
    # Search order: headline phrase -> claim-token table -> headline token.
    # assumes each ppdb_dict entry is indexable with the relation label at
    # position 1 — TODO confirm against the code that builds the pickle
    relation_labels = (
        para[token][1]
        for tokens in headline_token_lists
        for para in paraphrase_tables
        for token in tokens
        if para.get(token, "noRelation") != "noRelation"
    )
    object_entailments.append(next(relation_labels, "noRelation"))
        

In [43]:
# Upgrade subjects whose PPDB lookup was uninformative ("noRelation" or
# "Independent") to "Equivalence" when the row's subject_equivalence flag is set.
# NOTE(review): object_equivalence and relation_equivalence are computed in
# this notebook, but the visible cells never apply a matching promotion to
# objects or relations — confirm whether that is intentional.
for position, (label, flag) in enumerate(zip(subject_entailments, subject_equivalence)):
    if flag is not True:
        continue
    if label in ("noRelation", "Independent"):
        subject_entailments[position] = "Equivalence"

In [44]:
# Integer code for each relation label; "noRelation" deliberately shares
# code 0 with "Independent".
equiv_map = dict(
    Independent=0,
    OtherRelated=1,
    Equivalence=2,
    ForwardEntailment=3,
    ReverseEntailment=4,
    Exclusion=5,
    noRelation=0,
)

In [45]:
# Replace each relation label with its integer code from equiv_map
# (an unknown label raises KeyError).
object_entailments = list(map(equiv_map.__getitem__, object_entailments))
relation_entailments = list(map(equiv_map.__getitem__, relation_entailments))
subject_entailments = list(map(equiv_map.__getitem__, subject_entailments))

In [46]:
import numpy as np

In [47]:
# Every feature list must have exactly one entry per row of roughDat.
assert all(
    len(feature) == len(roughDat["articleId"])
    for feature in (subject_entailments, relation_entailments, object_entailments)
)

In [48]:
# Feature table: articleId plus the three encoded entailment columns,
# positionally aligned with roughDat's rows.
df = roughDat[["articleId"]].assign(
    subject_entailments=subject_entailments,
    object_entailments=object_entailments,
    relation_entailments=relation_entailments,
)

In [49]:
# Write the feature table to disk without the DataFrame index column.
df.to_csv(
    path_or_buf="../../data/processed/features/svo_Lexical.csv",
    index=False,
)