In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize

In [2]:
from openie import StanfordOpenIE

In [3]:
import os

# Export the location of the local Stanford CoreNLP distribution.
# NOTE(review): presumably read by the CoreNLP/OpenIE wrapper -- confirm.
# This is a machine-specific absolute path; change it to match your install.
# The literal is a raw string, so backslashes are already taken verbatim and
# must NOT be doubled (doubling them puts "\\" separators into the variable).
os.environ["CORENLP_HOME"] = r'D:\Learning Material\IR\stanford-corenlp-full-2018-10-05\stanford-corenlp-full-2018-10-05'

In [4]:
import string
# Every ASCII punctuation mark except the apostrophe (kept so that
# contractions such as "don't" survive the stripping step).
punc = string.punctuation.replace("'", "")
chars = list(punc)
# Translation table that deletes each character in `punc`.
table = str.maketrans(dict.fromkeys(punc))

In [5]:
# Read the cleaned Emergent URL-versions CSV into a DataFrame.
roughDat = pd.read_csv(
    filepath_or_buffer="../../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean.csv"
)

In [6]:
# Discard the columns that the cells below never read.
unused_columns = ['Unnamed: 0', 'claimId', 'articleHeadlineStance']
roughDat = roughDat.drop(labels=unused_columns, axis=1)

In [7]:
# Pull the two text columns out as plain Python lists (row order preserved).
claims, headlines = (
    roughDat[column].tolist() for column in ("claimHeadline", "articleHeadline")
)
# Accumulators for the OpenIE output of each claim / headline.
claimSVO, headSVO = [], []

In [8]:
# Run OpenIE over every claim headline, keeping one result per claim in the
# same order as `claims`.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot duplicate entries and break the alignment with the data rows.
with StanfordOpenIE() as client:
    claimSVO = [client.annotate(claim) for claim in claims]

Starting server with command: java -Xmx8G -cp C:\Users\adity\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-58868331b8e547fd.props -preload openie


In [9]:
# Run OpenIE over every article headline, keeping one result per headline in
# the same order as `headlines`.
# The list is rebuilt from scratch (rather than appended to) so that re-running
# this cell cannot duplicate entries and break the alignment with the data rows.
with StanfordOpenIE() as client:
    headSVO = [client.annotate(headline) for headline in headlines]

Starting server with command: java -Xmx8G -cp C:\Users\adity\stanfordnlp_resources\stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-556e19cd1c2949e1.props -preload openie


In [10]:
# For each claim, gather the distinct subject / object / relation strings
# found across all of its extracted triples (one set per claim, same order
# as `claimSVO`).
claim_subjects = [{triple["subject"] for triple in triples} for triples in claimSVO]
claim_objects = [{triple["object"] for triple in triples} for triples in claimSVO]
claim_relations = [{triple["relation"] for triple in triples} for triples in claimSVO]

In [11]:
# For each headline, gather the distinct subject / object / relation strings
# found across all of its extracted triples (one set per headline, same order
# as `headSVO`).
headline_subjects = [{triple["subject"] for triple in triples} for triples in headSVO]
headline_objects = [{triple["object"] for triple in triples} for triples in headSVO]
headline_relations = [{triple["relation"] for triple in triples} for triples in headSVO]

In [12]:
def _share_a_token(claim_phrases, headline_phrases):
    """Return True when the two phrase collections have at least one word in common.

    Each phrase is lower-cased and split on whitespace; the resulting word
    sets are compared. Empty collections never share a word, so the result
    is False when either side has no phrases.
    """
    # Split into words explicitly: passing a bare string to set.update()
    # would add its individual characters instead.
    claim_tokens = {tok for phrase in claim_phrases for tok in phrase.lower().split()}
    headline_tokens = {tok for phrase in headline_phrases for tok in phrase.lower().split()}
    return not claim_tokens.isdisjoint(headline_tokens)


# One boolean per (claim, headline) pair: True when the pair shares at least
# one word in the given slot. A later cell upgrades unmatched pairs to
# "Equivalence" when this flag is True, so True must mean "overlap found".
subject_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_subjects, headline_subjects)
]

object_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_objects, headline_objects)
]

relation_equivalence = [
    _share_a_token(cs, hs) for cs, hs in zip(claim_relations, headline_relations)
]

In [13]:
import pickle

In [14]:
# Deserialize the PPDB lookup table from disk.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# ever point this at a pickle you produced yourself or otherwise trust.
with open("../../data/external/ppdb-small-all.pkl", "rb") as ppdb_file:
    ppdb_dict = pickle.load(ppdb_file)

In [15]:
# For each (claim, headline) pair, record the PPDB entailment label of the
# first headline subject that is listed as a paraphrase of some claim subject,
# or "noRelation" when there is no such paraphrase.
# NOTE(review): assumes ppdb_dict maps phrase -> {paraphrase: record} and that
# record[1] is the entailment label -- confirm against how the pickle was built.
subject_entailments = []
for cs, hs in zip(claim_subjects, headline_subjects):
    # Sorted iteration keeps the "first match" stable between kernel restarts
    # (iteration order of a set of strings depends on hash randomisation).
    paraphrase_tables = [
        ppdb_dict.get(subject.lower().translate(table), {})
        for subject in sorted(cs)
    ]
    # Headline phrases get the same lower-casing / punctuation stripping as
    # the claim phrases; otherwise case or punctuation alone causes a miss.
    headline_keys = [subject.lower().translate(table) for subject in sorted(hs)]
    first_match = next(
        (
            para[key]
            for key in headline_keys
            for para in paraphrase_tables
            if key in para
        ),
        None,
    )
    subject_entailments.append("noRelation" if first_match is None else first_match[1])

In [16]:
# For each (claim, headline) pair, record the PPDB entailment label of the
# first headline relation that is listed as a paraphrase of some claim
# relation, or "noRelation" when there is no such paraphrase.
# NOTE(review): assumes ppdb_dict maps phrase -> {paraphrase: record} and that
# record[1] is the entailment label -- confirm against how the pickle was built.
relation_entailments = []
for cr, hr in zip(claim_relations, headline_relations):
    # Sorted iteration keeps the "first match" stable between kernel restarts
    # (iteration order of a set of strings depends on hash randomisation).
    paraphrase_tables = [
        ppdb_dict.get(relation.lower().translate(table), {})
        for relation in sorted(cr)
    ]
    # Headline phrases get the same lower-casing / punctuation stripping as
    # the claim phrases; otherwise case or punctuation alone causes a miss.
    headline_keys = [relation.lower().translate(table) for relation in sorted(hr)]
    first_match = next(
        (
            para[key]
            for key in headline_keys
            for para in paraphrase_tables
            if key in para
        ),
        None,
    )
    relation_entailments.append("noRelation" if first_match is None else first_match[1])

In [19]:
# For each (claim, headline) pair, record the PPDB entailment label of the
# first headline object that is listed as a paraphrase of some claim object,
# or "noRelation" when there is no such paraphrase.
# NOTE(review): assumes ppdb_dict maps phrase -> {paraphrase: record} and that
# record[1] is the entailment label -- confirm against how the pickle was built.
object_entailments = []
for cr, hr in zip(claim_objects, headline_objects):
    # Sorted iteration keeps the "first match" stable between kernel restarts
    # (iteration order of a set of strings depends on hash randomisation).
    paraphrase_tables = [
        ppdb_dict.get(obj.lower().translate(table), {})
        for obj in sorted(cr)
    ]
    # Headline phrases get the same lower-casing / punctuation stripping as
    # the claim phrases; otherwise case or punctuation alone causes a miss.
    headline_keys = [obj.lower().translate(table) for obj in sorted(hr)]
    first_match = next(
        (
            para[key]
            for key in headline_keys
            for para in paraphrase_tables
            if key in para
        ),
        None,
    )
    object_entailments.append("noRelation" if first_match is None else first_match[1])

In [20]:
# Wherever PPDB produced no informative label ("noRelation" or "Independent")
# and the matching *_equivalence flag is True, overwrite the label in place
# with "Equivalence". The same rule is applied to all three slots.
for labels, flags in (
    (subject_entailments, subject_equivalence),
    (relation_entailments, relation_equivalence),
    (object_entailments, object_equivalence),
):
    for position, (label, flag) in enumerate(zip(labels, flags)):
        if flag is True and label in ("noRelation", "Independent"):
            labels[position] = "Equivalence"

In [21]:
# Integer code for each entailment label; the code is the label's position
# in this tuple.
equiv_map = {
    label: code
    for code, label in enumerate(
        (
            "Independent",
            "OtherRelated",
            "Equivalence",
            "ForwardEntailment",
            "ReverseEntailment",
            "Exclusion",
        )
    )
}
# "noRelation" (no PPDB hit) shares the code of "Independent".
equiv_map["noRelation"] = equiv_map["Independent"]

In [22]:
# Replace each textual label with its integer code from `equiv_map`.
# A label missing from the map raises KeyError.
object_entailments = list(map(equiv_map.__getitem__, object_entailments))
relation_entailments = list(map(equiv_map.__getitem__, relation_entailments))
subject_entailments = list(map(equiv_map.__getitem__, subject_entailments))

In [23]:
import numpy as np

In [24]:
# Sanity check: every feature list must have exactly one entry per data row.
expected_rows = roughDat["articleId"].shape[0]
for feature_values in (subject_entailments, relation_entailments, object_entailments):
    assert len(feature_values) == expected_rows

In [25]:
# Assemble the output table: the article id followed by the three encoded
# feature columns, in this column order.
df = pd.DataFrame(
    dict(
        articleId=roughDat["articleId"],
        subject_entailments=subject_entailments,
        object_entailments=object_entailments,
        relation_entailments=relation_entailments,
    )
)

In [30]:
# Write the feature table to disk as CSV, leaving out the DataFrame index.
df.to_csv(
    path_or_buf="../../data/processed/features/svo_ppdb_s_all.csv",
    index=False,
)