In [1]:
import itertools
import os
import pickle
import re

import numpy as np
import pandas as pd
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
from scipy.optimize import linear_sum_assignment
from stanfordnlp.server import CoreNLPClient

In [2]:
from nltk import word_tokenize

In [3]:
from stanfordnlp.server import CoreNLPClient

In [4]:
# Porter stemmer instance used to stem every claim and headline token.
stemmer = PorterStemmer()

In [5]:
# read input data (the columns used in this notebook are
# "claimHeadline", "articleHeadline" and "articleId")
raw_data_path = "../../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean.csv"
df = pd.read_csv(raw_data_path)

In [6]:
# separate into claims and headlines:
# each result is a single-column 2-D array with one row per sample
headline_matrix = df[["claimHeadline", "articleHeadline"]].values
claims = headline_matrix[:, :1]
headlines = headline_matrix[:, 1:]

In [7]:
# lower case and remove full stops
# A run of ".", "?" or "!" is dropped when it ends the text or is followed by
# a space. The lookahead keeps that space in place: consuming it (as a plain
# `[\.\?\!]+\ ` pattern does) glues the neighbouring words together
# ("killed. police" -> "killedpolice") and corrupts the later tokenization.
terminal_punct = re.compile(r"[\.\?\!]+(?=\ |$)")
claims = [terminal_punct.sub("", claim.lower()) for claim in claims.flatten().tolist()]
headlines = [terminal_punct.sub("", headline.lower()) for headline in headlines.flatten().tolist()]

In [8]:
# distinct claims, lower-cased, with trailing ".", "?" or "!" characters removed
trailing_punct = re.compile(r"[\.\?\!]+$")
claims_unique = [
    trailing_punct.sub("", unique_claim.lower())
    for unique_claim in df["claimHeadline"].unique().tolist()
]


In [9]:
# tokenize claims and headlines (one token list per sample)
claims_tok = list(map(word_tokenize, claims))
headlines_tok = list(map(word_tokenize, headlines))

In [10]:
# stem claims: same nested layout as claims_tok, every token replaced by its stem
claims_tok_stemmed = [
    [stemmer.stem(token) for token in claim_tokens]
    for claim_tokens in claims_tok
]

In [11]:
# stem headlines: same nested layout as headlines_tok, every token replaced by its stem
headlines_tok_stemmed = [
    [stemmer.stem(token) for token in headline_tokens]
    for headline_tokens in headlines_tok
]

In [12]:
# get all pairs of claim and headline tokens
# (cartesian product per sample: claim token varies slowest, headline token fastest)
pairs_tok = [
    list(itertools.product(claim_tokens, headline_tokens))
    for claim_tokens, headline_tokens in zip(claims_tok, headlines_tok)
]

In [13]:
# get all pairs of claim and headline stemmed tokens
# (cartesian product per sample: claim stem varies slowest, headline stem fastest)
pairs_tok_stemmed = [
    list(itertools.product(claim_stems, headline_stems))
    for claim_stems, headline_stems in zip(claims_tok_stemmed, headlines_tok_stemmed)
]

In [14]:
# load the ppdb data
# ppdb_func reads it as ppdb_dict[first_token][second_token][0] -> score.
# NOTE: pickle.load can execute arbitrary code while loading; only load a
# file that was produced by a trusted step of this project.
with open("../../data/processed/ppdb/ppdb-small-all.pkl", "rb") as ppdb_file:
    ppdb_dict = pickle.load(ppdb_file)

In [15]:
def ppdb_func(stem_pair, token_pair, ppdb_dict, max_score=10, min_score=-10):
    """Score one (claim token, headline token) pair.

    Parameters
    ----------
    stem_pair : sequence
        Stemmed claim token at index 0, stemmed headline token at index 1.
    token_pair : sequence
        Unstemmed claim token at index 0, unstemmed headline token at index 1.
    ppdb_dict : dict
        Maps a token to a mapping of its paraphrases; the first element of
        each paraphrase entry is used as the score.
    max_score : number
        Returned when both stems are equal.
    min_score : number
        Returned when the stems differ and no paraphrase entry links the tokens.

    Returns
    -------
    ``max_score``, the stored paraphrase score, or ``min_score``.
    """
    paraphrases = ppdb_dict.get(token_pair[0], False)

    # identical stems always win, whatever the paraphrase table says
    if stem_pair[0] == stem_pair[1]:
        return max_score

    # no usable paraphrase entry for this pair
    if not paraphrases or token_pair[1] not in paraphrases:
        return min_score

    return paraphrases[token_pair[1]][0]

In [16]:
# Score recorded for a sample whose claim or headline has no tokens at all.
# There is nothing to align in that case (and the normaliser would be 0), so
# the sample gets the same value ppdb_func uses by default for an unrelated
# token pair. NOTE(review): this fallback value is a choice; adjust if a
# different convention is wanted downstream.
EMPTY_ALIGNMENT_SCORE = -10.0

num_samples = len(pairs_tok)
alignment_feature = []  # normalised score of the best 1-1 alignment, per sample
pwC = []                # aligned claim tokens, per sample
pwH = []                # aligned headline tokens, per sample
iC = []                 # indices (into claims_tok[i]) of the aligned claim tokens
iH = []                 # indices (into headlines_tok[i]) of the aligned headline tokens

# iterate over all samples
for i in range(num_samples):
    claim_tokens = np.array(claims_tok[i])
    headline_tokens = np.array(headlines_tok[i])
    n_claim, n_headline = len(claims_tok[i]), len(headlines_tok[i])

    if n_claim == 0 or n_headline == 0:
        # No token pairs exist: an empty score matrix would make
        # linear_sum_assignment raise and the normaliser would be zero.
        row_ind = col_ind = np.array([], dtype=int)
        alignment_feature.append(EMPTY_ALIGNMENT_SCORE)
    else:
        # ppdb score of every (claim token, headline token) pair of the sample;
        # the pairs are ordered claim-major, so reshaping gives
        # score_matrix[n][m] = score of the nth claim token vs the mth headline token
        scores = [
            ppdb_func(stem_pair, token_pair, ppdb_dict)
            for stem_pair, token_pair in zip(pairs_tok_stemmed[i], pairs_tok[i])
        ]
        score_matrix = np.array(scores).reshape(n_claim, n_headline)

        # optimal 1-1 assignment via the hungarian algorithm; it minimises
        # cost, so negate the scores in order to maximise them
        row_ind, col_ind = linear_sum_assignment(-score_matrix)

        # norm: min(length of claim, length of headline)
        norm = min(n_claim, n_headline)
        alignment_feature.append(score_matrix[row_ind, col_ind].sum() / norm)

    # keep the aligned tokens and their indices; the indices are used later
    # for the negation feature built from the dependency parses
    pwC.append(claim_tokens[row_ind])
    pwH.append(headline_tokens[col_ind])
    iC.append(row_ind)
    iH.append(col_ind)
    

In [17]:
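# CoreNLPClient needs to know where CoreNLP is installed (CORENLP_HOME environment variable).
# If it is not already set in the shell, uncomment and adapt the line below (example for a local Windows install):
# os.environ["CORENLP_HOME"] = r'D:\\Learning Material\\IR\\stanford-corenlp-full-2018-10-05\\stanford-corenlp-full-2018-10-05'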

In [18]:
# Dependency-parse every claim with CoreNLP and keep the basic-dependency
# edges of its first sentence (one entry per claim, in the order of `claims`).
edgelistC = []

claim_parser_options = dict(
    annotators=['tokenize', 'ssplit', 'pos', 'depparse'],
    timeout=60000,
    memory='16G',
)
with CoreNLPClient(**claim_parser_options) as client:
    for claim in claims:
        first_sentence = client.annotate(claim).sentence[0]
        edgelistC.append(first_sentence.basicDependencies.edge)

Starting server with command: java -Xmx16G -cp /Users/nikilsaldanaha/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-b89c3de0f9f245db.props -preload tokenize,ssplit,pos,depparse


In [19]:
# Dependency-parse every headline with CoreNLP and keep the basic-dependency
# edges of its first sentence (one entry per headline, in the order of `headlines`).
edgelistH = []

headline_parser_options = dict(
    annotators=['tokenize', 'ssplit', 'pos', 'depparse'],
    timeout=60000,
    memory='16G',
)
with CoreNLPClient(**headline_parser_options) as client:
    for headline in headlines:
        first_sentence = client.annotate(headline).sentence[0]
        edgelistH.append(first_sentence.basicDependencies.edge)

Starting server with command: java -Xmx16G -cp /Users/nikilsaldanaha/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-8f47016e10da4c1d.props -preload tokenize,ssplit,pos,depparse


In [20]:
def _negated_token_indices(edges):
    """Return the 0-based indices of the tokens that govern a "neg" dependency.

    CoreNLP numbers the tokens of a sentence starting at 1 in the `source`
    field of a dependency edge, whereas the alignment indices in iC / iH start
    at 0, so the edge index is shifted down by one here.
    """
    return {edge.source - 1 for edge in edges if edge.dep == "neg"}


# Negation feature, per sample: 1 when at least one aligned (claim token,
# headline token) pair is negated on exactly one side, otherwise 0.
# NOTE(review): the alignment indices come from NLTK's word_tokenize while the
# edge indices come from CoreNLP's own tokenizer; the comparison assumes both
# split the text into the same tokens - confirm.
neg_features = []
for claim_edges, headline_edges, claim_indices, headline_indices in zip(edgelistC, edgelistH, iC, iH):
    negated_in_claim = _negated_token_indices(claim_edges)
    negated_in_headline = _negated_token_indices(headline_edges)

    neg_feature = 0
    for c, h in zip(claim_indices, headline_indices):
        claim_negated = int(c) in negated_in_claim
        headline_negated = int(h) in negated_in_headline
        # a mismatch means one side negates the aligned word and the other does not
        if claim_negated != headline_negated:
            neg_feature = 1
            break
    neg_features.append(neg_feature)

In [21]:
# negation feature table: article id next to its negation flag
neg_alignment_feature_df = pd.DataFrame(
    dict(articleId=df["articleId"].tolist(), negAlignmentScore=neg_features)
)

In [22]:
# alignment feature table: article id next to its normalised alignment score
alignment_feature_df = pd.DataFrame(
    dict(articleId=df["articleId"].tolist(), alignmentScore=alignment_feature)
)

In [23]:
# write both feature tables to disk, without the DataFrame index column
feature_exports = (
    (alignment_feature_df, "../../data/processed/features/alignment_feature.csv"),
    (neg_alignment_feature_df, "../../data/processed/features/neg_alignment_feature.csv"),
)
for feature_frame, csv_path in feature_exports:
    feature_frame.to_csv(csv_path, index=False)