In [1]:
import pandas as pd

import numpy as np

import itertools

from nltk.tokenize import RegexpTokenizer

from nltk.stem.porter import *

import pickle

from scipy.optimize import linear_sum_assignment

In [2]:
# Porter stemmer instance; its .stem() is applied to every claim and headline token below
stemmer = PorterStemmer()

In [3]:
# read input data: the Emergent "clean-train-fold-9" CSV, loaded into a DataFrame
# (the "claimHeadline" and "articleHeadline" columns are the ones used downstream)
df = pd.read_csv("../data/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-train-fold-9.csv")

In [4]:
# separate into claims and headlines:
# take both text columns as one two-column array, then slice it into two
# single-column arrays (first column -> claims, second column -> headlines)
claim_headline_values = df[["claimHeadline", "articleHeadline"]].values
claims = claim_headline_values[:, :1]
headlines = claim_headline_values[:, 1:]

In [5]:
# create a tokenizer
# the \w+ pattern makes every run of word characters a token, so punctuation
# and whitespace never end up in the token lists
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
# lower case and tokenize claims and headlines
def _lowercase_and_tokenize(texts):
    """Return one token list per string in the array `texts`, lower-casing each string first."""
    return [tokenizer.tokenize(text.lower()) for text in texts.flatten()]

claims_tok = _lowercase_and_tokenize(claims)
headlines_tok = _lowercase_and_tokenize(headlines)

In [7]:
# stem claims: one list of stems per claim, aligned position-by-position
# with the unstemmed tokens in claims_tok
claims_tok_stemmed = [
    [stemmer.stem(token) for token in claim_tokens]
    for claim_tokens in claims_tok
]

In [8]:
# stem headlines: one list of stems per headline, aligned position-by-position
# with the unstemmed tokens in headlines_tok
headlines_tok_stemmed = [
    [stemmer.stem(token) for token in headline_tokens]
    for headline_tokens in headlines_tok
]

In [9]:
# get all pairs of claim and headline tokens:
# for each sample, the cartesian product (claim token, headline token),
# ordered with the claim token varying slowest
pairs_tok = [
    list(itertools.product(claim_tokens, headline_tokens))
    for claim_tokens, headline_tokens in zip(claims_tok, headlines_tok)
]

In [10]:
# get all pairs of claim and headline stemmed tokens:
# for each sample, the cartesian product (claim stem, headline stem),
# ordered with the claim stem varying slowest
pairs_tok_stemmed = [
    list(itertools.product(claim_stems, headline_stems))
    for claim_stems, headline_stems in zip(claims_tok_stemmed, headlines_tok_stemmed)
]

In [11]:
# load the ppdb data
# presumably a mapping token -> {paraphrase token -> (score, ...)}, judging by how
# ppdb_func indexes it -- TODO confirm against the script that wrote the pickle
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source
with open("../data/ppdb-2.0-xl-lexical.pkl", "rb") as f:
    ppdb_dict = pickle.load(f)

In [12]:
def ppdb_func(stem_pair, token_pair, ppdb_dict, max_score=10, min_score=-10):
    """Score one (claim token, headline token) pair with the ppdb lookup.

    stem_pair:  the two tokens in stemmed form, (claim, headline)
    token_pair: the same two tokens, unstemmed, (claim, headline)
    ppdb_dict:  lookup whose .get(claim_token) yields the paraphrase entries
                stored for that token
    max_score:  returned when both stems are identical
    min_score:  returned when the lookup has no entry for the pair

    Otherwise the result is the first element of the entry stored for the
    headline token under the claim token.
    """
    # paraphrase entries recorded for the claim token (False when absent)
    candidates = ppdb_dict.get(token_pair[0], False)

    # identical stems are a perfect match, whatever the lookup contains
    if stem_pair[0] == stem_pair[1]:
        return max_score

    # nothing recorded for the claim token, or the headline token is not
    # among its paraphrases
    if not candidates or token_pair[1] not in candidates:
        return min_score

    return candidates[token_pair[1]][0]

In [14]:
num_samples = len(pairs_tok)
alignment_feature = []

# iterate over all samples
for i in range(num_samples):
    # number of claim and headline tokens in the current sample
    num_claim_toks = len(claims_tok[i])
    num_headline_toks = len(headlines_tok[i])

    # norm: min(length of claim, length of headline)
    norm = min(num_claim_toks, num_headline_toks)
    if norm == 0:
        # an empty claim or headline produces no token pairs: the score matrix
        # would be an empty 1-D array, which linear_sum_assignment rejects,
        # and the normalisation below would divide by zero
        # NOTE(review): 0.0 is used as a neutral "nothing to align" value --
        # confirm this is what the downstream model should see
        alignment_feature.append(0.0)
        continue

    # ppdb score of every claim-headline token pair of the sample, in the
    # itertools.product order of pairs_tok (all headline tokens for the first
    # claim token, then all of them for the second claim token, ...)
    pair_scores = [
        ppdb_func(stem_pair, token_pair, ppdb_dict)
        for stem_pair, token_pair in zip(pairs_tok_stemmed[i], pairs_tok[i])
    ]

    # matrix of ppdb scores of each claim-headline token pair
    # each row represents a claim token
    # each column represents a headline token
    # score_matrix[n][m]: ppdb score for nth claim token and mth headline token of the ith sample
    score_matrix = np.array(pair_scores).reshape(num_claim_toks, num_headline_toks)

    # compute the optimal assignment of pairs,
    # using the hungarian algorithm
    # here, use the negative of the score matrix because we want to maximize scores
    # (hungarian algorithm, by default tries to minimize the cost matrix)
    row_ind, col_ind = linear_sum_assignment(-score_matrix)

    # use the indices returned from the hungarian algorithm,
    # to get optimal assignment and sum to get score of max 1-1 alignment
    alignment_feature.append(score_matrix[row_ind, col_ind].sum()/norm)

    # TODO: neg feature from the stanfordnlp dependency parse
    # input(np.array(claims_tok[i])[row_ind])
    # input(np.array(headlines_tok[i])[col_ind])