Reads stories from the validation file. Each story has four context sentences followed by two candidate endings (the 5th and 6th sentences). The label (1 or 2) indicates which of the two candidates is the correct ending.

In [None]:
import numpy as np
import pandas as pd
# Load the validation CSV and preview its first rows.
df = pd.read_csv('data/data_val.csv')
df.head()

The validation (val) set contains both a true and a fake ending for every story; the ROC set contains only true stories.

In [509]:
import nltk

def read_data(file, data_type = 'val'):
    """Read a story CSV and return lower-cased, tokenized stories with labels.

    Columns are addressed by position (0-based), not by header name.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``.
    data_type : {'val', 'roc'}
        'val' -- columns 1-4 are the context sentences, columns 5-6 the two
                 candidate endings and column 7 the label.
        'roc' -- columns 2-5 are the context sentences and column 6 the single
                 ending; every story is labelled +1.

    Returns
    -------
    stories : list
        'val': one dict per row, ``{'ctx': [tokens, ...], 'ends': [tokens, tokens]}``.
        'roc': one 1-tuple per row wrapping ``{'ctx': [tokens, ...], 'ends': tokens}``.
        NOTE(review): the 'roc' shape (tuple wrapper, single token list under
        'ends') differs from what ``get_features`` indexes; it is kept as-is for
        backward compatibility.
    labels : np.ndarray
        One label per story, in row order.

    Raises
    ------
    ValueError
        If ``data_type`` is neither 'val' nor 'roc' (previously this silently
        produced empty results).
    """
    if data_type not in ('roc', 'val'):
        raise ValueError(f"data_type must be 'roc' or 'val', got {data_type!r}")

    data = pd.read_csv(file)
    stories = []; labels = []
    for _, row in data.iterrows():
        # .iloc makes the positional intent explicit: integer keys on a
        # string-labelled Series are deprecated as positional lookups in pandas.
        if data_type == 'roc':
            story = {
                'ctx': [nltk.word_tokenize(sentence.lower()) for sentence in row.iloc[2:6]],
                'ends': nltk.word_tokenize(row.iloc[6].lower()),
            }
            stories.append((story,))
            labels.append(+1)
        else:
            story = {
                'ctx': [nltk.word_tokenize(sentence.lower()) for sentence in row.iloc[1:5]],
                'ends': [nltk.word_tokenize(sentence.lower()) for sentence in row.iloc[5:7]],
            }
            stories.append(story)
            labels.append(row.iloc[7])
    return stories, np.array(labels)

In [510]:
# Tokenize the validation stories (default data_type='val') and collect their labels.
stories, labels = read_data('data/data_val.csv')

Load the pre-trained embeddings

In [3]:
from gensim import models
# Load the word2vec-format binary vectors; `model` is what w2v() indexes for token lookups.
model = models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

In [249]:
import numpy as np
# Vector length used for UKN and for the zero vector centroid() returns on empty input
# (presumably matches the 300-d GoogleNews vectors -- confirm against the loaded model).
dim_embedding = 300

# Seed the global NumPy RNG right before the draw so UKN is the same on every run.
np.random.seed(10)
# Fallback vector that w2v() returns when a token lookup in `model` fails.
UKN = np.random.uniform(low=-0.25, high=0.25, size=dim_embedding)

def w2v(token):
    """Return the embedding of ``token`` from ``model``, or ``UKN`` if it is missing.

    Parameters
    ----------
    token : key to look up in the module-level ``model``.

    Returns
    -------
    The vector stored under ``token``, or the shared fallback vector ``UKN``
    when the lookup raises ``KeyError``.
    """
    try:
        return model[token]
    except KeyError:
        # Only a missing key means "unknown word". Anything else (interrupts,
        # a broken or unloaded model, ...) must surface instead of being
        # silently turned into the fallback vector.
        return UKN

def centroid(tokens):
    """Average the ``w2v`` vectors of ``tokens`` element-wise.

    An empty ``tokens`` yields a zero vector of length ``dim_embedding``.
    """
    if len(tokens) != 0:
        vectors = [w2v(tok) for tok in tokens]
        return np.mean(vectors, axis = 0)
    return np.zeros(shape = [dim_embedding,])
    
def cosine(vec1, vec2):
    """Cosine similarity of two NumPy vectors; 0 if either has zero norm."""
    sq_norm_product = np.sum(vec1**2)*np.sum(vec2**2)
    # Guard first: a zero-length vector would otherwise divide by zero.
    if sq_norm_product == 0:
        return 0
    dot = np.sum(vec1*vec2)
    return dot/np.sqrt(sq_norm_product)

In [245]:
# TODO: remove dots, dashes and punctuation from the tokens (not implemented yet)

Define features as in the paper

Centroid feature: concatenates the centroid of the context with the centroid of the ending

In [413]:
def centroid_feature(story):
    """Concatenate the context centroid with the ending centroid.

    The context centroid is the mean of the per-sentence centroids of
    ``story['ctx']``; the ending centroid is ``centroid(story['end'])``.
    """
    sentence_centroids = [centroid(tokens) for tokens in story['ctx']]
    context_vec = np.mean(sentence_centroids, axis = 0)
    ending_vec = centroid(story['end'])
    return np.concatenate([context_vec, ending_vec])

Average similarity: computes the cosine similarity between the centroid vector of the ending and the vector of each (unique) word in the context. The scores are sorted in descending order, and the ones selected by top_N are returned as the feature.

In [412]:
def average_sim(story, top_N = [1,2,3,5]):
    """Similarity of the ending centroid to the best-matching context words.

    Every unique context word is scored by its cosine similarity to the
    centroid of the unique ending words; the scores are ranked from highest to
    lowest and the ones at the requested ranks are returned.

    Parameters
    ----------
    story : dict with 'ctx' (list of token lists) and 'end' (token list).
    top_N : sequence of int
        1-based ranks to report (1 = highest score).

    Returns
    -------
    np.ndarray with one score per entry of ``top_N``. A rank larger than the
    number of unique context words yields 0.0, matching the "no evidence"
    value ``cosine`` uses for zero vectors.

    Raises
    ------
    ValueError
        If a rank in ``top_N`` is smaller than 1.
    """
    # sorted() keeps the iteration order independent of string-hash randomisation.
    context_words = sorted({token for sentence in story['ctx'] for token in sentence})
    # The ending centroid does not depend on the context word: compute it once.
    end_centroid = centroid(sorted(set(story['end'])))
    ranked = sorted((cosine(w2v(word), end_centroid) for word in context_words), reverse = True)

    scores = []
    for rank in top_N:
        if rank < 1:
            raise ValueError(f'top_N ranks are 1-based, got {rank}')
        # rank is 1-based, list positions are 0-based; pad missing ranks with 0.0
        # instead of raising IndexError on short contexts.
        scores.append(ranked[rank - 1] if rank <= len(ranked) else 0.0)
    return np.asarray(scores)

Max similarity: for each word in the context, chooses the most similar word from the ending and takes the average of all best word pair similarities

In [411]:
def max_sim(story):
    """Average, over unique context words, of the best cosine match in the ending.

    Parameters
    ----------
    story : dict with 'ctx' (list of token lists) and 'end' (token list).

    Returns
    -------
    A one-element list holding the mean of the per-word best similarities.
    ``[0.0]`` is returned when the context or the ending has no tokens
    (previously an empty ending raised and an empty context produced NaN).
    """
    # sorted() makes the summation order, and thus the float result, reproducible.
    context_words = sorted({token for sentence in story['ctx'] for token in sentence})
    # Ending vectors are the same for every context word: look them up once.
    end_vectors = [w2v(token_end) for token_end in story['end']]
    if not context_words or not end_vectors:
        return [0.0]

    best_matches = [
        np.max([cosine(w2v(token), end_vec) for end_vec in end_vectors])
        for token in context_words
    ]
    return [np.mean(best_matches)]

POS similarity: for every pair of POS tags (POS1, POS2), computes the cosine similarity between the centroid of all context words tagged POS1 and the centroid of all ending words tagged POS2.

In [414]:
def pos_sim(story, POS = ['VBZ','VBN','VBP','VBG','VBD','VB','RBS','RBR','RB','POS','NN','NNS','JJS','JJR','JJ']):
    """Cosine similarity between POS-specific centroids of context and ending.

    Parameters
    ----------
    story : dict with 'ctx' (list of token lists) and 'end' (token list).
    POS : sequence of POS tags, compared against the tags ``nltk.pos_tag`` returns.

    Returns
    -------
    np.ndarray of length ``len(POS)**2``. Entry ``i*len(POS) + j`` is the
    cosine between the centroid of the context tokens tagged ``POS[i]`` and the
    centroid of the ending tokens tagged ``POS[j]``.
    """
    # The context is tagged as one flat token sequence.
    tagged_context = nltk.pos_tag([token for sentence in story['ctx'] for token in sentence])
    tagged_end = nltk.pos_tag(story['end'])

    def centroids_by_tag(tagged):
        # One centroid per requested tag, in POS order.
        return [centroid([token for token, tag in tagged if tag == wanted]) for wanted in POS]

    # Each centroid depends on a single tag, so build len(POS) of them per side
    # once instead of recomputing both inside the len(POS)**2 pair loop.
    context_centroids = centroids_by_tag(tagged_context)
    end_centroids = centroids_by_tag(tagged_end)

    return np.array([cosine(ctr_context, ctr_end)
                     for ctr_context in context_centroids
                     for ctr_end in end_centroids])

In [590]:
def get_features(stories):
    """Build the feature array for every candidate ending of every story.

    For each item in ``stories`` and each ending in ``item['ends']``, the
    outputs of ``centroid_feature``, ``average_sim``, ``max_sim`` and
    ``pos_sim`` are concatenated into one vector. Vectors are stacked per
    story, and the per-story arrays are stacked along a new leading axis.
    """
    extractors = (centroid_feature, average_sim, max_sim, pos_sim)

    def featurize(context, ending):
        # Each extractor sees the same single-ending view of the story.
        candidate = {'ctx': context, 'end': ending}
        return np.concatenate([extract(candidate) for extract in extractors])

    per_story = []
    for item in stories:
        candidate_rows = [featurize(item['ctx'], ending) for ending in item['ends']]
        per_story.append(np.stack(candidate_rows))
    return np.stack(per_story)

def get_y(labels):
    """Turn each label into a two-slot indicator row: 1 -> [1, 0], anything else -> [0, 1]."""
    rows = [np.array([1,0]) if label == 1 else np.array([0,1]) for label in labels]
    return np.stack(rows)

In [506]:
def predict_labels(clf, X):
    """Pick the predicted ending (1 or 2) for every story in ``X``.

    For each element ``x`` of ``X`` the classifier scores all candidate rows
    at once; column 1 of ``clf.predict_proba(x)`` is compared across rows.
    The label is 1 when the first row has the highest score and 2 otherwise.
    """
    def pick(candidates):
        scores = clf.predict_proba(candidates)[:,1]
        return 1 if np.argmax(scores) == 0 else 2

    return np.array([pick(x) for x in X])

In [592]:
# Training inputs: features for the stories read earlier from the validation file,
# plus one indicator row per story derived from their labels.
X = get_features(stories)
y = get_y(labels)
# Evaluation inputs: the test file goes through the same reader (default data_type='val')
# and the same feature extraction.
stories_tst, labels_tst = read_data('data/data_test.csv')
Xts = get_features(stories_tst)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score

# Train on individual (context, ending) candidates: the reshape flattens X to
# 2 * n_stories rows (two endings per story) and y.reshape(-1) yields the matching
# 0/1 flag for each row in the same order.
lg = LogisticRegression(C=2.5)
lg.fit(X.reshape([2*X.shape[0],-1]), y.reshape(-1))
# Predict an ending (1 or 2) per test story and compare with the test labels.
labels_pred = predict_labels(lg, Xts)
print(f'Accuracy = {accuracy_score(labels_tst, labels_pred)}')

NameError: name 'X' is not defined