##TextGraphs

###Preprocessing

In [None]:
# Setup
from pathlib import Path
import spacy 
import pandas as pd
import sys
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import re

# Directory of the WorldTree corpus inside the cloned shared-task repository.
path_data = "tg2019task/worldtree_corpus_textgraphs2019sharedtask_withgraphvis"
if not Path(path_data).exists():
    # The `!` lines below are IPython shell escapes, so this cell only runs
    # inside a notebook. They are skipped once the corpus directory exists.
    # Download data
    !git clone -q https://github.com/umanlp/tg2019task.git
    !cd tg2019task/ && make dataset
    # Run baseline tfidf (expected MAP: 0.054)
    # The baseline only submits the top n=10 sentences per question by default
    # Increasing n to 5000 results in MAP >= 0.24 (cut short due to slow processing)
    !cd {path_data} && python ../baseline_tfidf.py annotation/expl-tablestore-export-2017-08-25-230344/tables questions/ARC-Elementary+EXPL-Dev.tsv > predict.txt
    !cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Dev.tsv predict.txt

# Make the cloned repository importable so its baseline helper can be reused.
sys.path.append("tg2019task")
from baseline_tfidf import read_explanations

In [None]:
# Embedding and Spacy tokenizer
def embed_texts(texts):
    """Embed strings with the Universal Sentence Encoder (large, v3) from TF Hub.

    Written against TF1-style APIs (``hub.Module``, ``tf.Session``). The hub
    module is instantiated and a new session is opened on every call.

    Args:
        texts: Sequence of strings; converted with ``np.asarray`` before use.

    Returns:
        The value of ``model.predict(texts, ...)`` -- presumably a 2-D float
        array with one row per input text (the Annoy index elsewhere in this
        file is built with dimension 512; TODO confirm).
    """
    # Wrap with Keras model for convenient batching and progress bar
    # Adapted from "Keras + Universal Sentence Encoder = Transfer Learning for text data"
    texts = np.asarray(texts)
    # Reset Keras global state left over from a previous call.
    tf.keras.backend.clear_session()
    
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
    embed = hub.Module(module_url)
    
    def UniversalEmbedding(x):
        # The Keras input has shape (batch, 1); squeeze it before calling the
        # hub module's "default" signature and pick its "default" output.
        return embed(tf.squeeze(tf.cast(x, tf.string)), 
            signature="default", as_dict=True)["default"]
    
    inp = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
    out = tf.keras.layers.Lambda(UniversalEmbedding)(inp)
    model = tf.keras.Model(inp, out)
    
    with tf.Session() as sess:
        # Register the session with Keras, then run the variable and table
        # initializers before predicting.
        tf.keras.backend.set_session(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
    
        embeds = model.predict(texts, batch_size=128, verbose=True)
        
    tf.keras.backend.clear_session()  # avoid session closed error
    return embeds

def test_embed_texts():
    """Smoke test: embed two short strings and print the result."""
    sample_embeddings = embed_texts(["hello", "bye"])
    print(sample_embeddings)
    
test_embed_texts()
    
def preprocess_texts(texts, remove_stop=True, remove_punct=True):
    """Tokenize and lemmatize texts with spaCy's ``en_core_web_sm`` model.

    Args:
        texts: Iterable of strings passed to ``nlp.pipe``.
        remove_stop: Drop tokens flagged ``is_stop`` when true.
        remove_punct: Drop tokens flagged ``is_punct`` when true.

    Returns:
        ``(tokens, lemmas)``: two parallel lists holding, for each text, the
        surface forms and the lemmas of the tokens that were kept.
    """
    nlp = spacy.load("en_core_web_sm")

    def is_kept(token):
        # A token survives unless one of the enabled filters matches it.
        if remove_stop and token.is_stop:
            return False
        if remove_punct and token.is_punct:
            return False
        return True

    tokens, lemmas = [], []
    for doc in nlp.pipe(texts, disable=["ner", "tagger", "parser"]):
        kept = [token for token in doc if is_kept(token)]
        tokens.append([token.text for token in kept])
        lemmas.append([token.lemma_ for token in kept])

    return tokens, lemmas

def test_preprocess_texts():
    """Smoke test: print tokens and lemmas for two sample sentences."""
    samples = ["Which of these will most likely increase?", "Habitats support animals."]
    print(preprocess_texts(samples))
    
test_preprocess_texts()

In [None]:
# Extract fact sentences
# Directory containing the explanation tables of the tablestore export.
path_tables = Path(path_data).joinpath("annotation/expl-tablestore-export-2017-08-25-230344/tables")

def get_df_explanations(path_tables):
    """Load every explanation table into one DataFrame with text features.

    Args:
        path_tables: ``Path`` of the directory whose entries are each handed
            to ``read_explanations``.

    Returns:
        DataFrame with columns ``uid``, ``text``, ``tokens``, ``lemmas`` and
        ``embedding``; rows with a repeated ``uid`` are dropped, keeping the
        first occurrence.
    """
    rows = []
    # iterdir() yields entries in arbitrary order, so row order (and which
    # duplicate uid survives) depends on the filesystem.
    for table_path in path_tables.iterdir():
        rows.extend(read_explanations(str(table_path)))

    # 3 duplicate uids
    frame = pd.DataFrame(rows, columns=("uid", "text")).drop_duplicates("uid")

    frame["tokens"], frame["lemmas"] = preprocess_texts(frame.text)
    # One list entry per row so every cell holds a whole embedding vector.
    frame["embedding"] = list(embed_texts(frame.text.tolist()))
    print(frame.shape)
    return frame

# Explanation (fact) table used by the rest of the notebook.
df_exp = get_df_explanations(path_tables)
# Map each explanation uid to its positional row in df_exp (for use with .iloc).
uid2idx = {uid: idx for idx, uid in enumerate(df_exp.uid.values)}
print(list(uid2idx.items())[:3])
df_exp.head()

In [None]:
# Format question examples
# Directory holding the question .tsv files (train / dev / masked test).
path_questions = Path(path_data).joinpath("questions")

def extract_explanation(exp_string):
    """Parse a gold-explanation cell into parallel uid and role lists.

    Args:
        exp_string: Whitespace-separated ``uid|role`` entries. Any non-string
            value (e.g. the NaN pandas produces for an empty cell) is treated
            as "no explanation".

    Returns:
        ``(uids, roles)``: two lists of equal length, in input order.

    Raises:
        ValueError: If an entry does not contain exactly one ``|``.
    """
    # isinstance (not an exact type comparison) so that str subclasses such
    # as numpy.str_ are parsed instead of being silently discarded.
    if not isinstance(exp_string, str):
        return [], []
    uids = []
    roles = []
    for uid_and_role in exp_string.split():
        uid, role = uid_and_role.split("|")
        uids.append(uid)
        roles.append(role)
    return uids, roles

def split_question(q_string):
    """Split a question string on option markers such as "(A)".

    Args:
        q_string: Question text followed by its answer options.

    Returns:
        List whose first item is the text before the first marker and whose
        remaining items are the option texts (surrounding whitespace kept).
        Any single character in parentheses counts as a marker.
    """
    # Raw string: in a plain literal "\(" is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.split(r"\(.\)", q_string)

def test_split_question():
    """Smoke test: print the split of questions with five, four and three options."""
    samples = (
        'Which process? (A) flying (B) talking (C) seeing (D) reproducing (E) something',
        'Which process? (A) flying (B) talking (C) seeing (D) reproducing',
        'Which process? (A) flying (B) talking (C) seeing',
    )
    for sample in samples:
        print(split_question(sample))
    
test_split_question()

def get_questions(path_questions, fname):
    """Load a question .tsv file and attach text features and gold explanations.

    Relies on the module-level ``uid2idx`` mapping to translate explanation
    uids into row positions; uids missing from it are dropped.

    Args:
        path_questions: Directory containing the question files.
        fname: File name of the tab-separated question file.

    Returns:
        The loaded DataFrame with added columns ``q_reformat`` (question stem
        plus the text of the keyed answer option), ``tokens``, ``lemmas``,
        ``embedding``, ``exp_uids``, ``exp_roles`` and ``exp_idxs``.
    """
    df = pd.read_csv(Path(path_questions).joinpath(fname), sep="\t")

    # Reformat question: keep the stem and only the option named by the key.
    letter_to_pos = {letter: pos for pos, letter in enumerate("ABCDE")}
    merged = []
    for raw_question, answer_key in zip(df.Question.values, df["AnswerKey.1"].values):
        stem, *options = split_question(raw_question)
        chosen = options[letter_to_pos[answer_key]]
        merged.append(" ".join([stem.strip(), chosen.strip()]))
    df["q_reformat"] = merged

    # Preprocess texts
    df["tokens"], df["lemmas"] = preprocess_texts(df.q_reformat)
    df["embedding"] = list(embed_texts(df.q_reformat.tolist()))

    # Get explanation uids and roles, keeping only uids known to uid2idx.
    exp_uids, exp_roles, exp_idxs = [], [], []
    for exp_string in df.explanation.values:
        raw_uids, raw_roles = extract_explanation(exp_string)
        assert len(raw_uids) == len(raw_roles)
        known = [
            (uid, role, uid2idx[uid])
            for uid, role in zip(raw_uids, raw_roles)
            if uid in uid2idx
        ]
        exp_uids.append([uid for uid, _, _ in known])
        exp_roles.append([role for _, role, _ in known])
        exp_idxs.append([pos for _, _, pos in known])
    df["exp_uids"] = exp_uids
    df["exp_roles"] = exp_roles
    df["exp_idxs"] = exp_idxs

    print(df.shape)
    return df

# Load the three question splits; each call also tokenizes and embeds the questions.
df_trn = get_questions(path_questions, "ARC-Elementary+EXPL-Train.tsv")
df_dev = get_questions(path_questions, "ARC-Elementary+EXPL-Dev.tsv")
df_test = get_questions(path_questions, "ARC-Elementary+EXPL-Test-Masked.tsv")
# Display a fixed random sample of training rows as the cell output.
df_trn.sample(3, random_state=42)

In [None]:
# Build unique lemma/token data
def flatten(nested_list):
    """Concatenate the items of every inner iterable into one flat list."""
    flat = []
    for inner in nested_list:
        flat.extend(inner)
    return flat

def get_flattened_items(dfs, field):
    """Collect every item of a list-valued column across several DataFrames.

    Args:
        dfs: Iterable of DataFrames that all contain ``field``.
        field: Name of a column whose cells are lists.

    Returns:
        One flat list with the items of all frames, in frame and row order.
        The total count is printed as a side effect.
    """
    all_items = [item for frame in dfs for item in flatten(frame[field])]
    print(len(all_items))
    return all_items

# Every lemma occurrence from the questions (all splits) and the explanations.
all_lemmas = get_flattened_items([df_trn, df_dev, df_test, df_exp], "lemmas")
# all_tokens = get_flattened_items([df_trn, df_dev, df_test, df_exp], "tokens")
    
# Sorted vocabulary: a lemma's position in this list is its node id.
unique_lemmas = sorted(list(set(all_lemmas)))
print(len(unique_lemmas))
lemma2id = {lemma:idx for idx, lemma in enumerate(unique_lemmas)}
# One row per lemma node, embedded as a standalone text.
df_lemma = pd.DataFrame({
    "node": unique_lemmas,
    "embedding": [row for row in embed_texts(unique_lemmas)]
})

# Extract lemma node ids for every sentence
def get_nodes(lemmas):
    """Translate lemmas into node ids via the module-level ``lemma2id`` table.

    Raises:
        KeyError: If a lemma is not present in ``lemma2id``.
    """
    node_ids = []
    for lemma in lemmas:
        node_ids.append(lemma2id[lemma])
    return node_ids

def add_nodes(df):
    """Add a ``nodes`` column (lemma node ids per row) to ``df`` in place."""
    node_ids = df.lemmas.apply(get_nodes)
    df["nodes"] = node_ids

# Attach lemma node ids to every question split and to the explanations (in place).
add_nodes(df_trn)
add_nodes(df_dev)
add_nodes(df_test)
add_nodes(df_exp)

def test_nodes(i=0):
    """Smoke test: print training question ``i`` and the lemma rows of its nodes."""
    question_text = df_trn.q_reformat.iloc[i]
    print(question_text)
    node_ids = df_trn.nodes.iloc[i]
    print(df_lemma.iloc[node_ids])
    
test_nodes()

In [None]:
# Nearest neighbours from embedding
# IPython shell escape: installs the Annoy library inside the notebook runtime.
!pip install -q annoy
from annoy import AnnoyIndex
# First argument is the vector length; presumably matches the embedding size
# produced by embed_texts (TODO confirm). "angular" selects the distance metric.
annoy_index = AnnoyIndex(512, "angular")
# Item ids are the positional row numbers of df_exp, so query results can be
# used directly with df_exp.iloc.
for i in range(len(df_exp)):
    annoy_index.add_item(i, df_exp.embedding.iloc[i])
# Build the index with 10 trees.
annoy_index.build(10)

# Add nearest neighbour explanation ids for each embedding
def add_nn(df, n=100):
    """Add an ``nn_exp`` column to ``df`` in place.

    Each cell holds the ids of the ``n`` explanations that the module-level
    ``annoy_index`` returns for that row's embedding.
    """
    def nearest_explanations(emb):
        return annoy_index.get_nns_by_vector(emb, n)

    df["nn_exp"] = df.embedding.apply(nearest_explanations)
    
# Attach nearest-explanation ids to every question split and to the
# explanations themselves (in place).
add_nn(df_trn)
add_nn(df_dev)
add_nn(df_test)
add_nn(df_exp)

def test_annoy(i=0):
    """Smoke test: print training question ``i`` and its three nearest explanations."""
    question_text = df_trn.q_reformat.iloc[i]
    print(question_text)
    neighbour_rows = df_exp.iloc[df_trn.nn_exp.iloc[i]]
    print(neighbour_rows.text.values[:3])
    
test_annoy()

In [None]:
# Example question: one training row drawn with a fixed seed, shown as cell output.
q = df_trn.sample(1, random_state=42)
q

In [None]:
# Get facts for question: gold explanation rows of the example, selected by position.
e = df_exp.iloc[q.exp_idxs.iloc[0]]
e

In [None]:
# Get lemma token nodes for first sentence: lemma rows for the first gold explanation.
n = df_lemma.iloc[e.nodes.iloc[0]]
n

###Training

####Ranking

In [None]:
"""
Start with question
Get seed, n most relevant
Add to ranking dict with idx
For each item in ranking dict, add the nearest neighbour to ranking dict that is not already inside
Repeat until no more change
"""

def repeat(seed):
    """Grow a ranking of explanation rows outward from ``seed`` via nearest neighbours.

    Reads the module-level ``df_exp`` (its ``nn_exp`` column).

    Args:
        seed: Iterable of explanation row positions; an item's position in
            ``seed`` becomes its initial rank.

    Returns:
        Dict mapping row position -> score, where the score is the negated
        rank, so items ranked earlier receive higher scores.
    """
    ranking = {item: idx for idx, item in enumerate(seed)}
    # Private copies of every neighbour list, because they are consumed with
    # pop() below and the lists stored in df_exp must stay intact.
    nns = {item: list(df_exp.nn_exp.iloc[item]) for item in range(len(df_exp))}
    
    while True:
        # Snapshot: only items ranked before this pass may contribute, and
        # `ranking` itself grows while the snapshot is iterated.
        old_ranking = dict(ranking)
#         print(len(old_ranking), old_ranking)
        
        for item in old_ranking.keys():
            # Consume this item's neighbours until one is not ranked yet,
            # append that one at the end of the ranking, then move on.
            while True:
                if len(nns[item]) == 0:
                    break
                n = nns[item].pop(0)
                if n not in ranking:
                    ranking[n] = len(ranking)
                    break
        # A full pass that added nothing means the ranking has converged.
        if len(ranking) == len(old_ranking):
            break
            
    # low rank -> high importance -> high score
    scores = {k: -v for k, v in ranking.items()}
    return scores

In [None]:
"""
Start with question
Get n most relevant/similar
+1 point each
Recurse up to iter times 
Final rank on points
(MAP=0.04)
"""

def recurse(seed, scores, iter, n=100, max_iter=2):
    """Accumulate votes for explanations reachable from ``seed``.

    Each of the first ``n`` seed items gains one point in ``scores``. Items
    whose score is still 1 afterwards are expanded: their neighbour list from
    the module-level ``df_exp.nn_exp`` (minus its first entry) becomes the
    seed of the next level. Expansion stops once ``iter`` reaches ``max_iter``.

    Args:
        seed: Sequence of explanation row positions.
        scores: Dict of row position -> points; updated in place.
        iter: Current recursion depth (name kept for keyword callers even
            though it shadows the builtin).
        n: Number of leading seed items to use.
        max_iter: Depth at which the function does nothing.

    Returns:
        None; results are written into ``scores``.
    """
    if iter >= max_iter:
        return

    top = seed[:n]
    for idx in top:
        scores[idx] = scores.get(idx, 0) + 1

    for idx in top:
        # Checked at visit time: deeper calls may already have raised it.
        if scores[idx] <= 1:
            neighbours = df_exp.nn_exp.iloc[idx][1:]  # skip nearest (itself)
            recurse(neighbours, scores, iter + 1, n, max_iter)
        
def get_ranking(scores_dict):
    """Convert scores into ranks: the highest score gets rank 0.

    Keys with equal scores keep their relative dict order.
    """
    ordered = sorted(scores_dict, key=scores_dict.get, reverse=True)
    return dict(zip(ordered, range(len(ordered))))
        
def test_recurse(i=0):
    """Print the recurse() ranking for training question ``i`` next to its gold facts."""
    # Time: about 1s to run 100x
    print("\nQuestion:", df_trn.q_reformat.iloc[i])
    nearest = df_trn.nn_exp.iloc[i]

    scores = {}
    recurse(seed=nearest, scores=scores, iter=0)

#     scores = repeat(nearest)  # test the output of repeat algo

    print("\nNum sentences seen:", len(scores), "\nScores:", scores)
    ranking = get_ranking(scores)

#     print("\nSample of initial seeds:")
#     for idx in nearest[:5]:
#         print("Pred rank:", ranking.get(idx), "\tText:", df_exp.iloc[idx].text)

    n_show = 10
    print("\nTop ranking:", n_show)
    best_first = sorted(scores, key=scores.get, reverse=True)
    print(df_exp.iloc[best_first[:n_show]].text.values)

    print("\nGold explanations")
    gold_idxs = df_trn.iloc[i].exp_idxs
    for gold_idx in gold_idxs:
        # ranking.get yields None for gold facts the algorithm never reached.
        print("Pred rank:", ranking.get(gold_idx), "\tText:", df_exp.iloc[gold_idx].text)
    
# Inspect ten randomly chosen training questions (sampled with replacement, unseeded).
for i in np.random.choice(len(df_trn), 10):
    test_recurse(i)
    print("#" * 100)

In [None]:
def simple_nn_ranking(df, idx):
    """Rank explanations by embedding distance to row ``idx`` of ``df``.

    Queries the module-level ``annoy_index`` for as many neighbours as there
    are rows in ``df_exp`` and returns the resulting id list.
    """
    # 1000x takes 3.4s
    query = df.embedding.iloc[idx]
    n_facts = len(df_exp)
    return annoy_index.get_nns_by_vector(query, n=n_facts)

def test_simple_nn_ranking(i=0):
    """Print the nearest-neighbour ranking for training question ``i`` next to its gold facts."""
    # Time: about 1s to run 100x
    print("\nQuestion:", df_trn.q_reformat.iloc[i])
    ranking = simple_nn_ranking(df_trn, i)

    n_show = 10
    print("\nTop ranking:", n_show)
    top_rows = df_exp.iloc[ranking[:n_show]]
    print(top_rows.text.values)

    print("\nGold explanations")
    gold_idxs = df_trn.iloc[i].exp_idxs
    for gold_idx in gold_idxs:
        # list.index raises ValueError if a gold fact is absent from the ranking.
        print("Pred rank:", ranking.index(gold_idx), "\tText:", df_exp.iloc[gold_idx].text)
    
# Inspect ten randomly chosen training questions (sampled with replacement, unseeded).
for i in np.random.choice(len(df_trn), 10):
    test_simple_nn_ranking(i)
    print("#" * 100)

In [None]:
# # Predicting combined embedding of answer sentences then nearest neighbour ranking (MAP=0.17)
# def make_xy(df):
#     embs_q = []
#     concat_sentences = []
    
#     def concat(sentences):
#         # assuming sentences have no punctuation at the end
#         return ". ".join([s.strip() for s in sentences])
    
#     for i in range(len(df)):
#         idxs_exp = df.exp_idxs.iloc[i]
#         sentences = df_exp.text.iloc[idxs_exp].values
#         if len(sentences) == 0:
#             continue
#         concat_sentences.append(concat(sentences))
#         embs_q.append(df.embedding.iloc[i])
        
#     embs_concat_sentences = embed_texts(concat_sentences)
    
#     x = np.stack(embs_q)
#     y = embs_concat_sentences
#     print(x.shape, y.shape)
#     return x, y
    
# # def test_make_xy():
# #     make_xy(df_trn[:10])
    
# # test_make_xy()

# x_trn, y_trn = make_xy(df_trn)
# x_dev, y_dev = make_xy(df_dev)

# from sklearn import linear_model, metrics

# model = linear_model.Ridge()
# model.fit(x_trn, y_trn)

# import scipy

# def cosine_metric(y_true, y_pred):
#     assert len(y_true) == len(y_pred)
    
#     pairwise_matrix = metrics.pairwise.cosine_similarity(y_true, y_pred)
#     return np.mean([pairwise_matrix[i][i] for i in range(len(y_true))])

#     # Equivalent
#     # return 1 - np.mean([scipy.spatial.distance.cosine(a, b) for a, b in zip(y_true, y_pred)])

# print("Cosine similarity from question to embeds of concat sentences:", cosine_metric(x_dev, y_dev))
# print("Cosine similarity from pred emb to embeds of concat sentences:", cosine_metric(y_dev, model.predict(x_dev)))

# def pred_emb_nn_ranking(df, idx, n=5000):
#     inp = [df.embedding.iloc[idx]]
#     out = model.predict(inp)
#     pred_emb = out[0]
#     return annoy_index.get_nns_by_vector(pred_emb, n=len(df_exp))

In [None]:
from sklearn import feature_extraction, metrics

def get_tfidf_ranking(df, field_q="q_reformat", field_e="text"):
    """Rank all explanations for every question by TF-IDF cosine distance.

    Explanations are read from the module-level ``df_exp``.

    Args:
        df: Question DataFrame containing column ``field_q``.
        field_q: Question column; cells are strings or lists of strings.
        field_e: Explanation column of ``df_exp``; same cell types.

    Returns:
        One list per question holding the positional indices of all
        explanations, nearest first. Empty list when ``df`` has no rows.

    Raises:
        TypeError: If a column holds something other than str or list cells.
    """
    # "q_reformat" instead of "Question"      MAP +0.07 (0.24 -> 0.31)
    # tfidf stop_words="english"              MAP +0.01 (0.31 -> 0.32)
    # field_q and field_e = "lemmas"          MAP +0.08 (0.32 -> 0.40)
    # tfidf binary=True                       MAP +0.03 (0.40 -> 0.43)
    # tfidf ngram_range=(1,2)                 MAP -0.05 (0.43 -> 0.38)
    # tfidf ngram_range=(1,3)                 MAP -0.07 (0.43 -> 0.36)

    def preprocess(lst):
        # Turn a column into plain strings; token lists are space-joined.
        if not lst:
            return []
        first = lst[0]
        if isinstance(first, str):
            return lst
        if isinstance(first, list):
            return [" ".join(sublst) for sublst in lst]
        raise TypeError("unknown data type")

    q = preprocess(df[field_q].tolist())
    if not q:
        # No questions: nothing to rank, and it avoids indexing an empty list.
        return []
    e = preprocess(df_exp[field_e].tolist())

    vectorizer = feature_extraction.text.TfidfVectorizer(
        stop_words="english", binary=True)
    # Fit on questions and explanations together so both share one vocabulary.
    vectorizer.fit(q + e)
    X_q = vectorizer.transform(q)
    X_e = vectorizer.transform(e)
    X_dist = metrics.pairwise.cosine_distances(X_q, X_e)

    # argsort of each distance row gives explanation indices, closest first.
    return [list(np.argsort(distances)) for distances in X_dist]

def add_missing_idxs(old):
    """Extend a partial ranking so it covers every row of ``df_exp``.

    Indices absent from ``old`` are appended in random order (via
    ``np.random.shuffle``); the existing prefix is left untouched.

    Args:
        old: Sequence of distinct explanation row positions.

    Returns:
        New list starting with ``old`` and containing each position of
        ``df_exp`` exactly once.
    """
    seen = set(old)
    every_idx = set(np.arange(0, len(df_exp)))
    leftover = list(every_idx - seen)
    np.random.shuffle(leftover)
    completed = list(old) + leftover
    # Sanity checks: full coverage, untouched prefix, no foreign indices.
    assert len(completed) == len(df_exp), (len(completed), len(df_exp), len(leftover))
    assert all([a == b  for a, b in zip(completed[:len(old)], old)])
    assert set(completed) == every_idx
    return completed

from tqdm import tqdm

def format_predict_line(questionID, explanation_uid):
    """Return one prediction line: question id, a tab, then the explanation uid."""
    # Adapted from tfidf baseline script
    template = "{}\t{}"
    return template.format(questionID, explanation_uid)

def get_preds(df, df_exp):
    """Build prediction lines ranking every explanation for every question.

    Args:
        df: Question DataFrame with ``questionID`` and ``lemmas`` columns.
        df_exp: Explanation DataFrame with a ``uid`` column. Note that
            ``get_tfidf_ranking`` and ``add_missing_idxs`` read the
            module-level ``df_exp``, so this should be that same frame.

    Returns:
        List of ``"<questionID>\\t<uid>"`` strings, grouped by question in row
        order, with explanations listed best first.
    """
    # Only the lemma-based TF-IDF ranking is used. An earlier extra call on
    # the raw text fields was computed and immediately overwritten, so it is
    # no longer made.
    tfidf_ranking = get_tfidf_ranking(df, "lemmas", "lemmas")

    # Alternative rankers tried in place of TF-IDF (author's noted scores):
    #   repeat(df.nn_exp.iloc[i]), sorted by score              (23.7)
    #   recurse(seed=df.nn_exp.iloc[i], scores={}, iter=0)      (0.07)
    #   simple_nn_ranking(df, i)                                (0.24)
    #   pred_emb_nn_ranking(df, i)                              (0.17)

    # Hoisted out of the loops: plain arrays avoid a pandas lookup per line.
    n_explanations = len(df_exp)
    uids = df_exp.uid.values
    question_ids = df.questionID.values

    preds = []
    for questionID, ranked_idxs in tqdm(
            zip(question_ids, tfidf_ranking), total=len(df)):
        if len(ranked_idxs) < n_explanations:
            # Partial rankings are padded so every explanation is listed.
            ranked_idxs = add_missing_idxs(ranked_idxs)
        preds.extend(
            format_predict_line(questionID, uids[idx]) for idx in ranked_idxs)
    return preds
        
def test_get_preds():
    """Smoke test: build and print predictions for the first ten dev questions."""
    sample_preds = get_preds(df_dev[:10], df_exp)
    print("Num preds:", len(sample_preds))
    print("Prediction lines:", sample_preds)
    
test_get_preds()

def write_preds(preds, path="predict.txt"):
    """Write prediction lines to ``path``, newline-separated, no trailing newline.

    An existing file at ``path`` is overwritten.
    """
    with open(path, mode="w") as out_file:
        text = "\n".join(preds)
        out_file.write(text)
        
def test_write_preds():
    """Smoke test: write two identical lines to temp.txt and echo the file back."""
    path = "temp.txt"
    preds = ['VASoL_2008_3_26\t14de-6699-6b2e-a5d1'] * 2
    write_preds(preds, path)
    with open(path) as written:
        for line in written:
            # repr() makes the tab and newline characters visible.
            print(repr(line))
            
test_write_preds()

# Write predictions for one split, then score them against the matching gold file.
# The active pair below is the train split; switch both lines together.
write_preds(get_preds(df_trn, df_exp))  # for train set
# write_preds(get_preds(df_dev, df_exp))  # for dev set
# write_preds(get_preds(df_test, df_exp))  # for test set
# NOTE(review): write_preds writes predict.txt to the current working directory,
# while the evaluator reads /content/predict.txt -- presumably the notebook runs
# from /content; confirm.
!cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Train.tsv /content/predict.txt
# !cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Dev.tsv /content/predict.txt