##TextGraphs

###Preprocessing

In [1]:
# Setup
from pathlib import Path
import spacy 
import pandas as pd
import sys
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import re

path_data = "tg2019task/worldtree_corpus_textgraphs2019sharedtask_withgraphvis"
if not Path(path_data).exists():
    # Download data
    !git clone -q https://github.com/umanlp/tg2019task.git
    !cd tg2019task/ && make dataset
    # Run baseline tfidf (expected MAP: 0.054)
    # The baseline only submits the top n=10 sentences per question by default
    # Increasing n to 5000 results in MAP >= 0.24 (cut short due to slow processing)
    !cd {path_data} && python ../baseline_tfidf.py annotation/expl-tablestore-export-2017-08-25-230344/tables questions/ARC-Elementary+EXPL-Dev.tsv > predict.txt
    !cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Dev.tsv predict.txt

sys.path.append("tg2019task")
from baseline_tfidf import read_explanations

Please note that this distribution is still subject to the terms set forth in the included license.
See the full license for specific details: EULA AI2 Mercury Dataset 01012018.docx
curl -sL -o "worldtree_corpus_textgraphs2019sharedtask_withgraphvis.zip" 'http://cognitiveai.org/dist/worldtree_corpus_textgraphs2019sharedtask_withgraphvis.zip'
sha256sum -c "worldtree_corpus.sha256"
worldtree_corpus_textgraphs2019sharedtask_withgraphvis.zip: OK
unzip -o worldtree_corpus_textgraphs2019sharedtask_withgraphvis.zip
Archive:  worldtree_corpus_textgraphs2019sharedtask_withgraphvis.zip
   creating: worldtree_corpus_textgraphs2019sharedtask_withgraphvis/
  inflating: worldtree_corpus_textgraphs2019sharedtask_withgraphvis/EULA AI2 Mercury Dataset 01012018.docx  
  inflating: worldtree_corpus_textgraphs2019sharedtask_withgraphvis/README.txt  
   creating: worldtree_corpus_textgraphs2019sharedtask_withgraphvis/annotation/
   creating: worldtree_corpus_textgraphs2019sharedtask_withgraphvis/annotation

In [2]:
# Embedding and Spacy tokenizer
def embed_texts(texts):
    # Wrap with Keras model for convenient batching and progress bar
    # Adapted from "Keras + Universal Sentence Encoder = Transfer Learning for text data"
    texts = np.asarray(texts)
    tf.keras.backend.clear_session()
    
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
    embed = hub.Module(module_url)
    
    def UniversalEmbedding(x):
        return embed(tf.squeeze(tf.cast(x, tf.string)), 
            signature="default", as_dict=True)["default"]
    
    inp = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
    out = tf.keras.layers.Lambda(UniversalEmbedding)(inp)
    model = tf.keras.Model(inp, out)
    
    with tf.Session() as sess:
        tf.keras.backend.set_session(sess)
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
    
        embeds = model.predict(texts, batch_size=128, verbose=True)
        
    tf.keras.backend.clear_session()  # avoid session closed error
    return embeds

def test_embed_texts():
    print(embed_texts(["hello", "bye"]))
    
test_embed_texts()
    
def preprocess_texts(texts, remove_stop=True, remove_punct=True):
    nlp = spacy.load("en_core_web_sm")
    tokens = []
    lemmas = []
    for doc in nlp.pipe(texts, disable=["ner", "tagger", "parser"]):
        _tokens = []
        _lemmas = []
        for token in doc:
            if token.is_stop and remove_stop:
                continue
            if token.is_punct and remove_punct:
                continue
            _tokens.append(token.text)
            _lemmas.append(token.lemma_)
        tokens.append(_tokens)
        lemmas.append(_lemmas)

    return tokens, lemmas

def test_preprocess_texts():
    print(preprocess_texts(["Which of these will most likely increase?", "Habitats support animals."]))
    
test_preprocess_texts()

[[-0.0039425  -0.06877444  0.04911328 ...  0.0048102  -0.01451661
   0.0276317 ]
 [ 0.01008849 -0.03508965  0.05813815 ...  0.07674593 -0.04790611
  -0.01093433]]
([['likely', 'increase'], ['Habitats', 'support', 'animals']], [['likely', 'increase'], ['Habitats', 'support', 'animal']])


In [3]:
# Extract fact sentences
path_tables = Path(path_data).joinpath("annotation/expl-tablestore-export-2017-08-25-230344/tables")

def get_df_explanations(path_tables):
    explanations = []
    for p in path_tables.iterdir():
        explanations += read_explanations(str(p))
    df = pd.DataFrame(explanations, columns=("uid", "text"))  # 3 duplicate uids
    df = df.drop_duplicates("uid")
    tokens, lemmas = preprocess_texts(df.text)
    embeds = [row for row in embed_texts(df.text.tolist())]
    df["tokens"], df["lemmas"], df["embedding"] = tokens, lemmas, embeds
    print(df.shape)
    return df

df_exp = get_df_explanations(path_tables)
uid2idx = {uid: idx for idx, uid in enumerate(df_exp.uid.values)}
print(list(uid2idx.items())[:3])
df_exp.head()



(4947, 5)
[('bb32-0bc0-3629-6bca', 0), ('1966-99de-7765-39de', 1), ('73df-7e6e-db00-ae55', 2)]


Unnamed: 0,uid,text,tokens,lemmas,embedding
0,bb32-0bc0-3629-6bca,a vehicle for something allows; enables that s...,"[vehicle, allows, enables, occur]","[vehicle, allow, enable, occur]","[0.05257905, 0.013029384, 0.08465949, -0.06280..."
1,1966-99de-7765-39de,adding heat to something kills viruses; bacter...,"[adding, heat, kills, viruses, bacteria]","[add, heat, kill, virus, bacterium]","[-0.03287766, 0.05129391, 0.10405169, -0.01251..."
2,73df-7e6e-db00-ae55,some adult animals lay eggs,"[adult, animals, lay, eggs]","[adult, animal, lie, egg]","[0.051239617, 0.013141562, 0.054617207, -0.022..."
3,f992-5698-76aa-c6de,a warm front is when warm air mass rises and p...,"[warm, warm, air, mass, rises, passes, cold, a...","[warm, warm, air, mass, rise, pass, cold, air,...","[-0.0671587, 0.102374874, -0.0017190779, -0.00..."
4,3ad2-6e55-7ae1-f182,"an airplane flies at high altitudes, between 5...","[airplane, flies, high, altitudes, 5000, 30000...","[airplane, fly, high, altitude, 5000, 30000, ft]","[0.0014613355, -0.0060601756, 0.007189139, 0.0..."


In [4]:
# Format question examples
path_questions = Path(path_data).joinpath("questions")

def extract_explanation(exp_string):
    if type(exp_string) != str:
        return [], []
    uids = []
    roles = []
    for uid_and_role in exp_string.split():
        uid, role = uid_and_role.split("|")
        uids.append(uid)
        roles.append(role)
    return uids, roles

def split_question(q_string):
    # split on option parantheses eg "(A)"
    return re.compile("\(.\)").split(q_string)

def test_split_question():
    print(split_question('Which process? (A) flying (B) talking (C) seeing (D) reproducing (E) something'))
    print(split_question('Which process? (A) flying (B) talking (C) seeing (D) reproducing'))
    print(split_question('Which process? (A) flying (B) talking (C) seeing'))
    
test_split_question()

def get_questions(path_questions, fname):
    df = pd.read_csv(Path(path_questions).joinpath(fname), sep="\t")
    
    # Reformat question
    q_reformat = []
    questions = df.Question.values
    answers = df["AnswerKey.1"].values
    char2idx = {char: idx for idx, char in enumerate(list("ABCDE"))}
    
    for i in range(len(df)):
        q, *options = split_question(questions[i])
        idx_option = char2idx[answers[i]]
        q_reformat.append(" ".join([q.strip(), options[idx_option].strip()]))
    df["q_reformat"] = q_reformat

    # Preprocess texts
    tokens, lemmas = preprocess_texts(df.q_reformat)
    embeds = [row for row in embed_texts(df.q_reformat.tolist())]
    df["tokens"], df["lemmas"], df["embedding"] = tokens, lemmas, embeds

    # Get explanation uids and roles
    exp_uids = []
    exp_roles = []
    exp_idxs = []
    for exp_string in df.explanation.values:
        _uids, _roles = extract_explanation(exp_string)
        uids = []
        roles = []
        idxs = []
        assert len(_uids) == len(_roles)
        for i in range(len(_uids)):
            if _uids[i] not in uid2idx:
                continue
            uids.append(_uids[i])
            roles.append(_roles[i])
            idxs.append(uid2idx[_uids[i]])
        exp_uids.append(uids)
        exp_roles.append(roles)
        exp_idxs.append(idxs)
    df["exp_uids"], df["exp_roles"], df["exp_idxs"] = exp_uids, exp_roles, exp_idxs

    print(df.shape)
    return df

df_trn = get_questions(path_questions, "ARC-Elementary+EXPL-Train.tsv")
df_dev = get_questions(path_questions, "ARC-Elementary+EXPL-Dev.tsv")
df_test = get_questions(path_questions, "ARC-Elementary+EXPL-Test-Masked.tsv")
df_trn.sample(3, random_state=42)

['Which process? ', ' flying ', ' talking ', ' seeing ', ' reproducing ', ' something']
['Which process? ', ' flying ', ' talking ', ' seeing ', ' reproducing']
['Which process? ', ' flying ', ' talking ', ' seeing']
(1190, 25)
(264, 25)
(1247, 25)


Unnamed: 0,questionID,originalQuestionID,totalPossiblePoint,AnswerKey,isMultipleChoiceQuestion,includesDiagram,examName,grade,year,AnswerKey.1,Question,explanation,notes,explanation annotators,flags,subject,category,fold,q_reformat,tokens,lemmas,embedding,exp_uids,exp_roles,exp_idxs
390,NYSEDREGENTS_2005_4_11,11,1,B,1,0,NYSEDREGENTS,4,2005,B,Animals get energy for growth and repair from ...,7d29-0a2c-6959-b1dd|CENTRAL 86d3-723e-62b0-6df...,Nutrients are required for growth and energy. ...,SM,SUCCESS,,Train,Easy,Animals get energy for growth and repair from ...,"[Animals, energy, growth, repair, food]","[Animals, energy, growth, repair, food]","[0.033545215, 0.051246587, 0.12048633, -0.0278...","[7d29-0a2c-6959-b1dd, 86d3-723e-62b0-6dfa, 12e...","[CENTRAL, LEXGLUE, CENTRAL, CENTRAL, CENTRAL, ...","[3761, 2102, 3764, 3623, 3764, 4661]"
247,Mercury_SC_417675,417675,1,B,1,0,Mercury,4,2015,B,"At night, the Moon is the brightest object in ...",a423-40e8-3886-4df5|CENTRAL a538-175f-9223-d11...,,EW SM,SUCCESS,,Train,Easy,"At night, the Moon is the brightest object in ...","[night, Moon, brightest, object, sky, Moon, li...","[night, Moon, bright, object, sky, Moon, light...","[0.02291864, 0.07425138, 0.009133951, -0.04619...","[a423-40e8-3886-4df5, a538-175f-9223-d117, 933...","[CENTRAL, CENTRAL, LEXGLUE]","[139, 3668, 1951]"
260,Mercury_SC_408661,408661,1,D,1,0,Mercury,4,2015,D,Which would cause the most soil to erode from ...,4c93-e1e3-546a-8cfa|CENTRAL a52a-1b59-1f8c-028...,,EW,SUCCESS,,Train,Easy,Which would cause the most soil to erode from ...,"[cause, soil, erode, river, bank, days, water,...","[cause, soil, erode, river, bank, day, water, ...","[-0.027729973, -0.009487403, -0.005197015, -0....","[4c93-e1e3-546a-8cfa, a52a-1b59-1f8c-028e, 988...","[CENTRAL, GROUNDING, GROUNDING, GROUNDING, CEN...","[3700, 664, 1111, 403, 3193]"


In [5]:
# Build unique lemma/token data
def flatten(nested_list):
    return [item for lst in nested_list for item in lst]

def get_flattened_items(dfs, field):
    all_items = []
    for df in dfs:
        all_items.extend(flatten(df[field]))
    print(len(all_items))
    return all_items

all_lemmas = get_flattened_items([df_trn, df_dev, df_test, df_exp], "lemmas")
# all_tokens = get_flattened_items([df_trn, df_dev, df_test, df_exp], "tokens")
    
unique_lemmas = sorted(list(set(all_lemmas)))
print(len(unique_lemmas))
lemma2id = {lemma:idx for idx, lemma in enumerate(unique_lemmas)}
df_lemma = pd.DataFrame({
    "node": unique_lemmas,
    "embedding": [row for row in embed_texts(unique_lemmas)]
})

# Extract lemma node ids for every sentence
def get_nodes(lemmas):
    return [lemma2id[lemma] for lemma in lemmas]

def add_nodes(df):
    df["nodes"] = df.lemmas.apply(get_nodes)

add_nodes(df_trn)
add_nodes(df_dev)
add_nodes(df_test)
add_nodes(df_exp)

def test_nodes(i=0):
    print(df_trn.q_reformat.iloc[i])
    print(df_lemma.iloc[df_trn.nodes.iloc[i]])
    
test_nodes()

52866
4317
Which of these will most likely increase a plant population in a habitat? more days of rain showers
            node                                          embedding
2911      likely  [-0.0019083042, -0.022864403, 0.04777484, -0.0...
2743    increase  [-0.033307906, -0.0016311363, 0.016896605, -0....
3320       plant  [0.05033428, -0.051691182, 0.043068256, 0.0195...
3355  population  [-0.057624135, -0.074326694, 0.011713906, -0.0...
2611     habitat  [0.0028798461, -0.00958787, 0.029946582, -0.04...
2075         day  [0.013448668, -0.016635928, 0.027438134, 0.014...
3463        rain  [-0.021921862, -0.055525396, -0.019448007, 0.0...
3748      shower  [-0.050143655, -0.032492545, -0.020014787, 0.0...


In [6]:
# Nearest neighbours from embedding
!pip install -q annoy
from annoy import AnnoyIndex
annoy_index = AnnoyIndex(512, "angular")
for i in range(len(df_exp)):
    annoy_index.add_item(i, df_exp.embedding.iloc[i])
annoy_index.build(10)

# Add nearest neighbour explanation ids for each embedding
def add_nn(df, n=100):
    df["nn_exp"] = df.embedding.apply(
        lambda emb: annoy_index.get_nns_by_vector(emb, n))
    
add_nn(df_trn)
add_nn(df_dev)
add_nn(df_test)
add_nn(df_exp)

def test_annoy(i=0):
    print(df_trn.q_reformat.iloc[i])
    print(df_exp.iloc[df_trn.nn_exp.iloc[i]].text.values[:3])
    
test_annoy()

[?25l[K     |▌                               | 10kB 20.5MB/s eta 0:00:01[K     |█                               | 20kB 1.7MB/s eta 0:00:01[K     |█▌                              | 30kB 2.5MB/s eta 0:00:01[K     |██                              | 40kB 1.6MB/s eta 0:00:01[K     |██▋                             | 51kB 2.0MB/s eta 0:00:01[K     |███                             | 61kB 2.4MB/s eta 0:00:01[K     |███▋                            | 71kB 2.8MB/s eta 0:00:01[K     |████▏                           | 81kB 3.1MB/s eta 0:00:01[K     |████▋                           | 92kB 3.5MB/s eta 0:00:01[K     |█████▏                          | 102kB 2.7MB/s eta 0:00:01[K     |█████▋                          | 112kB 2.7MB/s eta 0:00:01[K     |██████▏                         | 122kB 2.7MB/s eta 0:00:01[K     |██████▊                         | 133kB 2.7MB/s eta 0:00:01[K     |███████▏                        | 143kB 2.7MB/s eta 0:00:01[K     |███████▊                  

In [7]:
# Example question
q = df_trn.sample(1, random_state=42)
q

Unnamed: 0,questionID,originalQuestionID,totalPossiblePoint,AnswerKey,isMultipleChoiceQuestion,includesDiagram,examName,grade,year,AnswerKey.1,Question,explanation,notes,explanation annotators,flags,subject,category,fold,q_reformat,tokens,lemmas,embedding,exp_uids,exp_roles,exp_idxs,nodes,nn_exp
390,NYSEDREGENTS_2005_4_11,11,1,B,1,0,NYSEDREGENTS,4,2005,B,Animals get energy for growth and repair from ...,7d29-0a2c-6959-b1dd|CENTRAL 86d3-723e-62b0-6df...,Nutrients are required for growth and energy. ...,SM,SUCCESS,,Train,Easy,Animals get energy for growth and repair from ...,"[Animals, energy, growth, repair, food]","[Animals, energy, growth, repair, food]","[0.033545215, 0.051246587, 0.12048633, -0.0278...","[7d29-0a2c-6959-b1dd, 86d3-723e-62b0-6dfa, 12e...","[CENTRAL, LEXGLUE, CENTRAL, CENTRAL, CENTRAL, ...","[3761, 2102, 3764, 3623, 3764, 4661]","[184, 2289, 2604, 3539, 2470]","[3623, 3764, 3763, 3761, 3637, 25, 3770, 4596,..."


In [8]:
# Get facts for question
e = df_exp.iloc[q.exp_idxs.iloc[0]]
e

Unnamed: 0,uid,text,tokens,lemmas,embedding,nodes,nn_exp
3764,7d29-0a2c-6959-b1dd,an animal needs to eat food for nutrients,"[animal, needs, eat, food, nutrients]","[animal, need, eat, food, nutrient]","[0.00699302, 0.03030177, 0.08051441, -0.008129...","[1537, 3114, 2248, 2470, 3160]","[3761, 4596, 3764, 3763, 3770, 4404, 949, 3623..."
2104,86d3-723e-62b0-6dfa,to repair means to heal,"[repair, means, heal]","[repair, mean, heal]","[0.04778111, 0.035651844, 0.076263756, -0.0752...","[3539, 3000, 2638]","[2102, 1877, 2199, 2050, 2159, 2200, 2204, 220..."
3767,12ed-f3da-04db-3ddc,an animal requires nutrients to grow and heal,"[animal, requires, nutrients, grow, heal]","[animal, require, nutrient, grow, heal]","[0.061348762, 0.046158213, 0.10950935, -0.0289...","[1537, 3552, 3160, 2602, 2638]","[3764, 3763, 3770, 3761, 4596, 3832, 949, 3822..."
3626,5753-2812-04c2-af70,food is a source of energy for animals; plants,"[food, source, energy, animals, plants]","[food, source, energy, animal, plant]","[-0.0064824903, 0.029853553, 0.11760405, -0.03...","[2470, 3831, 2289, 1537, 3320]","[3623, 3637, 4596, 3645, 3625, 3770, 3763, 364..."
3767,12ed-f3da-04db-3ddc,an animal requires nutrients to grow and heal,"[animal, requires, nutrients, grow, heal]","[animal, require, nutrient, grow, heal]","[0.061348762, 0.046158213, 0.10950935, -0.0289...","[1537, 3552, 3160, 2602, 2638]","[3764, 3763, 3770, 3761, 4596, 3832, 949, 3822..."
4664,e6fb-12a1-939d-e850,one can get something from a source,[source],[source],"[0.028167987, 0.066189945, 0.07729534, -0.0110...",[3831],"[4661, 1951, 3661, 3915, 2224, 3340, 3849, 198..."


In [9]:
# Get lemma token nodes for first sentence
n = df_lemma.iloc[e.nodes.iloc[0]]
n

Unnamed: 0,node,embedding
1537,animal,"[0.060670264, -0.020717913, 0.08487106, -0.045..."
3114,need,"[0.04453363, -0.076736875, -0.005659547, -0.05..."
2248,eat,"[-0.0055706496, -0.037961073, 0.03723368, 0.01..."
2470,food,"[-0.012542172, -0.03672416, 0.040720686, 0.008..."
3160,nutrient,"[-0.032169994, 0.0069659757, 0.034218255, -0.0..."


###Training

####Ranking

In [0]:
"""
Start with question
Get seed, n most relevant
Add to ranking dict with idx
For each item in ranking dict, add the nearest neighbour to ranking dict that is not already inside
Repeat until no more change
"""

def repeat(seed):
    ranking = {item: idx for idx, item in enumerate(seed)}
    nns = {item: list(df_exp.nn_exp.iloc[item]) for item in range(len(df_exp))}
    
    while True:
        old_ranking = dict(ranking)
#         print(len(old_ranking), old_ranking)
        
        for item in old_ranking.keys():
            while True:
                if len(nns[item]) == 0:
                    break
                n = nns[item].pop(0)
                if n not in ranking:
                    ranking[n] = len(ranking)
                    break
        if len(ranking) == len(old_ranking):
            break
            
    # low rank -> high importance -> high score
    scores = {k: -v for k, v in ranking.items()}
    return scores

In [11]:
"""
Start with question
Get n most relevant/similar
+1 point each
Recurse up to iter times 
Final rank on points
(MAP=0.04)
"""

def recurse(seed, scores, iter, n=100, max_iter=2):
    if iter < max_iter:    
        for idx in seed[:n]:
            if idx in scores:
                scores[idx] += 1
            else:
                scores[idx] = 1

        for idx in seed[:n]:
            if scores[idx] > 1:
                continue
            new_seed = df_exp.nn_exp.iloc[idx][1:]  # skip nearest (itself)
            recurse(new_seed, scores, iter+1, n, max_iter)
        
def get_ranking(scores_dict):
    idxs = list(scores_dict.keys())
    idxs = sorted(idxs, key=lambda idx: scores_dict[idx], reverse=True)
    return {idx: rank for rank, idx in enumerate(idxs)}
        
def test_recurse(i=0):
    # Time: about 1s to run 100x
    print("\nQuestion:", df_trn.q_reformat.iloc[i])
    nearest = df_trn.nn_exp.iloc[i]
    
    scores = {}
    recurse(seed=nearest, scores=scores, iter=0)
    
#     scores = repeat(nearest)  # test the output of repeat algo

    print("\nNum sentences seen:", len(scores), "\nScores:", scores)
    ranking = get_ranking(scores)
    
#     print("\nSample of initial seeds:")
#     for idx in nearest[:5]:
#         print("Pred rank:", ranking.get(idx), "\tText:", df_exp.iloc[idx].text)
    
    n_show = 10
    print("\nTop ranking:", n_show)
    idxs = list(scores.keys())
    print(df_exp.iloc[sorted(idxs, key=lambda idx: scores[idx], reverse=True)[:n_show]].text.values)
    
    print("\nGold explanations")
    for idx in df_trn.iloc[i].exp_idxs:
        print("Pred rank:", ranking.get(idx), "\tText:", df_exp.iloc[idx].text)
    
for i in np.random.choice(len(df_trn), 10):
    test_recurse(i)
    print("#" * 100)


Question: Which stage in the reptile life cycle is most like the birth of a lion cub? hatching from an egg

Num sentences seen: 530 
Scores: {3757: 4, 3754: 7, 3231: 5, 3747: 7, 134: 7, 189: 7, 2955: 7, 1840: 4, 11: 8, 4602: 3, 2381: 6, 2: 5, 3758: 4, 30: 6, 2902: 4, 7: 6, 739: 4, 138: 1, 20: 4, 2976: 4, 3805: 4, 3052: 4, 3750: 4, 2339: 6, 133: 5, 3261: 3, 95: 5, 2362: 4, 144: 3, 2347: 5, 75: 4, 4690: 3, 2826: 4, 34: 4, 3303: 3, 3948: 6, 86: 3, 3959: 3, 36: 3, 2924: 4, 3061: 2, 3243: 4, 3697: 5, 4084: 4, 2835: 3, 3749: 3, 39: 4, 159: 4, 3219: 1, 329: 5, 3201: 3, 1290: 3, 2928: 2, 2790: 1, 3372: 3, 2824: 4, 2789: 5, 4063: 2, 3319: 4, 688: 3, 4136: 3, 17: 2, 3992: 3, 4180: 4, 3751: 2, 1392: 1, 1335: 4, 3274: 3, 2822: 5, 4419: 2, 1137: 4, 4874: 2, 1099: 5, 3688: 3, 3030: 3, 9: 3, 1830: 2, 883: 2, 2335: 1, 3396: 2, 2334: 2, 1098: 2, 328: 6, 4877: 3, 3229: 3, 3656: 4, 4382: 1, 2858: 3, 3175: 5, 822: 2, 8: 4, 685: 3, 2868: 2, 362: 3, 851: 5, 3239: 2, 26: 3, 4563: 3, 3318: 2, 1150: 3, 2975: 

In [12]:
def simple_nn_ranking(df, idx):
    # 1000x takes 3.4s
    return annoy_index.get_nns_by_vector(df.embedding.iloc[idx], n=len(df_exp))

def test_simple_nn_ranking(i=0):
    # Time: about 1s to run 100x
    print("\nQuestion:", df_trn.q_reformat.iloc[i])
    ranking = simple_nn_ranking(df_trn, i)
    
    n_show = 10
    print("\nTop ranking:", n_show)
    print(df_exp.iloc[ranking[:n_show]].text.values)
    
    print("\nGold explanations")
    for idx in df_trn.iloc[i].exp_idxs:
        print("Pred rank:", ranking.index(idx), "\tText:", df_exp.iloc[idx].text)
    
for i in np.random.choice(len(df_trn), 10):
    test_simple_nn_ranking(i)
    print("#" * 100)


Question: A function of a plant's roots is to absorb water and minerals.

Top ranking: 10
['roots are a vehicle for absorbing water and nutrients from soil into the plant'
 'plants absorb nutrients; water; oxygen from soil into themselves through their roots'
 'xylem carries water from the roots of a plant to the leaves of a plant'
 'a plant requires photosynthesis to grow; survive'
 'In the erosion process roots of plants are an inhibitor'
 'plants are a source of oxygen through photosynthesis'
 'plants control the amount of water in their leaves through transpiration'
 'a plant stem is the vehicle for transporting water and food from roots to the rest of the plant'
 'soil contains nutrients for plants'
 'a plants; living things require water for survival; to grow']

Gold explanations
Pred rank: 1 	Text: plants absorb nutrients; water; oxygen from soil into themselves through their roots
Pred rank: 0 	Text: roots are a vehicle for absorbing water and nutrients from soil into the plan

In [0]:
# # Predicting combined embedding of answer sentences then nearest neighbour ranking (MAP=0.17)
# def make_xy(df):
#     embs_q = []
#     concat_sentences = []
    
#     def concat(sentences):
#         # assuming sentences have no punctuation at the end
#         return ". ".join([s.strip() for s in sentences])
    
#     for i in range(len(df)):
#         idxs_exp = df.exp_idxs.iloc[i]
#         sentences = df_exp.text.iloc[idxs_exp].values
#         if len(sentences) == 0:
#             continue
#         concat_sentences.append(concat(sentences))
#         embs_q.append(df.embedding.iloc[i])
        
#     embs_concat_sentences = embed_texts(concat_sentences)
    
#     x = np.stack(embs_q)
#     y = embs_concat_sentences
#     print(x.shape, y.shape)
#     return x, y
    
# # def test_make_xy():
# #     make_xy(df_trn[:10])
    
# # test_make_xy()

# x_trn, y_trn = make_xy(df_trn)
# x_dev, y_dev = make_xy(df_dev)

# from sklearn import linear_model, metrics

# model = linear_model.Ridge()
# model.fit(x_trn, y_trn)

# import scipy

# def cosine_metric(y_true, y_pred):
#     assert len(y_true) == len(y_pred)
    
#     pairwise_matrix = metrics.pairwise.cosine_similarity(y_true, y_pred)
#     return np.mean([pairwise_matrix[i][i] for i in range(len(y_true))])

#     # Equivalent
#     # return 1 - np.mean([scipy.spatial.distance.cosine(a, b) for a, b in zip(y_true, y_pred)])

# print("Cosine similarity from question to embeds of concat sentences:", cosine_metric(x_dev, y_dev))
# print("Cosine similarity from pred emb to embeds of concat sentences:", cosine_metric(y_dev, model.predict(x_dev)))

# def pred_emb_nn_ranking(df, idx, n=5000):
#     inp = [df.embedding.iloc[idx]]
#     out = model.predict(inp)
#     pred_emb = out[0]
#     return annoy_index.get_nns_by_vector(pred_emb, n=len(df_exp))

In [14]:
from sklearn import feature_extraction, metrics

def get_tfidf_ranking(df, field_q="q_reformat", field_e="text"):
    # "q_reformat" instead of "Question"      MAP +0.07 (0.24 -> 0.31)
    # tfidf stop_words="english"              MAP +0.01 (0.31 -> 0.32)
    # field_q and field_e = "lemmas"          MAP +0.08 (0.32 -> 0.40)
    # tfidf binary=True                       MAP +0.03 (0.40 -> 0.43)
    # tfidf ngram_range=(1,2)                 MAP -0.05 (0.43 -> 0.38)
    # tfidf ngram_range=(1,3)                 MAP -0.07 (0.43 -> 0.36)
    
    def preprocess(lst):
        if type(lst[0]) == str:
            return lst
        elif type(lst[0]) == list:
            return [" ".join(sublst) for sublst in lst]
        else:
            raise TypeError("unknown data type")
    
    ranking = []
    q = preprocess(df[field_q].tolist())
    e = preprocess(df_exp[field_e].tolist())
    
    vectorizer = feature_extraction.text.TfidfVectorizer(
        stop_words="english", binary=True)
    vectorizer.fit(q + e)
    X_q = vectorizer.transform(q)
    X_e = vectorizer.transform(e)
    X_dist = metrics.pairwise.cosine_distances(X_q, X_e)
    
    for i_question, distances in enumerate(X_dist):
        ranking.append([])
        for i_explanation in np.argsort(distances):
            ranking[-1].append(i_explanation)

    return ranking

def add_missing_idxs(old):
    set_old = set(old)
    set_all = set(np.arange(0, len(df_exp)))
    missing = list(set_all - set_old)
    np.random.shuffle(missing)
    new = list(old)
    new.extend(missing)
    assert len(new) == len(df_exp), (len(new), len(df_exp), len(missing))
    assert all([a == b  for a, b in zip(new[:len(old)], old)])
    assert set(new) == set_all
    return new

from tqdm import tqdm

def format_predict_line(questionID, explanation_uid):
    # Adapted from tfidf baseline script
    return "{}\t{}".format(questionID, explanation_uid)

def get_preds(df, df_exp):
    preds = []
    tfidf_ranking = get_tfidf_ranking(df)
    tfidf_ranking = get_tfidf_ranking(df, "lemmas", "lemmas")
    
    for i in tqdm(range(len(df))):
        
#         # Repeating nn algo (23.7)
#         nearest = df.nn_exp.iloc[i]
#         scores = repeat(nearest)
#         ranked_idxs = sorted(list(scores.keys()), key=lambda idx: scores[idx], reverse=True)
        
#         # Recursive nn algo (0.07)
#         nearest = df.nn_exp.iloc[i]
#         scores = {}
#         recurse(seed=nearest, scores=scores, iter=0)
#         ranked_idxs = sorted(list(scores.keys()), key=lambda idx: scores[idx], reverse=True)
        
#         # Simple nn algo (0.24)
#         ranked_idxs = simple_nn_ranking(df, i)
        
#         # Predict concat answer embedding algo (0.17)
#         ranked_idxs = pred_emb_nn_ranking(df, i)

        # Tfidf
        ranked_idxs = tfidf_ranking[i]

        if len(ranked_idxs) < len(df_exp):
            ranked_idxs = add_missing_idxs(ranked_idxs)
        
        questionID = df.questionID.iloc[i]
        for idx in ranked_idxs:
            preds.append(format_predict_line(questionID, df_exp.uid.iloc[idx]))
    return preds
        
def test_get_preds():
    preds = get_preds(df_dev[:10], df_exp)
    print("Num preds:", len(preds))
    print("Prediction lines:", preds)
    
test_get_preds()

def write_preds(preds, path="predict.txt"):
    with open(path, "w") as f:
        f.write("\n".join(preds))
        
def test_write_preds():
    preds = ['VASoL_2008_3_26\t14de-6699-6b2e-a5d1', 'VASoL_2008_3_26\t14de-6699-6b2e-a5d1']
    path = "temp.txt"
    write_preds(preds, path)
    with open(path) as f:
        for line in f:
            print(repr(line))
            
test_write_preds()

write_preds(get_preds(df_trn, df_exp))  # for dev set
# write_preds(get_preds(df_dev, df_exp))  # for dev set
# write_preds(get_preds(df_dev, df_exp))  # for test set
!cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Train.tsv /content/predict.txt
# !cd {path_data} && python ../evaluate.py --gold=questions/ARC-Elementary+EXPL-Dev.tsv /content/predict.txt

100%|██████████| 10/10 [00:00<00:00, 10.47it/s]


Num preds: 49470
Prediction lines: ['VASoL_2008_3_26\t4bc1-99e4-3bcc-aac0', 'VASoL_2008_3_26\t9349-71b6-e9b8-fd21', 'VASoL_2008_3_26\t472e-182d-7263-16c0', 'VASoL_2008_3_26\td723-5450-7a62-d57c', 'VASoL_2008_3_26\td8cd-c383-3563-057f', 'VASoL_2008_3_26\t98b6-2bc9-0f82-84f9', 'VASoL_2008_3_26\t47ce-3a56-5343-bea8', 'VASoL_2008_3_26\te973-4320-00d7-da10', 'VASoL_2008_3_26\t762f-5f24-90ad-ff00', 'VASoL_2008_3_26\t7854-76a6-6501-4b8f', 'VASoL_2008_3_26\t62ff-bf8f-6b81-d3f1', 'VASoL_2008_3_26\t8fd4-1c07-b659-3ae7', 'VASoL_2008_3_26\t67f8-8d07-5f59-bdd7', 'VASoL_2008_3_26\t2a84-8168-a68d-adac', 'VASoL_2008_3_26\t8d77-fa93-75df-6ae0', 'VASoL_2008_3_26\tf29c-3e66-5c51-98e8', 'VASoL_2008_3_26\t2ead-14dd-aede-6d07', 'VASoL_2008_3_26\t9554-47d7-c095-1df7', 'VASoL_2008_3_26\t284d-e19c-6811-6a99', 'VASoL_2008_3_26\te00a-03fe-d978-1a27', 'VASoL_2008_3_26\t587b-cf2b-5466-f25a', 'VASoL_2008_3_26\tf253-a9f3-3688-96a7', 'VASoL_2008_3_26\tb32c-707d-1991-3722', 'VASoL_2008_3_26\tae74-32dc-4291-fb82', 'VAS

100%|██████████| 1190/1190 [01:52<00:00, 10.61it/s]


mercury_sc_405198 0.07499174876476962
mercury_sc_405298 0.32576440095942727
vasol_2007_3_6 0.11245022901906565
mercury_sc_416525 0.019395206794441296
vasol_2010_3_38 0.17010490903714073
mercury_sc_405207 0.5843621237927058
mercury_sc_415703 0.11033004214685628
vasol_2009_3_3 0.6911300505050505
vasol_2009_3_13 0.14945551990209122
mercury_sc_415014 0.39653331268711317
vasol_2008_3_14 0.9166666666666666
mercury_sc_407137 0.012818385627681167
mercury_sc_406702 0.2976799956556588
mercury_sc_416518 0.05562770562770563
vasol_2008_3_24 0.6504135229849516
mercury_sc_413304 1.0
mercury_sc_417556 0.17043723894306
mercury_sc_413141 0.6143790849673202
vasol_2009_3_36 0.14138146010249814
mercury_sc_415349 0.2833771752703961
vasol_2009_3_10 0.035469099881921685
vasol_2007_3_34 0.7777777777777778
vasol_2010_3_17 0.6984126984126983
mercury_sc_413078 0.5874358974358974
vasol_2007_3_16 0.3748663101604278
mercury_sc_413531 0.10325695460235192
vasol_2007_3_5 0.45
mercury_sc_416531 0.560752688172043
mercury