# Dialogue and Narrative Coursework - Subtask 1 - Cosine Similarity

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
import ast
new_val_data = pd.read_csv('doc2dial_rc_val.csv')

[nltk_data] Downloading package punkt to /home/matt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
validate_full = True # set False to only validate on teh same contexts as training data

In [3]:
def train_predict_doc2vec(dataset, index):
  instance = dataset.iloc[index]
  docs = ast.literal_eval(instance['spans'])

  tokenized_docs = {}
  for i, d in docs.items():
      tokenized_docs[int(i)] = word_tokenize(d.lower())

  tagged_data = []
  for i, d in tokenized_docs.items():
    tagged_data.append(TaggedDocument(d, [i]))

  model = Doc2Vec(tagged_data, vector_size=80, window=5, min_count=1, workers=-1, epochs = 100)

  test_doc = instance['question'][5:]

  predictions = model.docvecs.most_similar(positive=[model.infer_vector(test_doc)], topn=30)
  # print(predictions)

  ids = []
  count = 0
  sum_score = 0
  pred = 0
  tol_zone = 5
  while True:

    id, score = predictions[pred]
    ids.append(id)
    count += 1
    sum_score += score

    pred += 1
    next_id, next_score = predictions[pred]

    possible_next_ids = list(range(id-tol_zone, id+tol_zone+1))
    # print(possible_next_ids)
    if next_id not in possible_next_ids:
      break

  sorted_ids = sorted(ids)
  text = ''
  for i in sorted_ids:
    text += docs.get(str(i), '')

  final_score = sum_score / count
  return {"id":instance["id"], "prediction_text":text, "no_answer_probability":1.0-final_score}

In [4]:
# get inds of contexts not in the training set
from word_counts import word_counter
import utils
def get_context_only():
    train_data1 = utils.load_own_rc_data(split="train")
    val_data1 = utils.load_own_rc_data(split="validation")

    counter = word_counter(train_data1)
    skip_inds = counter.no_context_availble(val_data1)

    good_data_inds = list(range(len(val_data1)))
    for ind in skip_inds:
        good_data_inds.remove(ind)
    return good_data_inds
good_data_inds = get_context_only()
    
# decide what data to validate
if validate_full:
    inds_to_val = range(len(new_val_data[:]))# all data use good_data_inds for context same as training
else:
    inds_to_val = good_data_inds

In [5]:
from tqdm import tqdm

preds = []
# new_val_data_slice = new_val_data[:100]
new_val_data_slice = new_val_data[:]

for i in tqdm(inds_to_val):
  preds.append(train_predict_doc2vec(new_val_data_slice, i))

preds

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3972/3972 [04:45<00:00, 13.89it/s]


[{'id': 'dea7174409afbfe0af0ace21e7f318ae_1',
  'prediction_text': 'Also , Because we all pay indirectly for crashes involving uninsured motorists , ',
  'no_answer_probability': 0.8130782395601273},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_3',
  'prediction_text': 'you will miss a suspension order and may be charged with operating an unregistered vehicle and/or aggravated unlicensed operation, both misdemeanors. ',
  'no_answer_probability': 0.7683570832014084},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_5',
  'prediction_text': 'About ten percent of customers visiting a DMV office do not bring what they need to complete their transaction, and have to come back a second time to finish their business. ',
  'no_answer_probability': 0.777058482170105},
 {'id': 'dea7174409afbfe0af0ace21e7f318ae_7',
  'prediction_text': 'A license suspension or revocation here could mean that your new home state will not issue you a license there. ',
  'no_answer_probability': 0.690319836139679},
 {'id': '

In [6]:
import json

file = 'predictions_subtask1_cosine_simple.json'
with open(file, 'w') as outfile:
    json.dump(preds, outfile)
    
    
import os
cmd = 'python sharedtask_utils.py --task subtask1 --prediction_json '+file
os.system(cmd)

Reusing dataset doc2dial (/home/matt/.cache/huggingface/datasets/doc2dial/doc2dial_rc/1.0.1/cf6d3ed4e77cea477387dd51c171a021a09bd314cf3a2cb2a6431ca738c6c0ee)
2022-01-10 12:15:15.214499: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-10 12:15:15.214519: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


predictions_subtask1_cosine_simple.json
{'exact': 0.8811681772406847, 'f1': 12.45214629361504, 'total': 3972, 'HasAns_exact': 0.8811681772406847, 'HasAns_f1': 12.45214629361504, 'HasAns_total': 3972, 'best_exact': 0.8811681772406847, 'best_exact_thresh': 0.8607330322265625, 'best_f1': 12.457181540342162, 'best_f1_thresh': 1.0079312324523926}


0

In [7]:
# good: {'exact': 0.9841443411700382, 'f1': 12.194171863834752, 'total': 1829, 'HasAns_exact': 0.9841443411700382, 'HasAns_f1': 12.194171863834752, 'HasAns_total': 1829, 'best_exact': 0.9841443411700382, 'best_exact_thresh': 0.9146314263343811, 'best_f1': 12.194171863834761, 'best_f1_thresh': 0.922089159488678}
# all:  {'exact': 1.2588116817724069, 'f1': 12.926781781634116, 'total': 3972, 'HasAns_exact': 1.2588116817724069, 'HasAns_f1': 12.926781781634116, 'HasAns_total': 3972, 'best_exact': 1.2588116817724069, 'best_exact_thresh': 0.8560804128646851, 'best_f1': 12.92678178163413, 'best_f1_thresh': 0.9933457970619202}
# all2: {'exact': 0.9566968781470292, 'f1': 12.527961387030402, 'total': 3972, 'HasAns_exact': 0.9566968781470292, 'HasAns_f1': 12.527961387030402, 'HasAns_total': 3972, 'best_exact': 0.9566968781470292, 'best_exact_thresh': 0.8959553837776184, 'best_f1': 12.527961387030405, 'best_f1_thresh': 0.9773053526878357}