In [None]:
import sys
sys.path.append(sys.path[0] + "/..")

In [None]:
import json
import os
import properties
import pymysql
import random
import utils

import pandas as pd 

random.seed(10)

In [229]:
# Release DB handles left over from an earlier session before reconnecting.
# On a fresh kernel (Restart & Run All) neither `cur` nor `conn` exists yet, so
# a missing handle is skipped instead of aborting the run with a NameError.
# The cursor is closed before the connection, as before.
for _stale_name in ("cur", "conn"):
    _stale_handle = globals().get(_stale_name)
    if _stale_handle is not None:
        _stale_handle.close()


### FEVER data preparation

In [230]:
# FEVER Wikipedia DB

# Connection settings come from MYSQL_*-prefixed environment variables. The
# un-prefixed names are avoided because shells commonly export USER (the OS
# login name) and sometimes HOST/PORT, which would silently replace the
# intended MySQL settings once the values are actually used.
mysql_host = os.getenv("MYSQL_HOST", "localhost")
mysql_port = int(os.getenv("MYSQL_PORT", "3306"))
mysql_user = os.getenv("MYSQL_USER", "root")
mysql_db = os.getenv("MYSQL_DB", "fever")

# The password is deliberately not hardcoded: a notebook is shared together
# with its source, so a literal secret leaks to every reader.
mysql_pass = os.getenv("MYSQL_PASSWORD")
if mysql_pass is None:
    raise RuntimeError(
        "Set the MYSQL_PASSWORD environment variable before connecting to the FEVER database"
    )

# Connect with the settings read above instead of repeating literals, so the
# environment overrides actually take effect.
conn = pymysql.connect(
    host=mysql_host,
    port=mysql_port,
    user=mysql_user,
    password=mysql_pass,
    db=mysql_db,
)
CUR = conn.cursor()

In [231]:
def _get_wiki_evidence(wiki_id: str, sent_idx: int) -> str:
    query = "SELECT * FROM fever.wiki_pages where id='{}'".format(wiki_id)
    CUR.execute(query)
    
    try:
        entry = next(CUR)
        texts = entry[1].split("\n")
        text = texts[sent_idx].split("\t")[1]
        return text
    except Exception as e:
        print("No entry in wiki database for id = {}".format(wiki_id))
    

In [151]:
def _load_wiki_db(wiki_db_path = "/Users/user/Downloads/wiki-pages/wiki-pages-jsons"):
    wiki_entries = {}
    for file in os.listdir(wiki_db_path):
        with open(os.path.join(wiki_db_path, file)) as f:
            entry = json.load(f)
            wiki_entries[entry['id']] = entry['lines']
            
    return wiki_entries

# wiki_db = _load_wiki_db()
# The wiki pages are read from a CSV file into a DataFrame here; the per-file
# JSON loader call above is left disabled.
wiki_db = pd.read_csv("/Users/user/Downloads/wiki-pages/wiki-pages.csv")

In [233]:
# Show the first rows of the wiki table to check which columns were loaded.
wiki_db.head()

Unnamed: 0.1,Unnamed: 0,id,text,lines
0,0,Snakebite_-LRB-album-RRB-,Snakebite is the first official release by the...,0\tSnakebite is the first official release by ...
1,1,Sin_Sukju,"Sin Suk-ju -LRB- Korean : 신숙주 , hanja : 申叔舟 ; ...","0\tSin Suk-ju -LRB- Korean : 신숙주 , hanja : 申叔舟..."
2,2,"South_Oroville,_California",South Oroville is a census-designated place -L...,0\tSouth Oroville is a census-designated place...
3,3,Southwest_Golf_Classic,The Southwest Golf Classic was a PGA Tour even...,0\tThe Southwest Golf Classic was a PGA Tour e...
4,4,"St._Philip's_Cathedral,_San_Felipe",The St. Philip 's Cathedral -LRB- Catedral de ...,0\tThe St. Philip 's Cathedral -LRB- Catedral ...


In [287]:
# load FEVER data
# NOTE(review): the variables are named "train", but the path points at
# paper_dev.jsonl — confirm which split is intended.
fever_train_path = "/Users/user/Library/CloudStorage/OneDrive-King'sCollegeLondon/PycharmProjects/fc-evidence-evaluation/data/fever/paper_dev.jsonl"
train = utils.load_jsonl_file(fever_train_path)

In [288]:
# keep only the entries that come with more than one evidence set
train_multi_evidence = list(
    filter(lambda sample: len(sample['evidence']) > 1, train)
)

In [289]:
# Report how many entries were loaded and how many passed the multi-evidence filter.
print("Initial entries: {}".format(len(train)))
print("Entries with more than one evidence sets: {}".format(len(train_multi_evidence)))


Initial entries: 9999
Entries with more than one evidence sets: 1832


In [290]:
# Template for one training sample. It is duplicated with .copy() before being
# filled in; a shallow copy suffices because every value is immutable.
# "score" defaults to 1 and is overwritten with 0 for negative samples.
POS_SAMPLE = {
    "claim": "",
    "reference": "",
    "target": "",
    "score": 1,
    "label": None
}
# Layout of one evidence sentence: the first placeholder receives the wiki page
# id, the second the sentence text fetched for it.
_EVIDENCE_FORMAT = "Title: {}; {}"

def _get_wiki_evidence(wiki_id: str, sentence_idx: int): 
    entry = wiki_db[wiki_db.id==wiki_id]
    sentences = list(entry.to_dict()["lines"].values())[0]
    sentence = sentences.split("\n")[sentence_idx]
    return sentence.split("\t")[1]
    

def _get_evidence_text(evidence_set: list):
    evidence_text = []
    for evidence in evidence_set:
        sentence = _get_wiki_evidence(evidence[2], evidence[3])
        evidence_text.append(_EVIDENCE_FORMAT.format(evidence[2], sentence))
        
    return " ".join(evidence_text)

def _format_claim_evidence(claim, evidence_set):
    return claim + " " + _get_evidence_text(evidence_set)


In [291]:
# prepare training set of POSITIVE samples (similarity score == 1)
bleurt_training_data = []
# number of entries whose evidence could not be fully resolved
skipped_entries = 0

for entry in train_multi_evidence:
    try:
        evidence_sets = entry["evidence"]
        # Pair every evidence set with its successor. The formatted text of a
        # set is carried over as the next pair's reference, so each set is
        # looked up once instead of twice.
        reference = _format_claim_evidence(entry['claim'], evidence_sets[0])
        for next_set in evidence_sets[1:]:
            target = _format_claim_evidence(entry['claim'], next_set)
            # identical texts carry no training signal, so they are left out
            if target != reference:
                new_entry = POS_SAMPLE.copy()
                new_entry['claim'] = entry['claim']
                new_entry['reference'] = reference
                new_entry['target'] = target
                new_entry['label'] = entry['label']
                bleurt_training_data.append(new_entry)
            reference = target
    except Exception:
        # Best effort: an entry whose evidence cannot be resolved is abandoned
        # at the failing set (pairs built before that point are kept), but the
        # failure is counted so it no longer goes unnoticed.
        skipped_entries += 1
        continue

print("For {} multi evidence entries, {} bleurt training samples created".format(len(train_multi_evidence), 
                                                                                 len(bleurt_training_data)))
print("{} entries were cut short because evidence lookup failed".format(skipped_entries))

For 1832 multi evidence entries, 2527 bleurt training samples created


In [292]:
# prepare training set of NEGATIVE samples (similarity score == 0)
bleurt_training_data_neg_samples = []

# The rejection sampling below needs a second claim to land on; with a single
# distinct claim the loop would never terminate.
if bleurt_training_data and len({sample['claim'] for sample in bleurt_training_data}) < 2:
    raise ValueError("Need samples for at least two different claims to build negative samples")

for entry in bleurt_training_data:
    claim = entry['claim']
    reference = entry['reference']

    while True:
        # randomly select target from another entry which is for a different claim
        rand_entry = random.sample(bleurt_training_data, 1)[0]
        if rand_entry['claim'] != claim:
            break

    # Every reference is built as claim + " " + evidence text, so slicing off
    # exactly that prefix isolates the foreign evidence. Splitting on ". "
    # instead would drop sentence-ending periods from the evidence and would
    # mis-cut claims that contain ". " themselves.
    foreign_evidence = rand_entry["reference"][len(rand_entry["claim"]) + 1:]

    # 'label' is not set here and therefore keeps the template's None.
    new_entry = POS_SAMPLE.copy()
    new_entry['claim'] = claim
    new_entry['reference'] = reference
    new_entry['target'] = claim + " " + foreign_evidence
    new_entry['score'] = 0
    bleurt_training_data_neg_samples.append(new_entry)

print("{} negative bleurt training samples created".format(len(bleurt_training_data_neg_samples)))

2527 negative bleurt training samples created


In [293]:
# Extend bleurt training data with negative samples
# NOTE(review): extend() mutates the list in place, so running this cell a
# second time appends the negative samples again.
bleurt_training_data.extend(bleurt_training_data_neg_samples)
print(len(bleurt_training_data))

5054


In [294]:
# save bleurt training data as jsonl file
def to_dict(obj):
    """Convert ``obj`` into plain JSON-compatible data via a JSON round trip.

    Values the JSON encoder cannot handle natively are replaced by their
    ``__dict__`` before encoding.
    """
    def _as_attributes(value):
        return value.__dict__

    encoded = json.dumps(obj, default=_as_attributes)
    return json.loads(encoded)

def save_jsonl_file(data, file_path):
    """Write ``data`` to ``file_path`` as JSON Lines.

    Each entry is passed through ``to_dict`` and written as one JSON document
    followed by a newline. The file is opened for writing as UTF-8, replacing
    any previous content.
    """
    with open(file_path, "w", encoding="utf-8") as out_file:
        for record in data:
            out_file.write(json.dumps(to_dict(record)) + "\n")

# Directory into which the generated sample file is written.
training_data_path = "/Users/user/Library/CloudStorage/OneDrive-King'sCollegeLondon/PycharmProjects/fc-evidence-evaluation/data/reference_scorer_training_data"

# Write the collected samples as one JSON Lines file inside that directory.
save_jsonl_file(bleurt_training_data, os.path.join(training_data_path, "fever_dev_based.jsonl"))


### Merge all files for BLEURT finetuning 

In [295]:
# load all training data and merge
path = "/Users/user/Library/CloudStorage/OneDrive-King'sCollegeLondon/PycharmProjects/fc-evidence-evaluation/data/reference_scorer_training_data"
output_filename = "bleurt_finetune_train.jsonl"
data = []

# os.listdir() returns names in arbitrary order; sorting keeps the merged file
# reproducible between runs and machines.
for file in sorted(os.listdir(path)):
    # The merged output lives in the same directory and its own name contains
    # 'train'; skip it so a re-run does not merge the previous result into itself.
    if file == output_filename:
        continue
    if 'train' in file:
        data.extend(utils.load_jsonl_file(os.path.join(path, file)))

save_jsonl_file(data, os.path.join(path, output_filename))

In [301]:
# load all test data and merge 
def _merge_files(split: str):
    # load all training data and merge 
    path = "/Users/user/Library/CloudStorage/OneDrive-King'sCollegeLondon/PycharmProjects/fc-evidence-evaluation/data/reference_scorer_training_data"
    output_filename = "bleurt_finetune_{}.jsonl".format(split)
    data = []

    for file in os.listdir(path):
        if "fever_{}".format(split) in file:
            data.extend(utils.load_jsonl_file(os.path.join(path, file)))

    save_jsonl_file(data, os.path.join(path, output_filename))

In [303]:
# Only the dev split is merged here; the train and test calls are left disabled.
# _merge_files("train")
# _merge_files("test")
_merge_files("dev")