# GET BERT DATA

In [2]:
import pandas as pd
import numpy as np
import pickle

ModuleNotFoundError: No module named 'pandas'

## CONTENT

1. Create "sense sentences" for each available sense. The sentence should correctly make use of the sense.
2. Create training samples, where each data sample generates two training samples: 1) the sample paired with a correct sense sentence, and 2) the same sample paired with an incorrect sense sentence.
3. Save the data.

### In another file
4. Obtain a pre-trained BERT.
5. Fine-tune BERT on the new data.

## 1. Create sense sentences

In [None]:
# Load the WSD training file. It has no header row, so the four column
# names are supplied explicitly.
filename = "/Users/lovhag/Projects/dl4nlp_assignment_1/a1_data/wsd_train.txt"
data = pd.read_table(
    filename,
    header=None,
    names=["sense_key", "lemma", "word_position", "text"],
)
data.head()

In [None]:
# Print a summary of the loaded DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Map every lemma to the list of distinct sense keys annotated for it.
lemma_list = data["lemma"].unique()
sense_dict = {
    lemma: list(data.loc[data["lemma"] == lemma, "sense_key"].unique())
    for lemma in lemma_list
}

In [None]:
# Add up the number of senses over all lemmas.
total_nbr_of_senses = sum(len(senses) for senses in sense_dict.values())
print(f"Total number of available senses: {total_nbr_of_senses}")

In [None]:
# The "sense sentence" of a sense key is the text of the first row in `data`
# annotated with that key.
sentence_per_sense = {
    sense: data.loc[data["sense_key"] == sense, "text"].iloc[0]
    for sense in data["sense_key"].unique()
}

# Same sense sentences, grouped per lemma: lemma -> {sense_key: sentence}.
sentence_per_lemma_sense = {
    lemma: {
        sense: data.loc[data["sense_key"] == sense, "text"].iloc[0]
        for sense in data.loc[data["lemma"] == lemma, "sense_key"].unique()
    }
    for lemma in data["lemma"].unique()
}

In [None]:
# Spot check: show the sense sentence stored for one sense key.
sentence_per_sense["keep%2:42:07::"]

## 2. Split into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the row positions 80/20; the fixed random_state makes the split
# repeatable between runs.
train_indices, test_indices = train_test_split(
    range(len(data)),
    test_size=0.2,
    random_state=42,
)
# Copy the selected rows so the two subsets are independent of `data`.
train_data, test_data = (
    data.iloc[train_indices].copy(),
    data.iloc[test_indices].copy(),
)

## 3. Create training samples

In [None]:
def create_sentence_pair_data(data, sense_sentences=None):
    """Build (context sentence, sense sentence, label) pairs from ``data``.

    Every row yields one positive pair: the row's text together with the
    sense sentence of its annotated sense, labelled 1. If the row's lemma has
    at least one other sense, the row also yields one negative pair: the same
    text together with the sense sentence of a randomly drawn *different*
    sense of that lemma, labelled 0. Rows whose lemma has a single sense only
    produce the positive pair (previously this case raised ``ValueError``
    inside ``np.random.choice``).

    Args:
        data: DataFrame whose rows expose ``text``, ``lemma`` and
            ``sense_key``.
        sense_sentences: Mapping ``lemma -> {sense_key: sentence}``. When
            omitted, the module-level ``sentence_per_lemma_sense`` is used.

    Returns:
        A tuple ``(X_data_1, X_data_2, y_data)`` of three parallel lists:
        context sentences, sense sentences and 0/1 labels.

    Raises:
        KeyError: If a row's lemma or sense key is missing from the mapping.
    """
    if sense_sentences is None:
        sense_sentences = sentence_per_lemma_sense

    X_data_1 = []  # context sentences
    X_data_2 = []  # sense sentences paired with them
    y_data = []  # 1 = matching sense, 0 = non-matching sense

    def add_data_entry(row, sense_key, label):
        X_data_1.append(row.text)
        X_data_2.append(sense_sentences[row.lemma][sense_key])
        y_data.append(label)

    for _, row in data.iterrows():
        add_data_entry(row, row.sense_key, 1)

        faulty_senses = [
            sense for sense in sense_sentences[row.lemma]
            if sense != row.sense_key
        ]
        if not faulty_senses:
            # np.random.choice raises ValueError on an empty sequence, so a
            # lemma with a single sense cannot provide a negative example.
            continue
        # NOTE: the draw is unseeded, so negatives differ between runs unless
        # the caller seeds NumPy's global random state first.
        faulty_sense_key = np.random.choice(faulty_senses)
        add_data_entry(row, faulty_sense_key, 0)
    return X_data_1, X_data_2, y_data

In [None]:
# Build the sentence pairs and labels from the training split.
X_data_1, X_data_2, y_data = create_sentence_pair_data(train_data)

In [None]:
# Show the first context sentence of the generated pairs.
X_data_1[0]

In [None]:
# Report how many sentence pairs were generated for training.
print(f"Number of data samples for training: {len(X_data_1)}")

In [None]:
def save_data_with_pickle(data_dict):
    """Interactively pickle every value of ``data_dict`` to its own file.

    The user is prompted for a filename prefix. Each entry is then written to
    ``<prefix>_<key>.pickle``. Entering an empty prefix skips saving entirely.

    Args:
        data_dict: Mapping from a name (used in the filename) to the object
            that should be pickled.
    """
    prompt = f"Specify which prefix filename you wish to save {list(data_dict.keys())} to: "
    pre_filename = input(prompt)
    if not pre_filename:
        # Nothing entered: treat it as "do not save".
        return

    for name in data_dict:
        target_path = "_".join((pre_filename, name)) + ".pickle"
        with open(target_path, "wb") as out_file:
            pickle.dump(data_dict[name], out_file)

In [None]:
# Save the three training lists, one pickle file per dictionary key.
save_data_with_pickle({"X_data_1_train": X_data_1, "X_data_2_train": X_data_2, "y_data_train": y_data})

## 4. Create testing samples

In [None]:
# Build the sentence pairs from the test split. This rebinds X_data_1,
# X_data_2 and y_data, so the training lists are no longer reachable
# under these names.
X_data_1, X_data_2, y_data = create_sentence_pair_data(test_data)
print(f"Number of data samples for testing: {len(X_data_1)}")

In [None]:
# Save the three test lists, one pickle file per dictionary key.
save_data_with_pickle({"X_data_1_test": X_data_1, "X_data_2_test": X_data_2, "y_data_test": y_data})

### Create samples for evaluation

In [None]:
def create_evaluation_sentence_pair_data(data, sense_sentences=None):
    """Build per-lemma evaluation samples covering every candidate sense.

    For each row, the row's text is paired with the sense sentence of *every*
    sense of its lemma. The annotated (correct) sense always comes first, so
    the index of the correct candidate is 0 for every sample.

    Each lemma's entry holds one item per row in all three lists. Previously
    the first row of a lemma was stored as a flat list and later rows were
    appended to it as nested lists, which mixed two shapes in one list.

    Args:
        data: DataFrame whose rows expose ``text``, ``lemma`` and
            ``sense_key``.
        sense_sentences: Mapping ``lemma -> {sense_key: sentence}``. When
            omitted, the module-level ``sentence_per_lemma_sense`` is used.

    Returns:
        A dict ``lemma -> {"X_data_1": [...], "X_data_2": [...],
        "y_data": [...]}`` where, for the i-th row of that lemma,
        ``X_data_1[i]`` is the row's text repeated once per candidate sense,
        ``X_data_2[i]`` is the list of candidate sense sentences (correct
        one first) and ``y_data[i]`` is ``[0]``.

    Raises:
        KeyError: If a row's lemma or sense key is missing from the mapping.
    """
    if sense_sentences is None:
        sense_sentences = sentence_per_lemma_sense

    evaluation_data = {}
    for _, row in data.iterrows():
        lemma_sentences = sense_sentences[row.lemma]
        # Correct sense first, then all remaining senses of the lemma.
        candidate_senses = [row.sense_key]
        candidate_senses.extend(
            sense for sense in lemma_sentences if sense != row.sense_key
        )

        X_data_1 = [row.text] * len(candidate_senses)
        X_data_2 = [lemma_sentences[sense] for sense in candidate_senses]
        y_data = [0]  # position of the correct sense among the candidates

        # Start every lemma with empty lists so that each row, including the
        # first one, is appended as its own nested sample.
        lemma_entry = evaluation_data.setdefault(
            row.lemma, {"X_data_1": [], "X_data_2": [], "y_data": []}
        )
        lemma_entry["X_data_1"].append(X_data_1)
        lemma_entry["X_data_2"].append(X_data_2)
        lemma_entry["y_data"].append(y_data)

    return evaluation_data

In [None]:
# Build the evaluation samples from the test split and summarise them.
evaluation_data = create_evaluation_sentence_pair_data(test_data)
print("Lemmas to evaluate for:")
print(evaluation_data.keys())
print()
# Length of each lemma's "X_data_1" list.
nbr_of_evaluation_samples_per_lemma = {
    lemma: len(lemma_entry["X_data_1"])
    for lemma, lemma_entry in evaluation_data.items()
}
print("Evaluation samples per lemma:")
print(nbr_of_evaluation_samples_per_lemma)

In [None]:
# Save the whole per-lemma evaluation dictionary as a single pickle file.
save_data_with_pickle({"evaluation_data": evaluation_data})