# GET BERT DATA

In [2]:
import pandas as pd
import numpy as np
import pickle

## CONTENT

1. Create "sense sentences" for each available sense. The sentence should correctly make use of the sense.
2. Create training samples, where each data sample generates two training samples. 1) sample where a sample is paired with a correct sense sentence, 2) where a sample is paired with an incorrect sense sentence.
3. Save the data.

### In another file
4. Obtain a pre-trained BERT.
5. Fine-tune BERT on the new data.

## 1. Create sense sentences

In [3]:
filename = "/Users/lovhag/Projects/dl4nlp_assignment_1/a1_data/wsd_train.txt"
data = pd.read_table(filename,header=None,names=['sense_key', 'lemma', 'word_position', 'text'])
data.head()

Unnamed: 0,sense_key,lemma,word_position,text
0,keep%2:42:07::,keep.v,15,Action by the Committee In pursuance of its ma...
1,national%3:01:00::,national.a,25,A guard of honour stood in formation in honour...
2,build%2:31:03::,build.v,38,The principle that statistics should be timely...
3,place%1:04:00::,place.n,36,"Again , he appealed for additional support for..."
4,position%1:04:01::,position.n,76,"Also , the IAEA has the lowest number of women..."


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76049 entries, 0 to 76048
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   sense_key      76049 non-null  object
 1   lemma          76049 non-null  object
 2   word_position  76049 non-null  int64 
 3   text           76049 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
lemma_list = data.lemma.unique()
sense_dict = {lemma: list(data[data.lemma==lemma].sense_key.unique()) for lemma in lemma_list}

In [6]:
total_nbr_of_senses = sum([len(sense_dict[key]) for key in sense_dict])
print(f"Total number of available senses: {total_nbr_of_senses}")

Total number of available senses: 222


In [7]:
sentence_per_sense = {sense: data[data.sense_key == sense].text.iloc[0] for sense in list(data.sense_key.unique())}
sentence_per_lemma_sense = {lemma: {sense: data[data.sense_key == sense].text.iloc[0] for sense in list(data[data.lemma==lemma].sense_key.unique())} for lemma in list(data.lemma.unique())}

In [8]:
sentence_per_sense["keep%2:42:07::"]

'Action by the Committee In pursuance of its mandate , the Committee will continue to keep under review the situation relating to the question of Palestine and participate in relevant meetings of the General Assembly and the Security Council . The Committee will also continue to monitor the situation on the ground and draw the attention of the international community to urgent developments in the Occupied Palestinian Territory , including East Jerusalem , requiring international action .'

## 2. Split into training and testing data

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_indices, test_indices = train_test_split(range(len(data)), test_size=.2, random_state=42)
train_data = data.iloc[train_indices].copy()
test_data = data.iloc[test_indices].copy()

## 3. Create training samples

In [11]:
def create_sentence_pair_data(data):
    X_data_1 = [] # pairs!
    X_data_2 = []
    y_data = []
    def add_data_entry(row, sense_key, label):
        two_sentences = []
        X_data_1.append(row.text)
        X_data_2.append(sentence_per_lemma_sense[row.lemma][sense_key])
        y_data.append(label)

    for index, row in data.iterrows():
        add_data_entry(row, row.sense_key, 1)

        faulty_senses = list(sentence_per_lemma_sense[row.lemma].keys())
        faulty_senses.remove(row.sense_key)
        faulty_sense_key = np.random.choice(faulty_senses)
        add_data_entry(row, faulty_sense_key, 0)
    return X_data_1, X_data_2, y_data

In [12]:
X_data_1, X_data_2, y_data = create_sentence_pair_data(train_data)

In [13]:
X_data_1[0]

'( vii ) BGL - General Trust Fund for the Core Programme Budget for the Biosafety Protocol , which is extended through 31 December 2011 ; ( viii ) BHL - Special Voluntary Trust Fund for Additional Voluntary Contributions in Support of Approved Activities of the Biosafety Protocol , which is extended through 31 December 2011 ; ( ix ) BTL - General Trust Fund for the Conservation of European Bats ( EUROBATS ) , which is extended through 31 December 2014 ;'

In [14]:
print(f"Number of data samples for training: {len(X_data_1)}")

Number of data samples for training: 121678


In [15]:
def save_data_with_pickle(data_dict):
    pre_filename = input(f"Specify which prefix filename you wish to save {list(data_dict.keys())} to: ")
    if pre_filename:
        for key, value in data_dict.items():
            filename = pre_filename+"_"+key+".pickle"
            with open(filename, "wb") as fp:   #Pickling
                pickle.dump(value, fp)

In [17]:
save_data_with_pickle({"X_data_1_train": X_data_1, "X_data_2_train": X_data_2, "y_data_train": y_data})

## 4. Create testing samples

In [18]:
X_data_1, X_data_2, y_data = create_sentence_pair_data(test_data)
print(f"Number of data samples for testing: {len(X_data_1)}")

Number of data samples for testing: 30420


In [19]:
save_data_with_pickle({"X_data_1_test": X_data_1, "X_data_2_test": X_data_2, "y_data_test": y_data})

### Create samples for evaluation

In [20]:
def create_evaluation_sentence_pair_data(data):
    evaluation_data = {}
    def add_data_entry(row, sense_key):
        two_sentences = []
        X_data_1.append(row.text)
        X_data_2.append(sentence_per_lemma_sense[row.lemma][sense_key])

    for index, row in data.iterrows():
        X_data_1 = [] # pairs!
        X_data_2 = []
        y_data = [0]

        add_data_entry(row, row.sense_key)

        faulty_senses = list(sentence_per_lemma_sense[row.lemma].keys())
        faulty_senses.remove(row.sense_key)
        for faulty_sense_key in faulty_senses:
            add_data_entry(row, faulty_sense_key)
        
        if row.lemma in evaluation_data:
            evaluation_data[row.lemma]["X_data_1"].append(X_data_1)
            evaluation_data[row.lemma]["X_data_2"].append(X_data_2)
            evaluation_data[row.lemma]["y_data"].append(y_data)
        else:
            evaluation_data[row.lemma] = {}
            evaluation_data[row.lemma]["X_data_1"] = X_data_1
            evaluation_data[row.lemma]["X_data_2"] = X_data_2
            evaluation_data[row.lemma]["y_data"] = y_data

    return evaluation_data

In [24]:
evaluation_data = create_evaluation_sentence_pair_data(test_data)
print(f"Lemmas to evaluate for:")
print(evaluation_data.keys())
print("")
nbr_of_evaluation_samples_per_lemma = {lemma: len(evaluation_data[lemma]["X_data_1"]) for lemma in evaluation_data.keys()}
print(f"Evaluation samples per lemma:")
print(nbr_of_evaluation_samples_per_lemma)

Lemmas to evaluate for:
dict_keys(['lead.v', 'extend.v', 'regular.a', 'see.v', 'position.n', 'find.v', 'force.n', 'hold.v', 'build.v', 'serve.v', 'keep.v', 'bad.a', 'national.a', 'point.n', 'order.n', 'time.n', 'physical.a', 'professional.a', 'place.n', 'case.n', 'line.n', 'security.n', 'follow.v', 'common.a', 'critical.a', 'positive.a', 'life.n', 'bring.v', 'major.a', 'active.a'])

Evaluation samples per lemma:
{'lead.v': 524, 'extend.v': 528, 'regular.a': 392, 'see.v': 1337, 'position.n': 446, 'find.v': 462, 'force.n': 556, 'hold.v': 630, 'build.v': 525, 'serve.v': 610, 'keep.v': 1081, 'bad.a': 356, 'national.a': 451, 'point.n': 419, 'order.n': 403, 'time.n': 405, 'physical.a': 387, 'professional.a': 377, 'place.n': 382, 'case.n': 469, 'line.n': 1129, 'security.n': 433, 'follow.v': 664, 'common.a': 358, 'critical.a': 310, 'positive.a': 260, 'life.n': 419, 'bring.v': 496, 'major.a': 311, 'active.a': 282}


In [25]:
save_data_with_pickle({"evaluation_data": evaluation_data})