# BERT model

# STRUCTURE
1. Load the training data.
2. Adapt data to BERT
3. Obtain a pretrained BERT and fine-tune.
4. Evaluate BERT results.

## 1. Load the training data

In [5]:
import pickle
from sklearn.model_selection import train_test_split
import os

In [2]:
DIRECTORY_NAME = input("Specify the directory you wish to save ALL the data to: ")
os.makedirs(DIRECTORY_NAME)

FileExistsError: [Errno 17] File exists: 'BERT_data_1'

In [6]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

In [7]:
X_data_1 = load_pickle_data("saved_data/BERT_X_data_1.pickle")
X_data_2 = load_pickle_data("saved_data/BERT_X_data_2.pickle")
y_data = load_pickle_data("saved_data/BERT_y_data.pickle")

In [8]:
print(f"Number of data samples: {len(y_data)}")

Number of data samples: 152098


In [9]:
X_data_1[1]

'Action by the Committee In pursuance of its mandate , the Committee will continue to keep under review the situation relating to the question of Palestine and participate in relevant meetings of the General Assembly and the Security Council . The Committee will also continue to monitor the situation on the ground and draw the attention of the international community to urgent developments in the Occupied Palestinian Territory , including East Jerusalem , requiring international action .'

In [10]:
X_data_2[1]

'The Committee notes the launch of the National Education Campaign by the Drug Enforcement Commission but remains concerned at the practice of substance abuse by children , the lack of statistics on this issue and the limited capacities of the specific institutions in the State party to treat drug-addicted children . The Committee recommends that the State party strengthen its efforts to prevent substance abuse by children , giving particular attention to vulnerable groups ; make additional efforts to monitor the incidence of substance abuse and to keep accurate statistics on the phenomenon ; and develop mechanisms and structures through which assistance , including health and rehabilitative assistance , can be provided to children who abuse substances . Street children'

In [11]:
train_X_data_1, val_X_data_1, train_X_data_2, val_X_data_2, train_y_data, val_y_data = train_test_split(X_data_1, X_data_2, y_data, test_size=.2)

In [12]:
len(train_X_data_1)

121678

## 2. Adapt data to BERT

In [13]:
import torch
from transformers import BertTokenizer

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_X_data_1, train_X_data_2, truncation=True, padding=True)
val_encodings = tokenizer(val_X_data_1, val_X_data_2, truncation=True, padding=True)

In [15]:
class SenseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SenseDataset(train_encodings, train_y_data)
val_dataset = SenseDataset(val_encodings, val_y_data)

In [16]:
def save_data_with_pickle(data_dict, folder_name=None):
    if not folder_name:
        folder_name = input(f"Specify which prefix filename you wish to save {list(data_dict.keys())} to: ")
    if folder_name:
        for key, value in data_dict.items():
            filename = folder_name+"/"+key+".pickle"
            with open(filename, "wb") as fp:   #Pickling
                pickle.dump(value, fp)

In [17]:
save_data_with_pickle({"train_encodings": train_encodings, "val_encodings": val_encodings}, DIRECTORY_NAME)

In [20]:
save_data_with_pickle({"train_y_data": train_y_data, "val_y_data": val_y_data}, DIRECTORY_NAME)

In [21]:
save_data_with_pickle({"train_dataset": train_dataset, "val_dataset": val_dataset}, DIRECTORY_NAME)

## 3. Obtain a pretrained BERT and fine-tune

In [26]:
def load_pickle_data(filename):
    with open(filename, "rb") as load_file:
        return pickle.load(load_file)

train_dataset = load_pickle_data(DIRECTORY_NAME+"/train_dataset.pickle")
val_dataset = load_pickle_data(DIRECTORY_NAME+"/val_dataset.pickle")

In [23]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_1',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_1',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_1',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_1',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_1')

In [None]:
from transformers import BertForNextSentencePrediction, Trainer, TrainingArguments

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True)

# keep the weights of the pre-trained encoder frozen and optimize only the weights of the head layers
for param in model.base_model.parameters():
    param.requires_grad = False

training_args = TrainingArguments(
    output_dir=DIRECTORY_NAME+'/BERT_results_2',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=DIRECTORY_NAME+'/BERT_logs_2',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

trainer.save_model(DIRECTORY_NAME+'/BERT_model_2')

In [None]:
trainer.save_model(DIRECTORY_NAME)
tokenizer.save_pretrained(DIRECTORY_NAME)

## 4. Check the results