In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import sys
import os
import torch.utils.data
sys.path.append('../')
#import experiments.config as C
from transformers import *


import time
import json
import numpy as np
from torch import optim


import shutil
from sklearn.metrics import f1_score
import warnings
import copy

import random




In [2]:
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
print(device)

cuda:0


In [3]:
torch.manual_seed(3)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(7)
torch.backends.cudnn.enabled = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
warnings.filterwarnings('ignore')


random.seed(3)
np.random.seed(3)

In [4]:


class BaseAttention(nn.Module):
    def __init__(self, dimension):
        super(BaseAttention, self).__init__()

        self.u = nn.Linear(dimension, dimension)
        self.v = nn.Parameter(torch.rand(dimension), requires_grad=True)
        self.tanh = nn.Tanh()
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.epsilon = 1e-10

    def forward(self, h, mask):

        u_it = self.tanh(self.u(h))
        alpha = torch.exp(torch.matmul(u_it, self.v))
        alpha = mask * alpha + self.epsilon
        denominator_sum = torch.sum(alpha, dim=-1, keepdim=True)
        alpha = mask * (alpha / denominator_sum)

        output = h * alpha.unsqueeze(2)
        output = torch.sum(output, dim=1)

        return output, alpha
    
class BERTRA(nn.Module):

    def __init__(self):
        super(BERTRA, self).__init__()

        self.embedding_dim = 768

        self.bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True,
                                                                output_attentions=True)

        self.attention = BaseAttention(self.embedding_dim)

        self.sequential = nn.Sequential(
            nn.Linear(self.embedding_dim, 500),

            nn.BatchNorm1d(500),
            nn.Dropout(0.5),
            nn.Linear(500 , 100),

            nn.BatchNorm1d(100),
            nn.Dropout(0.5),
        )

        self.output = nn.Linear(100, 12) 


    def forward(self, sentences, mask):

        hidden, _ = self.bert(sentences)[-2:]
        sentences = hidden[-1]

        attention_applied, attention_weights = self.attention(sentences, mask.float())

        x = self.sequential(attention_applied)
        out = F.softmax(self.output(x), -1)


        return {
            'y_pred': out,
            'weights': attention_weights
        }





In [5]:
__all__ = ['TorchHelper']


class TorchHelper:
    checkpoint_history = []
    early_stop_monitor_vals = []
    best_score = 0
    best_epoch = 0

    def __init__(self):
        self.USE_GPU = torch.cuda.is_available()

    def show_progress(self, current_iter, total_iter, start_time, training_loss, additional_msg=''):
        bar_length = 50
        ratio = current_iter / total_iter
        progress_length = int(ratio * bar_length)
        percents = int(ratio * 100)
        bar = '[' + '=' * (progress_length - 1) + '>' + '-' * (bar_length - progress_length) + ']'

        current_time = time.time()
        # elapsed_time = time.gmtime(current_time - start_time).tm_sec
        elapsed_time = round(current_time - start_time, 0)
        estimated_time_needed = round((elapsed_time / current_iter) * (total_iter - current_iter), 0)

# sys.stdout.write
        print(
            'Iter {}/{}: {} {}%  Loss: {} ETA: {}s, Elapsed: {}s, TLI: {} {} '.format(current_iter, total_iter, bar,
                                                                                       percents,
                                                                                       round(training_loss, 4),
                                                                                       estimated_time_needed,
                                                                                       elapsed_time,
                                                                                       np.round(
                                                                                           elapsed_time / current_iter,
                                                                                           3), additional_msg), end = "\r")

        if current_iter < total_iter:
            sys.stdout.flush()
        else:
            sys.stdout.write('\n')

    def checkpoint_model(self, model_to_save, optimizer_to_save, path_to_save, current_score, epoch, mode='max'):
        """
        Checkpoints models state after each epoch.
        :param model_to_save:
        :param optimizer_to_save:
        :param path_to_save:
        :param current_score:
        :param epoch:
        :param n_epoch:
        :param mode:
        :return:
        """
        model_state = {'epoch'      : epoch + 1,
                       'model_state': model_to_save.state_dict(),
                       'score'      : current_score,
                       'optimizer'  : optimizer_to_save.state_dict()}

        # Save the model as a regular checkpoint
        torch.save(model_state, path_to_save + 'last.pth'.format(epoch))

        self.checkpoint_history.append(current_score)
        is_best = False

        # If the model is best so far according to the score, save as the best model state
        if ((np.max(self.checkpoint_history) == current_score and mode == 'max') or
                (np.min(self.checkpoint_history) == current_score and mode == 'min')):
            is_best = True
            self.best_score = current_score
            self.best_epoch = epoch
            # print('inside checkpoint', current_score, np.max(self.checkpoint_history))
            # torch.save(model_state, path_to_save + '{}_best.pth'.format(n_epoch))
            torch.save(model_state, path_to_save + 'best.pth')
            print('BEST saved')

        print('Current best', round(max(self.checkpoint_history), 4), 'after epoch {}'.format(self.best_epoch))

        return is_best


    def load_saved_model(self, model, path):
        """
        Load a saved model from dump
        :return:
        """
        # self.active_model.load_state_dict(self.best_model_path)['model_state']
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state'])

In [6]:
torch_helper = TorchHelper()
start_epoch = 0
batch_size = 4
max_epochs = 30
learning_rate = 0.00001
optimizer_type = 'adam'
l2_regularize = True
l2_lambda = 0.01


alphabet_path = "alphabet.json"

# Creates the directory where the results, logs, and models will be dumped.

run_name = 'bert_base_attn_adam_lr1e5'

description = ''


#output_dir_path = '../results/' + run_name + '/'
#if not os.path.exists(output_dir_path):
#    os.mkdir(output_dir_path)

run_mode = 'train'

In [7]:

# ----------------------------------------------------------------------------
# Load Data
# ----------------------------------------------------------------------------

features_train = json.load(open('../train.json'))
features_dev = json.load(open('../test.json'))

train_set = [val for key,val in features_train.items()]
print('Train Loaded')

validation_set = [val for key,val in features_dev.items()]
print('Validation Loaded')

# train_set = train_set[:100]
# validation_set = validation_set[:100]

print('Data Split: Train (%d), Dev (%d)' % (len(train_set), len(validation_set)))

Train Loaded
Validation Loaded
Data Split: Train (876), Dev (224)


In [8]:


# ----------------------------------------------------------------------------
# Functions
# ----------------------------------------------------------------------------

def create_model():
    """
    Creates and returns the model.
    Moves to GPU if found any.
    :return:
    """

    model = BERTRA()

    model.cuda()
    if run_mode == 'resume':
        torch_helper.load_saved_model(model, output_dir_path + 'best.pth')
        print('model loaded')
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    return model



def compute_l2_reg_val(model):
    if not l2_regularize:
        return 0.

    l2_reg = None

    for w in model.parameters():
        if l2_reg is None:
            l2_reg = w.norm(2)
        else:
            l2_reg = l2_reg + w.norm(2)

    return l2_lambda * l2_reg.item()

# ----------------------------------------------------------------------------
# Padding
# ----------------------------------------------------------------------------

def pad_features(docs_ints, seq_length=700):

    # getting the correct rows x cols shape
    features = np.zeros((len(docs_ints), seq_length), dtype=int)

    # for each review, I grab that review and
    for i, row in enumerate(docs_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    return features

def masking(docs_ints, seq_length=700):

    # getting the correct rows x cols shape
    masks = np.zeros((len(docs_ints), seq_length), dtype=int)

    # for each review, I grab that review and
    for i, row in enumerate(docs_ints):
        #mask[i, :len(row)] = 1
        masks[i, -len(row):] = 1

    return masks

In [9]:
# Training loop
# ----------------------------------------------------------------------------

def train(model, optimizer, shuffled_train_set):
    """
    Trains the model using the optimizer for a single epoch.
    :param model: pytorch model
    :param optimizer:
    :return:
    """

    start_time = time.time()

    model.train()

    batch_idx = 1
    total_loss = 0
    batch_x,batch_y, = [], []
    random.Random(1234).shuffle(shuffled_train_set)

    for i in range(len(shuffled_train_set)):

        batch_x.append(shuffled_train_set[i]['tokenized'])

        batch_y.append(shuffled_train_set[i]['y'])

        if len(batch_x) == batch_size or i == len(shuffled_train_set) - 1:

            optimizer.zero_grad()

            mask = masking(batch_x)
            padded = pad_features(batch_x)


            out = model(torch.tensor(padded,dtype=torch.long).cuda(), torch.tensor(mask,dtype=torch.long).cuda())

            y_pred = out['y_pred'].cpu()
            loss = F.cross_entropy(y_pred, torch.max(torch.Tensor(batch_y), 1)[1]) + compute_l2_reg_val(model)

            total_loss += loss.item()
            loss.backward()

            optimizer.step()

            torch_helper.show_progress(batch_idx , np.ceil(len(shuffled_train_set) / batch_size), start_time,
                                   round(total_loss / (i+1), 4))
            batch_idx += 1
            batch_x,  batch_y = [], []
            
    return model, shuffled_train_set

# ----------------------------------------------------------------------------
# Evaluate the model
# ----------------------------------------------------------------------------
def evaluate(model, dev_set):

    model.eval()

    total_loss = 0
    batch_x, batch_y = [], []
    y_true = []

    label_predictions = []

    with torch.no_grad():
        for i in range(len(dev_set)):

            batch_x.append(dev_set[i]['tokenized'])

            batch_y.append(dev_set[i]['y'])


            y_true.append(label2idx[dev_set[i]['label']])

            if len(batch_x) == batch_size or i == len(dev_set) - 1:
                mask = masking(batch_x)
                padded = pad_features(batch_x)



                # out = model(torch.tensor(padded).cuda(), batch_emoj.cuda(), torch.tensor(mask).cuda())
                out = model(torch.tensor(padded,dtype=torch.long).cuda(), torch.tensor(mask,dtype=torch.long).cuda())

                y_pred = out['y_pred'].cpu()

                label_predictions.extend(list(torch.argmax(y_pred, -1).numpy()))

                loss = F.cross_entropy(y_pred, torch.max(torch.Tensor(batch_y), 1)[1]) 

                total_loss += loss.item()

                batch_x, batch_y, = [], []

    weighted_f1 = f1_score(y_true, label_predictions, average='weighted')

    return label_predictions, \
           total_loss/len(dev_set), \
           weighted_f1

In [10]:


# ----------------------------------------------------------------------------




def training_loop():
    """
    :return:
    """
    model = create_model()


    if optimizer_type == 'adamW':
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)
    elif optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    shuffled_train_set = train_set

    for epoch in range(start_epoch, max_epochs):

        # for p in model.tm.parameters():
        #     p.requires_grad = False

        for p in model.bert.parameters():
            p.requires_grad = False


        print('[Epoch %d] / %d : %s' % (epoch + 1, max_epochs, run_name))

        model, shuffled_train_set = train(model, optimizer, shuffled_train_set)

        #print("Training Done!")

        val_label_pred,  val_loss, val_label_f1 = evaluate(model, validation_set)
        train_label_pred,  train_loss, train_label_f1 = evaluate(model, train_set)

        #print('Evaluation Done!')

        current_lr = 0
        for pg in optimizer.param_groups:
            current_lr = pg['lr']

        print('Training Loss %.5f, Validation Loss %.5f' % (train_loss, val_loss))
        print('Training Label weighted F1 %.5f, Validation Label weighted F1 %.5f' % (train_label_f1, val_label_f1))
        # print('Learning Rate', current_lr)


        is_best = torch_helper.checkpoint_model(model, optimizer, '..', val_label_f1, epoch + 1,
                                                'max')
    return model

In [11]:
label2idx = {'AskIndia': 0,
 'Non-Political': 1,
 '[R]eddiquette': 2,
 'Scheduled': 3,
 'Photography': 4,
 'Science/Technology': 5,
 'Politics': 6,
 'Business/Finance': 7,
 'Policy/Economy': 8,
 'Sports': 9,
 'Food': 10,
 'AMA': 11}

In [None]:
model = training_loop()

[Epoch 1] / 30 : bert_base_attn_adam_lr1e5
Training Loss 0.62012, Validation Loss 0.61971
Training Label weighted F1 0.09578, Validation Label weighted F1 0.07330
BEST saved
Current best 0.0733 after epoch 1
[Epoch 2] / 30 : bert_base_attn_adam_lr1e5
Training Loss 0.61916, Validation Loss 0.61900
Training Label weighted F1 0.09519, Validation Label weighted F1 0.07259
Current best 0.0733 after epoch 1
[Epoch 3] / 30 : bert_base_attn_adam_lr1e5
Training Loss 0.61877, Validation Loss 0.61882
Training Label weighted F1 0.11422, Validation Label weighted F1 0.12775
BEST saved
Current best 0.1278 after epoch 3
[Epoch 4] / 30 : bert_base_attn_adam_lr1e5
Training Loss 0.61809, Validation Loss 0.61861
Training Label weighted F1 0.09443, Validation Label weighted F1 0.08945
Current best 0.1278 after epoch 3
[Epoch 5] / 30 : bert_base_attn_adam_lr1e5
Training Loss 0.61773, Validation Loss 0.61819
Training Label weighted F1 0.10040, Validation Label weighted F1 0.13204
BEST saved
Current best 0.1