# Foreword:
To run in Colab, we need to have the files in Drive. 
To do this easily, we have to follow a couple of steps:
1. Go to Federico's `NLP_project` folder in Drive [here](https://drive.google.com/drive/folders/16Gm33Ckb_YoX_z_x9xVITt2afa-aAPLX?usp=sharing), and Add a shortcut to your drive.
2. Mount google drive on Colab by running the code cells that will follow.
3. Done, the directory structure will look like this:
```
YOUR_GOOGLE_DRIVE/
└── COMP0087/
    ├── data/
    │   ├── test
    │   ├── train
    │   ├── train.csv
    │   └── sample_submission.csv
    ├── model
    └── output
```
4. Make sure you change the directory you are using in the `HyperParameters` class defined below to `/content/drive/MyDrive/NLP_project`
I have already done this automatically by setting a cd to that folder if we are on colab, I am writing this just so that you are aware.

In [1]:
ON_COLAB = True
if ON_COLAB:
  # Mount drive:
  from google.colab import drive, files
  # mount Google Drive
  drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Imports

In [2]:
%%capture
# if on Colab, we need to install missing stuff!
if ON_COLAB:
  !pip install transformers
  !pip install iterative-stratification
  !pip install nvidia-ml-py3

In [3]:
import gc
import os
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit

from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
def print_gpu_utilization():
    print(f"GPU memory occupied: {get_gpu_utilization()} MB.")

def get_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    return info.used//1024**2

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

print_gpu_utilization()

GPU memory occupied: 0 MB.


In [5]:
if ON_COLAB:
  !cd /content/drive/MyDrive/NLP_project


# DATA DIR ---- TO CHANGE
DATA_DIR = 'drive/MyDrive/NLP_project/'

Config class containing all necessary hyperparameters:

In [6]:
class HyperParameters:
    
    # Here we choose model type. Can be changed for others
    name = 'longformer'
    model_savename = 'longformer'
    model_name = 'allenai/longformer-base-4096'      # this is the most important: determines what transformer is used in training
    
    # Directory hyperparameters: make sure to change with what you are using! Only needed to change here
    base_dir = DATA_DIR
    data_dir = os.path.join(base_dir, 'data')
    pre_data_dir = os.path.join(base_dir, 'data/preprocessed')
    model_dir = os.path.join(base_dir, f'model/{name}')
    output_dir = os.path.join(base_dir, f'output/{name}')
    
    # Training hyperparameters
    is_debug = False
    n_epoch = 2 # not to exceed runtime limit
    n_fold = 5
    verbose_steps = 500
    random_seed = 42

    # Model specific hyperparameters
    max_length = 1024
    inference_max_length = 4096
    train_batch_size = 4
    valid_batch_size = 4
    lr = 4e-5

    # Task hyperparameters
    num_labels = 15
    label_subtokens = True
    output_hidden_states = True
    hidden_dropout_prob = 0.1
    layer_norm_eps = 1e-7
    add_pooling_layer = False
    verbose_steps = 500
    if is_debug:
        debug_sample = 1000
        verbose_steps = 16
        n_epoch = 1
        n_fold = 2

if not os.path.exists(HyperParameters.model_dir):
    !mkdir $HyperParameters.model_dir

Constant for the task:

In [7]:
IGNORE_INDEX = -100
NON_LABEL = -1
OUTPUT_LABELS = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
                 'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
LABELS_TO_IDS = {v:k for k,v in enumerate(OUTPUT_LABELS)}
IDS_TO_LABELS = {k:v for k,v in enumerate(OUTPUT_LABELS)}

MIN_THRESH = {
    "I-Lead": 9,
    "I-Position": 5,
    "I-Evidence": 14,
    "I-Claim": 3,
    "I-Concluding Statement": 11,
    "I-Counterclaim": 6,
    "I-Rebuttal": 4,
}

PROB_THRESH = {
    "I-Lead": 0.7,
    "I-Position": 0.55,
    "I-Evidence": 0.65,
    "I-Claim": 0.55,
    "I-Concluding Statement": 0.7,
    "I-Counterclaim": 0.5,
    "I-Rebuttal": 0.55,
}

Taming randomness and setting device

In [8]:
def set_seed(seed=HyperParameters.random_seed):
    np.random.seed(seed)
    
    random.seed(seed)
    
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    torch.backends.cudnn.deterministic =True
    torch.backends.cudnn.benchmark = False

set_seed()

# Set proper device
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'Using device: {device}')

Using device: cuda


# Data importing and preprocessing

Importing corrected data

In [9]:
df_alltrain = pd.read_csv(f'{HyperParameters.data_dir}/corrected_train.csv')

In [10]:
def agg_essays(train_flg):
    """
    Splits every word in an essay and adds the text of each essay to a dataframe.
    """
    folder = 'train' if train_flg else 'test'
    names, texts =[], []
    for f in tqdm(list(os.listdir(f'{HyperParameters.data_dir}/{folder}'))):
        names.append(f.replace('.txt', ''))
        texts.append(open(f'{HyperParameters.data_dir}/{folder}/' + f, 'r').read())
        df_texts = pd.DataFrame({'id': names, 'text': texts})

    df_texts['text_split'] = df_texts.text.str.split()
    print('Completed tokenizing texts.')
    return df_texts

In [11]:
def ner(df_texts, df_train):
    """
    Maps discourse type to each word of the text, according to the train.csv file.
    """
    all_entities = []
    for _,  row in tqdm(df_texts.iterrows(), total=len(df_texts)):
        total = len(row['text_split'])
        entities = ['O'] * total

        for _, row2 in df_train[df_train['id'] == row['id']].iterrows():
            discourse = row2['discourse_type']
            list_ix = [int(x) for x in row2['predictionstring'].split(' ')]
            entities[list_ix[0]] = f'B-{discourse}'
            for k in list_ix[1:]: entities[k] = f'I-{discourse}'
        all_entities.append(entities)

    df_texts['entities'] = all_entities
    print('Completed mapping discourse to each token.')
    return df_texts

In [12]:
def preprocess(df_train = None):
    """
    Generates the dataframe we will use for training.
    Splits essays into words, assigns a token name to each word, and adds everything to a dataframe.
    """
    if df_train is None:
        train_flg = False
    else:
        train_flg = True
    
    df_texts = agg_essays(train_flg)

    if train_flg:
        df_texts = ner(df_texts, df_train)
    return df_texts

# Make sure we only run pre-processing if we did not do it in the past:

if not os.path.exists(f"{HyperParameters.data_dir}/train_folds.csv"): 
    alltrain_texts = preprocess(df_alltrain)
    test_texts = preprocess()
else:
    alltrain_texts = pd.read_csv(f"{HyperParameters.data_dir}/train_folds.csv")

In [13]:
# Visualize preprocessing result:
parse_string = lambda x: [string[1:-1] for string in x[1:-1].split(', ')]
alltrain_texts.entities = alltrain_texts.entities.apply(parse_string)
alltrain_texts.text_split = alltrain_texts.text_split.apply(parse_string)

alltrain_texts.head()

Unnamed: 0,id,text,text_split,entities,kfold
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[I, do, agree, that, some, students, would, be...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",2
1,DFEAEC512BAB,Should students design a summer project for sc...,"[Should, students, design, a, summer, project,...","[O, O, O, O, O, O, O, O, B-Position, I-Positio...",4
2,2E4AFCD3987F,"Dear State Senator\n\n,\n\nIn the ruels of vot...","[Dear, State, Senator, ,, In, the, ruels, of, ...","[O, O, O, O, B-Position, I-Position, I-Positio...",0
3,EB6C2AF20BFE,People sometimes have a different opinion than...,"[People, sometimes, have, a, different, opinio...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",3
4,A91A08E523D5,"Dear senator,\n\nAs you know the Electoral Col...","[Dear, senator,, As, you, know, the, Electoral...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,...",1


# Preparing cross validation

Generate proper folds so that the essays we use in each fold have roughly the same number of discourse types overall.
Only compute if we don't have the file in directory already.

In [14]:
if not os.path.exists(f"{HyperParameters.data_dir}/train_folds.csv"): 
    # Transform categorical labels to dummy variables. Group by id. Sum over dummy. 
    dfx = pd.get_dummies(df_alltrain, columns=["discourse_type"]).groupby(["id"], as_index=False).sum()

    # Generate name for the dummy columns
    dummy_cols = [c for c in dfx.columns if c.startswith("discourse_type_") or c == "id" and c != "discourse_type_num"]
    # dfx is now only the dataset with dummy columns selected: don't need to pass the data to do the splits
    dfx = dfx[dummy_cols]

In [15]:
if not os.path.exists(f"{HyperParameters.data_dir}/train_folds.csv"): 
    # Generate cross validation object
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Extract labels
    labels = [c for c in dfx.columns if c != "id"]
    dfx_labels = dfx[labels]

    # Dummy kfold assignment
    dfx["kfold"] = -1

    # Split
    for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
        print(len(trn_), len(val_))
        
        # Change the value of the kfold column at the validation index to the value of the fold
        # This will tell us when to use the current entry in the validation set
        dfx.loc[val_, "kfold"] = fold

    # merge back to original dataframe
    alltrain_texts = alltrain_texts.merge(dfx[["id", "kfold"]], on="id", how="left")
    print(alltrain_texts.kfold.value_counts())

    # Save so next time we import it directly
    alltrain_texts.to_csv(f"{HyperParameters.data_dir}/train_folds.csv", index=False)

# Model and Dataset classes

### Dataset

In [16]:
# need help with this
class FeedbackPrizeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, has_labels):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_labels = has_labels
    
    def __getitem__(self, index):
        text = self.data.text[index]
        encoding = self.tokenizer(
            text.split(),
            is_split_into_words = True,
            padding = 'max_length',
            truncation = True,
            max_length = self.max_len
        )
        word_ids = encoding.word_ids()

        # targets
        if self.has_labels:
            word_labels = self.data.entities[index]
            prev_word_idx = None
            labels_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    labels_ids.append(IGNORE_INDEX)
                elif word_idx != prev_word_idx:
                    labels_ids.append(LABELS_TO_IDS[word_labels[word_idx]])
                else:
                    if HyperParameters.label_subtokens:
                        labels_ids.append(LABELS_TO_IDS[word_labels[word_idx]])
                    else:
                        labels_ids.append(IGNORE_INDEX)
                prev_word_idx = word_idx
            encoding['labels'] = labels_ids
        # convert to torch.tensor
        item = {k: torch.as_tensor(v) for k, v in encoding.items()}
        word_ids2 = [w if w is not None else NON_LABEL for w in word_ids]
        item['word_ids'] = torch.as_tensor(word_ids2)
        return item

    def __len__(self):
        return self.len

### Model

In [37]:
class FeedbackModel(nn.Module):
    def __init__(self):
        super(FeedbackModel, self).__init__()
        
        # init config of transformer model of choice:
        # NOTE: All hyperparameters of the transformer, INCLUDING THE SLIDING WINDOW, are accessible in here!
        model_config = AutoConfig.from_pretrained(HyperParameters.model_name)
        print(model_config)
        self.backbone = AutoModel.from_pretrained(HyperParameters.model_name, config=model_config)
        
        # There's a paper on why this weird dropout strategy is beneficial: https://arxiv.org/abs/1905.09788
        self.model_config = model_config
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.head = nn.Linear(model_config.hidden_size, HyperParameters.num_labels)
    
    def forward(self, input_ids, mask):
        x = self.backbone(input_ids, mask)
        logits1 = self.head(self.dropout1(x[0]))
        logits2 = self.head(self.dropout2(x[0]))
        logits3 = self.head(self.dropout3(x[0]))
        logits4 = self.head(self.dropout4(x[0]))
        logits5 = self.head(self.dropout5(x[0]))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5
        return logits

In [18]:
def build_model_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(HyperParameters.model_name, add_prefix_space = True)
    model = FeedbackModel()
    return model, tokenizer

### Utilities

What does this do?

In [19]:
# Need help with this: used in training to transform raw logits to labels needed
def active_logits(raw_logits, word_ids):
    word_ids = word_ids.view(-1)
    active_mask = word_ids.unsqueeze(1).expand(word_ids.shape[0], HyperParameters.num_labels)
    active_mask = active_mask != NON_LABEL
    active_logits = raw_logits.view(-1, HyperParameters.num_labels)
    active_logits = torch.masked_select(active_logits, active_mask) # return 1dTensor
    active_logits = active_logits.view(-1, HyperParameters.num_labels) 
    return active_logits

def active_labels(labels):
    active_mask = labels.view(-1) != IGNORE_INDEX
    active_labels = torch.masked_select(labels.view(-1), active_mask)
    return active_labels

def active_preds_prob(active_logits):
    active_preds = torch.argmax(active_logits, axis = 1)
    active_preds_prob, _ = torch.max(active_logits, axis = 1)
    return active_preds, active_preds_prob

F1 scoring functions:

In [20]:
def calculate_overlap(set_pred, set_gt):
    """
    Calculates if the overlap between prediction and
    ground truth is enough fora potential True positive
    """
    # Length of each and intersection
    try:
        len_gt = len(set_gt)
        len_pred = len(set_pred)
        inter = len(set_gt & set_pred)
        overlap_1 = inter / len_gt
        overlap_2 = inter/ len_pred
        return overlap_1 >= 0.5 and overlap_2 >= 0.5
    except:  # at least one of the input is NaN
        return False

def score_feedback_comp_micro(pred_df, gt_df, discourse_type):
    """
    A function that scores for the kaggle
        Student Writing Competition
        
    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = gt_df.loc[gt_df['discourse_type'] == discourse_type, 
                      ['id', 'predictionstring']].reset_index(drop=True)
    pred_df = pred_df.loc[pred_df['class'] == discourse_type,
                      ['id', 'predictionstring']].reset_index(drop=True)
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    pred_df['predictionstring'] = [set(pred.split(' ')) for pred in pred_df['predictionstring']]
    gt_df['predictionstring'] = [set(pred.split(' ')) for pred in gt_df['predictionstring']]
    
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on='id',
                           right_on='id',
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    overlaps = [calculate_overlap(*args) for args in zip(joined.predictionstring_pred, 
                                                     joined.predictionstring_gt)]
    
    # 2. If the overlap between the ground truth and prediction is >= 0.5, 
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # we don't need to compute the match to compute the score
    TP = joined.loc[overlaps]['gt_id'].nunique()

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    TPandFP = len(pred_df)
    TPandFN = len(gt_df)
    
    #calc microf1
    my_f1_score = 2*TP / (TPandFP + TPandFN)
    return my_f1_score

def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    """
    Final helper function for model evaluation.
    
    Args:
    pred_df  (pandas.DataFrame): dataframe containing model predictions. Needs to have columns: ['id','class','predictionstring']
    gt_df    (pandas.DataFrame): dataframe of ground truth used for model training
    return_class_scores  (bool): Boolean indicating if we want to return the F1 score for each predicted class.
    
    Returns:
    f1                      (float): F1 score of the model
    (optional) class_scores  (dict): Dictionary of per-class F1 score
    """
    class_scores = {}
    for discourse_type in gt_df.discourse_type.unique():
        class_score = score_feedback_comp_micro(pred_df, gt_df, discourse_type)
        class_scores[discourse_type] = class_score
    f1 = np.mean([v for v in class_scores.values()])
    if return_class_scores:
        return f1, class_scores
    return f1

# Training and validation functions

### Train

In [45]:
def train_fn(model, train_data_loader, optimizer, epoch, criterion):
    print(f'Training for epoch {epoch} started. GPU utilisation: {get_gpu_utilization()}')
    model.train()
    train_loss = 0
    train_accuracy = 0
    stream = tqdm(train_data_loader)
    # Init gradscaler to ensure everything works smoothly on cuda
    scaler = GradScaler()
    print(f'GradScaler initialised. GPU utilisation: {get_gpu_utilization()}')
    for batch_idx, batch in enumerate(stream, start = 1):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        raw_labels = batch['labels'].to(device, dtype = torch.long)
        word_ids = batch['word_ids'].to(device, dtype = torch.long)
        print(f'Tensors added to GPU. GPU utilisation: {get_gpu_utilization()}')

        optimizer.zero_grad()
        print(f'Optimizer Reset. GPU utilisation: {get_gpu_utilization()}')

        # Calculate output with autocast for cuda support
        with autocast():
            raw_logits = model(input_ids = ids, mask = mask)
        print(f'Model trained. GPU utilisation: {get_gpu_utilization()}')

        #logits = active_logits(raw_logits, word_ids)
        #labels = active_labels(raw_labels)

        logits = raw_logits
        labels = raw_labels
        sf_logits = torch.softmax(logits, dim=-1)
        preds, preds_prob = active_preds_prob(sf_logits)
        train_accuracy += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits, labels)

        print(f'Loss calculated. GPU utilisation: {get_gpu_utilization()}')

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        train_loss += loss.item()
        print(f'Optimization step occured. GPU utilisation: {get_gpu_utilization()}')

        
        if batch_idx % HyperParameters.verbose_steps == 0:
            loss_step = train_loss / batch_idx
            print(f'Training loss after {batch_idx:04d} training steps: {loss_step}')

        if batch_idx == 3:
          raise Exception()
            
    epoch_loss = train_loss / batch_idx
    epoch_accuracy = train_accuracy / batch_idx    
    print_gpu_utilization()
    # Cleanup
    del train_data_loader, raw_logits, logits, raw_labels, preds, labels
    torch.cuda.empty_cache()
    gc.collect()
    
    print(f'epoch {epoch} - training loss: {epoch_loss:.4f}')
    print(f'epoch {epoch} - training accuracy: {epoch_accuracy:.4f}')

### Validate

In [22]:
def valid_fn(model, df_val, df_val_eval, dl_val, epoch, criterion):
    oof, valid_loss, valid_acc  = get_preds_onefold(model, df_val, dl_val, criterion, valid_flg=True)
    f1score =[]
    # classes = oof['class'].unique()
    classes = ['Lead', 'Position', 'Claim','Counterclaim', 'Rebuttal','Evidence','Concluding Statement']
    print(f"Validation F1 scores")

    for c in classes:
        pred_df = oof.loc[oof['class'] == c].copy()
        gt_df = df_val_eval.loc[df_val_eval['discourse_type'] == c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(f' * {c:<10}: {f1:4f}')
        f1score.append(f1)
    f1avg = np.mean(f1score)
    print(f'Overall Validation avg F1: {f1avg:.4f} val_loss:{valid_loss:.4f} val_accuracy:{valid_acc:.4f}')
    return valid_loss, oof

### Infer on validation data

In [23]:
def inference(model, data_loader, criterion, valid_flg):
    stream = tqdm(data_loader)
    model.eval()
    
    valid_loss = 0
    valid_accuracy = 0
    all_logits = None
    for batch_idx, batch in enumerate(stream, start = 1):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        with torch.no_grad():
            raw_logits = model(input_ids=ids, mask = mask)
        del ids, mask
        
        word_ids = batch['word_ids'].to(device, dtype = torch.long)
        logits = active_logits(raw_logits, word_ids)
        sf_logits = torch.softmax(logits, dim= -1)
        sf_raw_logits = torch.softmax(raw_logits, dim=-1)
        if valid_flg:    
            raw_labels = batch['labels'].to(device, dtype = torch.long)
            labels = active_labels(raw_labels)
            preds, preds_prob = active_preds_prob(sf_logits)
            valid_accuracy += accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
            loss = criterion(logits, labels)
            valid_loss += loss.item()
        
        if batch_idx == 1:
            all_logits = sf_raw_logits.cpu().numpy()
        else:
            all_logits = np.append(all_logits, sf_raw_logits.cpu().numpy(), axis=0)

    
    if valid_flg:        
        epoch_loss = valid_loss / batch_idx
        epoch_accuracy = valid_accuracy / batch_idx
    else:
        epoch_loss, epoch_accuracy = 0, 0
    return all_logits, epoch_loss, epoch_accuracy


def preds_class_prob(all_logits, data_loader):
    print("predict target class and its probabilty")
    final_predictions = []
    final_predictions_score = []
    stream = tqdm(data_loader)
    len_sample = all_logits.shape[0]

    for batch_idx, batch in enumerate(stream, start=0):
        for minibatch_idx in range(HyperParameters.valid_batch_size):
            sample_idx = int(batch_idx * HyperParameters.valid_batch_size + minibatch_idx)
            if sample_idx > len_sample - 1 : break
            word_ids = batch['word_ids'][minibatch_idx].numpy()
            predictions =[]
            predictions_prob = []
            pred_class_id = np.argmax(all_logits[sample_idx], axis=1)
            pred_score = np.max(all_logits[sample_idx], axis=1)
            pred_class_labels = [IDS_TO_LABELS[i] for i in pred_class_id]
            prev_word_idx = -1
            for idx, word_idx in enumerate(word_ids):
                if word_idx == -1:
                    pass
                elif word_idx != prev_word_idx:
                    predictions.append(pred_class_labels[idx])
                    predictions_prob.append(pred_score[idx])
                    prev_word_idx = word_idx
            final_predictions.append(predictions)
            final_predictions_score.append(predictions_prob)
    return final_predictions, final_predictions_score

In [24]:
def get_preds_onefold(model, df, dl, criterion, valid_flg):
    logits, valid_loss, valid_acc = inference(model, dl, criterion, valid_flg)
    all_preds, all_preds_prob = preds_class_prob(logits, dl)
    df_pred = post_process_pred(df, all_preds, all_preds_prob)
    return df_pred, valid_loss, valid_acc

def get_preds_folds(model, df, dl, criterion, valid_flg=False):
    for i_fold in range(HyperParameters.n_fold):
        model_filename = os.path.join(HyperParameters.model_dir, f"{HyperParameters.model_savename}_{i_fold}.bin")
        print(f"{model_filename} inference")
        model = model.to(device)
        model.load_state_dict(torch.load(model_filename))
        logits, valid_loss, valid_acc = inference(model, dl, criterion, valid_flg)
        if i_fold == 0:
            avg_pred_logits = logits
        else:
            avg_pred_logits += logits
    avg_pred_logits /= HyperParameters.n_fold
    all_preds, all_preds_prob = preds_class_prob(avg_pred_logits, dl)
    df_pred = post_process_pred(df, all_preds, all_preds_prob)
    return df_pred

def post_process_pred(df, all_preds, all_preds_prob):
    final_preds = []
    for i in range(len(df)):
        idx = df.id.values[i]
        pred = all_preds[i]
        pred_prob = all_preds_prob[i]
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O': j += 1
            else: cls = cls.replace('B', 'I')
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            if cls != 'O' and cls !='':
                avg_score = np.mean(pred_prob[j:end])
                if end - j > MIN_THRESH[cls] and avg_score > PROB_THRESH[cls]:
                    final_preds.append((idx, cls.replace('I-', ''), ' '.join(map(str, list(range(j, end))))))
            j = end
    df_pred = pd.DataFrame(final_preds)
    df_pred.columns = ['id', 'class', 'new_predictionstring']
    return df_pred

# Finally getting some action

In [25]:
print_gpu_utilization()

GPU memory occupied: 3 MB.


In [26]:
def pretty_size(size):
	"""Pretty prints a torch.Size object"""
	assert(isinstance(size, torch.Size))
	return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
	"""Prints a list of the Tensors being tracked by the garbage collector."""
	import gc
	total_size = 0
	for obj in gc.get_objects():
		try:
			if torch.is_tensor(obj):
				if not gpu_only or obj.is_cuda:
					print("%s:%s%s %s" % (type(obj).__name__, 
										  " GPU" if obj.is_cuda else "",
										  " pinned" if obj.is_pinned else "",
										  pretty_size(obj.size())))
					total_size += obj.numel()
			elif hasattr(obj, "data") and torch.is_tensor(obj.data):
				if not gpu_only or obj.is_cuda:
					print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
												   type(obj.data).__name__, 
												   " GPU" if obj.is_cuda else "",
												   " pinned" if obj.data.is_pinned else "",
												   " grad" if obj.requires_grad else "", 
												   " volatile" if obj.volatile else "",
												   pretty_size(obj.data.size())))
					total_size += obj.data.numel()
		except Exception as e:
			pass        
	print("Total size:", total_size)

In [27]:
dump_tensors()

  "torch.distributed.reduce_op is deprecated, please use "


Total size: 0


In [35]:
class KaggleDataset(Dataset):
    """
    Class for loading data in batches after it has been processed
    """
    def __init__(self, dataframe, tokenizer, max_length):

        super().__init__()

        # -- prepare data
        assert sorted(dataframe.columns) == ['labels', 'text'], f"Please make sure input dataframe has the columns (text, labels)"
        # data must be in the correct format
        self.inputs = dataframe.text.values
        self.targets = dataframe.labels.values
        #if not is_string_dtype(self.inputs): raise TypeError('Text data must be string type')
        # TODO assertion below is bug; not deleting so remember to add correct assertions
        #if not is_integer_dtype(self.targets): raise TypeError('Label data must be integer type')

        # -- prepare tokenizer
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        # self.inputs anf self.targets must be of a type that is indexible as shown
        inputs = self.inputs[index]
        targets = self.targets[index]

        inputs = self.tokenizer(
            # consider parametrising these
            inputs.split(),
            is_split_into_words=True, # this means that extra \n should be ignored
            padding='max_length',
            truncation=True,
            max_length=self.max_length
        )

        word_ids = inputs.word_ids()
        word_id_mask = [word_id is not None for word_id in word_ids]
        word_ids = [word_id for word_id in word_ids if word_id is not None]

        inputs = {
            key: torch.as_tensor(val, dtype=torch.long) for key, val in inputs.items()
        }
        targets = torch.as_tensor(targets, dtype=torch.long)
        expanded_targets = torch.zeros(self.max_length, dtype=torch.long)
        expanded_targets[word_id_mask] = targets[word_ids]
        inputs['labels'] = expanded_targets
        inputs['word_ids'] = torch.tensor(word_ids, dtype=torch.long)
        return inputs

  

In [47]:
try:
  print(gc.get_referrers(model))
  raise Exception()
  del model, 
  torch.cuda.empty_cache()
  gc.collect()
  print('Model existed. Deleting...')
  print_gpu_utilization()
except:
  print('Model NOT found... new model will be created')
  print_gpu_utilization()

MAPPING = {
    "O": 0,
    "B-Lead": 1,
    "B-Position": 2,
    "B-Evidence": 3,
    "B-Claim": 4,
    "B-Concluding Statement": 5,
    "B-Counterclaim": 6,
    "B-Rebuttal": 7,
    "I-Lead": 8,
    "I-Position": 9,
    "I-Evidence": 10,
    "I-Claim": 11,
    "I-Concluding Statement": 12,
    "I-Counterclaim": 13,
    "I-Rebuttal": 14,
}
oof = pd.DataFrame()

for i_fold in range(HyperParameters.n_fold):
    print(f'=== fold{i_fold} training ===')
    print(f'First run: {print_gpu_utilization()}')
    model, tokenizer = build_model_tokenizer()
    model = model.to(device)
    print(f'Model and tokeniser: {print_gpu_utilization()}')

    optimizer = torch.optim.Adam(params=model.parameters(), lr=HyperParameters.lr)
    print(f'Optimizer: {print_gpu_utilization()}')

    df_train = alltrain_texts[alltrain_texts["kfold"] != i_fold].reset_index(drop = True)
    df_train = df_train.assign(labels=lambda x: x['entities'].apply(lambda x: [MAPPING[key] for key in x]))
    ds_train = FeedbackPrizeDataset(df_train, tokenizer, HyperParameters.max_length, True)
    df_val = alltrain_texts[alltrain_texts["kfold"] == i_fold].reset_index(drop = True)
    val_idlist = df_val['id'].unique().tolist()
    df_val_eval = df_alltrain.query('id==@val_idlist').reset_index(drop=True)
    ds_val = FeedbackPrizeDataset(df_val, tokenizer, HyperParameters.max_length, True)
    dl_train = DataLoader(ds_train, batch_size=HyperParameters.train_batch_size, shuffle=True, num_workers=2, pin_memory=True)
    dl_val = DataLoader(ds_val, batch_size=HyperParameters.valid_batch_size, shuffle=False, num_workers=2, pin_memory=True)

    dl_train = DataLoader(KaggleDataset(df_train[['text', 'labels']], tokenizer, 532))
    best_val_loss = np.inf
    criterion = nn.CrossEntropyLoss()
    print(f'Prior to training, GPU utilisation is: {print_gpu_utilization()}')
    for epoch in range(1, HyperParameters.n_epoch + 1):
        train_fn(model, dl_train, optimizer, epoch, criterion)
        valid_loss, _oof = valid_fn(model, df_val, df_val_eval, dl_val, epoch, criterion)
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            _oof_fold_best = _oof
            _oof_fold_best["kfold"] = i_fold
            model_filename = f'{HyperParameters.model_dir}/{HyperParameters.model_savename}_{i_fold}.bin'
            
            # Saving the boy
            torch.save(model.state_dict(), model_filename)
            print(f'{model_filename} saved')

    oof = pd.concat([oof, _oof_fold_best])

[<frame at 0x557474b03c20, file '<ipython-input-45-be461fdb2349>', line 32, code train_fn>, {'model': FeedbackModel(
  (backbone): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_gl

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


GPU memory occupied: 9131 MB.
Model and tokeniser: None
GPU memory occupied: 9131 MB.
Optimizer: None
GPU memory occupied: 9131 MB.
Prior to training, GPU utilisation is: None
Training for epoch 1 started. GPU utilisation: 9131


  0%|          | 0/12477 [00:00<?, ?it/s]

GradScaler initialised. GPU utilisation: 9131
Tensors added to GPU. GPU utilisation: 9131
Optimizer Reset. GPU utilisation: 9131
Model trained. GPU utilisation: 10505


ValueError: ignored

In [None]:
oof.to_csv(f'{HyperParameters.output_dir}/oof_{HyperParameters.name}.csv', index=False)

Looking at performance:

In [None]:
if HyperParameters.is_debug:
    idlist = alltrain_texts['id'].unique().tolist()
    df_train = df_alltrain.query('id==@idlist')
else:
    df_train = df_alltrain.copy()
print(f'overall cv score: {score_feedback_comp(df_train, oof)}')