# Dataset
-------------------------------

## Mounting and reading file



In [None]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

# Copy the model files back from Google Drive to the Colab instance. 
#!cp -r "./drive/My Drive/BERT Document Classification Tutorial/model_save/" ./model_save/
!cp -r "./drive/My Drive/SemEval2023/t1/data" ./


Mounted at /content/drive


In [None]:
import pandas as pd

print('Parsing the dataset .tsv file...')
data = pd.read_csv('./data/train_1_all.tsv', sep = '\t', index_col = 0)
print('    Done.')

Parsing the dataset .tsv file...
    Done.


In [None]:
data.head()

Unnamed: 0_level_0,text,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1
833042063,Chelsea Handler Admits She’s ‘Very Sexually At...,satire
832959523,How Theresa May Botched\n\nThose were the time...,satire
833039623,Robert Mueller III Rests His Case—Dems NEVER W...,satire
833032367,Robert Mueller Not Recommending Any More Indic...,satire
814777937,The Far Right Is Trying to Co-opt the Yellow V...,satire


In [None]:
data.groupby('type').count()

Unnamed: 0_level_0,text
type,Unnamed: 1_level_1
opinion,382
reporting,180
satire,63


In [None]:
total_comments = len(data)
num_attacks=382

print('{:,} of {:,} articles are opinion ({:.2%})'.format(num_attacks, total_comments, num_attacks/total_comments))

382 of 625 articles are opinion (61.12%)


## Feature Engineering

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import numpy as np

data["n_words"]=data['text'].apply(lambda x: len(str(x).split()))
data["n_words_unique"]=data['text'].apply(lambda x: len(set(str(x).split())))
data["n_stopwords"]=data['text'].apply(lambda x: len([w for w in str(x).split() if w in stopwords.words('english')]))
data["n_punct"]=data['text'].apply(lambda x: len([w for w in str(x) if w in list(string.punctuation)]))
data["mean_w_len"]=data['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
data["n_parag"]=data['text'].apply(lambda x: len(x.split('\n')))
data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0_level_0,text,type,n_words,n_words_unique,n_stopwords,n_punct,mean_w_len,n_parag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
833042063,Chelsea Handler Admits She’s ‘Very Sexually At...,satire,338,204,126,43,5.192308,18
832959523,How Theresa May Botched\n\nThose were the time...,satire,859,457,325,120,4.668219,56
833039623,Robert Mueller III Rests His Case—Dems NEVER W...,satire,1174,630,453,154,5.045145,40
833032367,Robert Mueller Not Recommending Any More Indic...,satire,630,377,214,89,5.342857,39
814777937,The Far Right Is Trying to Co-opt the Yellow V...,satire,899,472,278,167,5.560623,49


In [None]:
import re

def conntractionsCount(text):
  count = 0
  count += re.subn(r"n\'t",'', text)[1]
  count += re.subn(r"\'re",'', text)[1]
  count += re.subn(r"\'s",'', text)[1]
  count += re.subn(r"\'ll",'', text)[1]
  count += re.subn(r"\'t",'', text)[1]
  count += re.subn(r"\'ve",'', text)[1]
  count += re.subn(r"\'m",'', text)[1]
  return count

data["n_contract"]= data['text'].apply(conntractionsCount)
data.head()

Unnamed: 0_level_0,text,type,n_words,n_words_unique,n_stopwords,n_punct,mean_w_len,n_parag,n_contract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
833042063,Chelsea Handler Admits She’s ‘Very Sexually At...,satire,338,204,126,43,5.192308,18,0
832959523,How Theresa May Botched\n\nThose were the time...,satire,859,457,325,120,4.668219,56,0
833039623,Robert Mueller III Rests His Case—Dems NEVER W...,satire,1174,630,453,154,5.045145,40,0
833032367,Robert Mueller Not Recommending Any More Indic...,satire,630,377,214,89,5.342857,39,4
814777937,The Far Right Is Trying to Co-opt the Yellow V...,satire,899,472,278,167,5.560623,49,3


In [None]:
from textblob import TextBlob
data["polarity"]= data['text'].apply(lambda x: TextBlob(x).sentiment[0])
data["subjectivity"]= data['text'].apply(lambda x: TextBlob(x).sentiment[1])
data.head()

Unnamed: 0_level_0,text,type,n_words,n_words_unique,n_stopwords,n_punct,mean_w_len,n_parag,n_contract,polarity,subjectivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
833042063,Chelsea Handler Admits She’s ‘Very Sexually At...,satire,338,204,126,43,5.192308,18,0,0.251403,0.604804
832959523,How Theresa May Botched\n\nThose were the time...,satire,859,457,325,120,4.668219,56,0,0.03383,0.421623
833039623,Robert Mueller III Rests His Case—Dems NEVER W...,satire,1174,630,453,154,5.045145,40,0,-0.003652,0.456504
833032367,Robert Mueller Not Recommending Any More Indic...,satire,630,377,214,89,5.342857,39,4,0.109822,0.452223
814777937,The Far Right Is Trying to Co-opt the Yellow V...,satire,899,472,278,167,5.560623,49,3,0.04386,0.306572


In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    for token in sent:
      if token[1] in ['NN','NNP','NNS']:
        nn_count += 1
      if token[1] in ['PRP','PRP$']:
        pr_count += 1
      if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
        vb_count += 1
      if token[1] in ['JJ','JJR','JJS']:
        jj_count += 1
      if token[1] in ['UH']:
        uh_count += 1
      if token[1] in ['CD']:
        cd_count += 1
        
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])


data[['nn_count', 'pr_count', 'vb_count', 'jj_count', 'uh_count', 'cd_count']]= data['text'].apply(pos_count)
data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0_level_0,text,type,n_words,n_words_unique,n_stopwords,n_punct,mean_w_len,n_parag,n_contract,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
833042063,Chelsea Handler Admits She’s ‘Very Sexually At...,satire,338,204,126,43,5.192308,18,0,0.251403,0.604804,135,38,64,25,0,2
832959523,How Theresa May Botched\n\nThose were the time...,satire,859,457,325,120,4.668219,56,0,0.03383,0.421623,283,49,158,59,1,23
833039623,Robert Mueller III Rests His Case—Dems NEVER W...,satire,1174,630,453,154,5.045145,40,0,-0.003652,0.456504,390,43,215,98,0,13
833032367,Robert Mueller Not Recommending Any More Indic...,satire,630,377,214,89,5.342857,39,4,0.109822,0.452223,280,38,93,42,2,2
814777937,The Far Right Is Trying to Co-opt the Yellow V...,satire,899,472,278,167,5.560623,49,3,0.04386,0.306572,372,27,146,112,0,6


# BERT Fine-Tuning

In [None]:
# Mount Google Drive to this Notebook instance.
from google.colab import drive
drive.mount('/content/drive')

# Copy the model files back from Google Drive to the Colab instance. 
#!cp -r "./drive/My Drive/BERT Document Classification Tutorial/model_save/" ./model_save/
!cp -r "./drive/My Drive/SemEval2023/t1/data" ./

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 14.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 79.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


## Libraries Mod

In [None]:
import math
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss


class MLP(nn.Module):
    """mlp can specify number of hidden layers and hidden layer channels"""

    def __init__(self, input_dim, output_dim, act='relu', num_hidden_lyr=2,
                 dropout_prob=0.5, return_layer_outs=False,
                 hidden_channels=None, bn=False):
        super().__init__()
        self.out_dim = output_dim
        self.dropout = nn.Dropout(dropout_prob)
        self.return_layer_outs = return_layer_outs
        if not hidden_channels:
            hidden_channels = [input_dim for _ in range(num_hidden_lyr)]
        elif len(hidden_channels) != num_hidden_lyr:
            raise ValueError(
                "number of hidden layers should be the same as the lengh of hidden_channels")
        self.layer_channels = [input_dim] + hidden_channels + [output_dim]
        self.act_name = 'relu'#act
        self.activation = nn.ReLU()#create_act(act)
        self.layers = nn.ModuleList(list(
            map(self.weight_init, [nn.Linear(self.layer_channels[i], self.layer_channels[i + 1])
                                   for i in range(len(self.layer_channels) - 2)])))
        final_layer = nn.Linear(self.layer_channels[-2], self.layer_channels[-1])
        self.weight_init(final_layer,   activation='linear')
        self.layers.append(final_layer)

        self.bn = bn
        if self.bn:
            self.bn = nn.ModuleList([torch.nn.BatchNorm1d(dim) for dim in self.layer_channels[1:-1]])

    def weight_init(self, m, activation=None):
        if activation is None:
            activation = self.act_name
        torch.nn.init.xavier_uniform_(m.weight, gain=nn.init.calculate_gain(activation))
        return m

    def forward(self, x):
        """
        :param x: the input features
        :return: tuple containing output of MLP,
                and list of inputs and outputs at every layer
        """
        layer_inputs = [x]
        for i, layer in enumerate(self.layers):
            input = layer_inputs[-1]
            if layer == self.layers[-1]:
                layer_inputs.append(layer(input))
            else:
                if self.bn:
                    output = self.activation(self.bn[i](layer(input)))
                else:
                    output = self.activation(layer(input))
                layer_inputs.append(self.dropout(output))

        # model.store_layer_output(self, layer_inputs[-1])
        if self.return_layer_outs:
            return layer_inputs[-1], layer_inputs
        else:
            return layer_inputs[-1]

In [None]:
from torch import nn
from transformers import (
    BertForSequenceClassification
)

class BertConcatFeatures(BertForSequenceClassification):
    """
    Bert Model transformer with a sequence classification/regression head as well as
    a TabularFeatCombiner module to combine categorical and numerical features
    with the Bert pooled output

    Parameters:
        hf_model_config (:class:`~transformers.BertConfig`):
            Model configuration class with all the parameters of the model.
            This object must also have a tabular_config member variable that is a
            :obj:`TabularConfig` instance specifying the configs for :obj:`TabularFeatCombiner`
    """

    def __init__(self, hf_model_config):
        super().__init__(hf_model_config)

        # ===================================
        #   FEATURE COMBINATION SETUP
        # ===================================

        self.num_labels = hf_model_config.num_labels

        combined_feat_dim = hf_model_config.text_feat_dim + \
                            hf_model_config.cat_feat_dim + \
                            hf_model_config.numerical_feat_dim
        

        self.num_bn = nn.BatchNorm1d(hf_model_config.numerical_feat_dim)

        # ===================================
        #  MLP SETUP
        # ===================================

        dims=[]
        dim = combined_feat_dim

        while True:
          dim = dim // 4 #could be changed is how is reduced the dimension by layer
          #if resulting layer size is smaller that the num outputs we are done
          if dim <= self.num_labels:
            break
          #if not store as next layer
          dims.append(int(dim))

        print('MLP layer sizes:')
        print(' Input:', combined_feat_dim)
        print(' Hidden:', dims)
        print(' Output:', self.num_labels)

        self.mlp = MLP(combined_feat_dim,
                      self.num_labels,
                      num_hidden_lyr=len(dims),
                      dropout_prob=0.1,#self.mlp_dropout,
                      hidden_channels=dims,
                      bn=True)
        
   # @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        class_weights=None,
        output_attentions=None,
        output_hidden_states=None,
        cat_feats=None,
        numerical_feats=None
    ):
        r"""
        class_weights (:obj:`torch.FloatTensor` of shape :obj:`(tabular_config.num_labels,)`, `optional`, defaults to :obj:`None`):
            Class weights to be used for cross entropy loss function for classification task
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`tabular_config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`tabular_config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        cat_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.cat_feat_dim)`, `optional`, defaults to :obj:`None`):
            Categorical features to be passed in to the TabularFeatCombiner
        numerical_feats (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.numerical_feat_dim)`, `optional`, defaults to :obj:`None`):
            Numerical features to be passed in to the TabularFeatCombiner
    Returns:
        :obj:`tuple` comprising various elements depending on configuration and inputs:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
            Classification (or regression if tabular_config.num_labels==1) loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, tabular_config.num_labels)`):
            Classification (or regression if tabular_config.num_labels==1) scores (before SoftMax).
        classifier_layer_outputs(:obj:`list` of :obj:`torch.FloatTensor`):
            The outputs of each layer of the final classification layers. The 0th index of this list is the
            combining module's output
        """
        
        
        # ===================================
        #               BERT
        # ===================================

        #Run the text through the BERT model invoking self.bert
        #Returns outputs from encoding layers, and not from the final classifier
        
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        # ===================================
        #           Concat Features
        # ===================================

        numerical_feats = self.num_bn(numerical_feats)

        #Object sizes:
        # pooled_output   [batch size x 768]
        # numerical_feats [batch size x #numerical features]
        # cat_feats       [batch size x #categorical features]


        #Concat everything to one vecto
        combined_feats = torch.cat((pooled_output, cat_feats, numerical_feats), dim=1)


        # ===================================
        #           Output Classifier
        # ===================================

        logits = self.mlp(combined_feats)

        #if type(logits) is tuple:
        logits, classifier_layer_outputs = logits[0], logits[1]
        #else:  # simple classifier
        #classifier_layer_outputs = [combined_feats, logits]

        # ===================================
        #           Output Classifier
        # ===================================
        #Calculate loss only if labels are passed (not in test)
        if labels is not None:
          if self.num_labels == 1:
            #  We are doing regression
            loss_fct = MSELoss()
            labels = labels.float()
            loss = loss_fct(logits.view(-1), labels.view(-1))
          else:
            loss_fct = CrossEntropyLoss(weight=class_weights)
            labels = labels.long()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        else:
          loss = None
        '''
        loss, logits, classifier_layer_outputs = hf_loss_func(combined_feats,
                                                                self.tabular_classifier,
                                                                labels,
                                                                self.num_labels,
                                                                class_weights)
        return loss, logits, classifier_layer_outputs
        '''
        results={'loss': loss,
                 'logits': logits,
                 'classifier_layer_outputs':classifier_layer_outputs}
        return results

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset as TorchDataset


class TorchTabularTextDataset(TorchDataset):
    """
    :obj:`TorchDataset` wrapper for text dataset with categorical features
    and numerical features

    Parameters:
        encodings (:class:`transformers.BatchEncoding`):
            The output from encode_plus() and batch_encode() methods (tokens, attention_masks, etc) of
            a transformers.PreTrainedTokenizer
        categorical_feats (:class:`numpy.ndarray`, of shape :obj:`(n_examples, categorical feat dim)`, `optional`, defaults to :obj:`None`):
            An array containing the preprocessed categorical features
        numerical_feats (:class:`numpy.ndarray`, of shape :obj:`(n_examples, numerical feat dim)`, `optional`, defaults to :obj:`None`):
            An array containing the preprocessed numerical features
        labels (:class: list` or `numpy.ndarray`, `optional`, defaults to :obj:`None`):
            The labels of the training examples
        class_weights (:class:`numpy.ndarray`, of shape (n_classes),  `optional`, defaults to :obj:`None`):
            Class weights used for cross entropy loss for classification
        df (:class:`pandas.DataFrame`, `optional`, defaults to :obj:`None`):
            Model configuration class with all the parameters of the model.
            This object must also have a tabular_config member variable that is a
            TabularConfig instance specifying the configs for TabularFeatCombiner

    """
    def __init__(self,
                 encodings,
                 categorical_feats,
                 numerical_feats,
                 labels=None,
                 df=None,
                 label_list=None,
                 class_weights=None
                 ):
        self.df = df
        self.encodings = encodings
        self.cat_feats = categorical_feats
        self.numerical_feats = numerical_feats
        self.labels = labels
        self.class_weights = class_weights
        self.label_list = label_list if label_list is not None else [i for i in range(len(np.unique(labels)))]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]) if self.labels is not None  else None
        item['cat_feats'] = torch.tensor(self.cat_feats[idx]).float() \
            if self.cat_feats is not None else torch.zeros(0)
        item['numerical_feats'] = torch.tensor(self.numerical_feats[idx]).float()\
            if self.numerical_feats is not None else torch.zeros(0)
        return item

    def __len__(self):
        return len(self.labels)

    def get_labels(self):
        """returns the label names for classification"""
        return self.label_list


## Prepare Text Features - Tokenize and encode text

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
'''print('   Min length: {:,} tokens'.format(min(lengths)))
print('   Max length: {:,} tokens'.format(max(lengths)))
print('Median length: {:,} tokens'.format(np.median(lengths)))'''

   Min length: 145 tokens
   Max length: 512 tokens
Median length: 512.0 tokens


In [None]:
'''# Count the number of sentences that had to be truncated to 512 tokens.
num_truncated = lengths.count(512)

# Compare this to the total number of training sentences.
num_sentences = len(lengths)
prcnt = float(num_truncated) / float(num_sentences)

print('{:,} of {:,} articles ({:.1%}) in the training set are longer than 512 tokens.'.format(num_truncated, num_sentences, prcnt))
'''

449 of 625 articles (71.8%) in the training set are longer than 512 tokens.


In [None]:
import types
#import logger
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
from functools import partial
def convert_to_func(container_arg):
    """convert container_arg to function that returns True if an element is in container_arg"""
    if container_arg is None:
        return lambda df, x: False
    if not isinstance(container_arg, types.FunctionType):
        #print(type(container_arg))
        assert type(container_arg) is list or type(container_arg) is set
        return lambda df, x: x in container_arg
    else:
        return container_arg

def load_num_feats(df, num_bool_func):
    num_cols = get_matching_cols(df, num_bool_func)
    print(f'{len(num_cols)} numerical columns')
    df = df.copy()
    df[num_cols] = df[num_cols].astype(float)
    df[num_cols] = df[num_cols].fillna(dict(df[num_cols].median()), inplace=False)
    if len(num_cols) == 0:
        return None
    return df[num_cols].values

def load_cat_feats(df, cat_bool_func, encode_type=None):
    """load categorical features from DataFrame and do encoding if specified"""
    cat_cols = get_matching_cols(df, cat_bool_func)
    print(f'{len(cat_cols)} categorical columns')
    if len(cat_cols) == 0:
        return None
    #cat_feat_processor = CategoricalFeatures(df, cat_cols, encode_type)
    return None#cat_feat_processor.fit_transform()

def get_matching_cols(df, col_match_func):
    return [c for c in df.columns if col_match_func(df, c)]

def load_cat_and_num_feats(df, cat_bool_func, num_bool_func, enocde_type=None):
    cat_feats = load_cat_feats(df, cat_bool_func, enocde_type)
    num_feats = load_num_feats(df, num_bool_func)
    return cat_feats, num_feats

def normalize_numerical_feats(numerical_feats, transformer=None):
    if numerical_feats is None or transformer is None:
        return numerical_feats
    return transformer.transform(numerical_feats)


def load_data(data_df,
              text_cols,
              tokenizer,
              label_col,
              label_list=None,
              categorical_cols=None,
              numerical_cols=None,
              sep_text_token_str=' ',
              categorical_encode_type='ohe',
              numerical_transformer=None,
              empty_text_values=None,
              replace_empty_text=None,
              max_token_length=None,
              debug=False,
              ):
    
    text_cols_func = convert_to_func(text_cols)
    categorical_cols_func = convert_to_func(categorical_cols)
    numerical_cols_func = convert_to_func(numerical_cols)

    categorical_feats, numerical_feats = load_cat_and_num_feats(data_df,
                                                                categorical_cols_func,
                                                                numerical_cols_func,
                                                                categorical_encode_type)
    numerical_feats = normalize_numerical_feats(numerical_feats, numerical_transformer)
    texts_cols = get_matching_cols(data_df, text_cols_func)
    print(f'Text columns: {texts_cols}')
    texts_list = data_df[texts_cols]
    print(f'Raw text example: {texts_list.text.iloc[0]}')
    hf_model_text_input = tokenizer(list(data_df.text.values), padding=True, truncation=True,
                                    max_length=max_token_length)
    tokenized_text_ex = ' '.join(tokenizer.convert_ids_to_tokens(hf_model_text_input['input_ids'][0]))
    print(f'Tokenized text example: {tokenized_text_ex}')
    labels = data_df[label_col].values

    return TorchTabularTextDataset(hf_model_text_input, categorical_feats, numerical_feats, labels, data_df, label_list)

def load_train_val_test_helper(train_df,
                               val_df,
                               test_df,
                               text_cols,
                               tokenizer,
                               label_col,
                               label_list=None,
                               categorical_cols=None,
                               numerical_cols=None,
                               sep_text_token_str=' ',
                               categorical_encode_type='ohe',
                               numerical_transformer_method='quantile_normal',
                               empty_text_values=None,
                               replace_empty_text=None,
                               max_token_length=None,
                               debug=False):
  numerical_transformer = QuantileTransformer(output_distribution='normal')
  num_feats = load_num_feats(train_df, convert_to_func(numerical_cols))
  numerical_transformer.fit(num_feats)
  train_dataset = load_data(train_df,
                            text_cols,
                            tokenizer,
                            label_col,
                            label_list,
                            categorical_cols,
                            numerical_cols,
                            sep_text_token_str,
                            categorical_encode_type,
                            numerical_transformer,
                            empty_text_values,
                            replace_empty_text,
                            max_token_length,
                            debug
                            )
  test_dataset = load_data(test_df,
                          text_cols,
                          tokenizer,
                          label_col,
                          label_list,
                          categorical_cols,
                          numerical_cols,
                          sep_text_token_str,
                          categorical_encode_type,
                          numerical_transformer,
                          empty_text_values,
                          replace_empty_text,
                          max_token_length,
                          debug
                          )
  val_dataset = load_data(val_df,
                          text_cols,
                          tokenizer,
                          label_col,
                          label_list,
                          categorical_cols,
                          numerical_cols,
                          sep_text_token_str,
                          categorical_encode_type,
                          numerical_transformer,
                          empty_text_values,
                          replace_empty_text,
                          max_token_length,
                          debug
                          )

  return train_dataset, val_dataset, test_dataset

In [None]:
text_cols = ['text']
# The label col is expected to contain integers from 0 to N_classes - 1
label_col = 'type_encoded' 
categorical_cols = []
numerical_cols = ['n_words', 'n_words_unique', 'n_stopwords', 'n_punct', 'mean_w_len', 'n_parag', 'n_contract', 'polarity', 'subjectivity', 'nn_count', 'pr_count', 'vb_count', 'jj_count', 'uh_count', 'cd_count']
label_list = ['opinion', 'reporting', 'satire'] # what each label class represents

num_splits=4
validation_ratio=0.1
max_token_length=512

from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

data = pd.read_csv('./data/train_t1_nlp.csv', index_col=0)

train_df, val_df = train_test_split(data, test_size=validation_ratio, shuffle=True,
                                        train_size=1-validation_ratio, random_state=5)

test_df = pd.read_csv('./data/dev_t1_nlp.csv', index_col=0)

#encode the label
from sklearn.preprocessing import LabelEncoder
articles = data.text.values

le = LabelEncoder()
labels = le.fit_transform(data['type']) # encode labels as ints

counter = 0
for entry in le.classes_:
  print(f"{counter}: {entry}")
  counter += 1

data["type_encoded"] = labels # add col to train_df with ints for labels

#adding values to label to avoid error in code for test set unknown in competition
test_df['type_encoded']=train_df[:len(test_df)]['type_encoded'].values

#val_df =None

train_dataset, val_dataset, test_dataset = load_train_val_test_helper(train_df, val_df,
                                              test_df,
                                              text_cols, tokenizer,
                                              label_col,
                                              label_list,
                                              categorical_cols,
                                              numerical_cols,
                                              #sep_text_token_str,
                                              #categorical_encode_type,
                                              #numerical_transformer_method,
                                              #empty_text_values,
                                              #replace_empty_text,
                                              max_token_length#,
                                              #debug
                                              )
train_datasets = [train_dataset]
val_datasets = [val_dataset]
test_datasets = [test_dataset]
train_dataset = train_datasets[0]

Loading BERT tokenizer...
0: opinion
1: reporting
2: satire
15 numerical columns
0 categorical columns
15 numerical columns
Text columns: ['text']
Raw text example: Trump threatens military closure at US border to stop migrants

Personal Liberty Poll Exercise your right to vote.
President Donald Trump said he’ll mobilize the U.S. military to close the border with Mexico to stop an “assault” on the nation by a caravan of migrants from Central America, according to a report by Bloomberg.com.
Trump, who ran in 2016 promising to tighten U.S. immigration laws and stanch the inflow of undocumented migrants, has called for cutting off foreign aid to Guatemala, Honduras and El Salvador if they don’t stop the migrants. He claimed Thursday — without providing evidence — that Democrats are backing the human movement to bolster their case for “open borders and existing weak laws.”
“In addition to stopping all payments to these countries, which seem to have almost no control over their population, 



Tokenized text example: [CLS] trump threatens military closure at us border to stop migrants personal liberty poll exercise your right to vote . president donald trump said he ’ ll mob ##ili ##ze the u . s . military to close the border with mexico to stop an “ assault ” on the nation by a caravan of migrants from central america , according to a report by bloomberg . com . trump , who ran in 2016 promising to tighten u . s . immigration laws and stan ##ch the in ##flow of undo ##cum ##ented migrants , has called for cutting off foreign aid to guatemala , honduras and el salvador if they don ’ t stop the migrants . he claimed thursday — without providing evidence — that democrats are backing the human movement to bo ##lster their case for “ open borders and existing weak laws . ” “ in addition to stopping all payments to these countries , which seem to have almost no control over their population , i must , in the strongest of terms , ask mexico to stop this onslaught — and if unable t

## Train Our Classification Model

In [None]:
import torch
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
from transformers import BertConfig

config = BertConfig.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3) # The number of output labels--2 for binary classification. You can increase this for multi-class tasks.  

config.numerical_feat_dim = 15 #numerical_feats.size()[1] #train_dataset.numerical_feats.shape[1]
config.cat_feat_dim = 0 #categorical_feats.size()[1]

config.text_feat_dim = config.hidden_size #768 for BERT
# Load the modified BERT
model = BertConcatFeatures.from_pretrained(
    "bert-base-uncased",
    config=config
)

# Tell pytorch to run this model on the GPU.
desc = model.cuda()
#model

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

MLP layer sizes:
 Input: 783
 Hidden: [195, 48, 12]
 Output: 3


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertConcatFeatures: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertConcatFeatures from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertConcatFeatures from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertConcatFeatures were not initialized from the model checkpoint at bert-base-uncased and are newly initialized

In [None]:
batch_size = 16
learning_rate = 3e-3 #grater because the MLP should be trained it has randomly inizialited weights
epochs = 10
max_len= 512 #set previous to tokenization and encoding the text
print('Using maximum sequence length:', max_len)

Using maximum sequence length: 512


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Create the DataLoader for our training set.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

In [None]:
from torch.optim import AdamW
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

### 4.1. BertForSequenceClassification

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
#epochs = 4

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 100 batches.
        if step % 100 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch["input_ids"].to(device)
        print(b_input_ids[0])
        b_categ_feats = batch["cat_feats"].to(device)
        b_numer_feats = batch["numerical_feats"].to(device)
        b_input_mask = batch["attention_mask"].to(device)
        b_labels = batch["labels"].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # In PyTorch, calling `model` will in turn call the model's `forward` 
        # function and pass down the arguments. The `forward` function is 
        # documented here: 
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        # The results are returned in a results object, documented here:
        # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.SequenceClassifierOutput
        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       cat_feats = b_categ_feats,
                       numerical_feats = b_numer_feats)
                       #return_dict=True)

        #loss = result.loss
        loss = result['loss']
        logits = result['logits']

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        #print(batch)
        #batch = tuple(batch[t].to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        #b_input_ids, b_token_type_ids, b_input_mask, b_labels, b_categ_feats, b_numer_feats = batch
        b_input_ids = batch["input_ids"].to(device)
        b_categ_feats = batch["cat_feats"].to(device)
        b_numer_feats = batch["numerical_feats"].to(device)
        b_input_mask = batch["attention_mask"].to(device)
        b_labels = batch["labels"].to(device)
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            result = model(b_input_ids, 
                          token_type_ids=None, 
                          attention_mask=b_input_mask, 
                          labels=b_labels,
                          cat_feats = b_categ_feats,
                          numerical_feats = b_numer_feats)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result['loss']
        logits = result['logits']

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

## Dev Set

In [None]:
from transformers import BertTokenizer
#device = torch.device("cpu")

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#load saved model
from transformers import BertConfig

config = BertConfig.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 3) # The number of output labels--2 for binary classification. You can increase this for multi-class tasks.  

config.numerical_feat_dim = 15 #numerical_feats.size()[1] #train_dataset.numerical_feats.shape[1]
config.cat_feat_dim = 0 #categorical_feats.size()[1]

config.text_feat_dim = config.hidden_size #768 for BERT
# Load the modified BERT
model = BertConcatFeatures.from_pretrained(
    "./drive/My Drive/SemEval2023/t1/models/multifeatures/model_save",
    config=config
)

# Tell pytorch to run this model on the GPU.
desc = model.cuda()


Loading BERT tokenizer...
MLP layer sizes:
 Input: 783
 Hidden: [195, 48, 12]
 Output: 3


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size=1
# Create the DataLoader for our test set.
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test articles...'.format(len(test_dataset)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Measure elapsed time.
t0 = time.time()
out=[]
# Predict 
for (step, batch) in enumerate(test_dataloader):
    
    # Add batch to GPU
    #batch = tuple(t.to(device) for t in batch)
  
    # Progress update every 100 batches.
    if step % 10 == 0 and not step == 0:
        # Calculate elapsed time in minutes.
        elapsed = format_time(time.time() - t0)
        
        # Report progress.
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    #print(batch)

    # Unpack the inputs from our dataloader
    b_input_ids = batch["input_ids"].to(device)
    b_categ_feats = batch["cat_feats"].to(device)
    b_numer_feats = batch["numerical_feats"].to(device)
    b_input_mask = batch["attention_mask"].to(device)
    b_labels = batch["labels"].to(device)
    
    # Telling the model not to compute or store gradients, saving memory and
    # speeding up validation
    with torch.no_grad():
      result = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask, 
                          labels=b_labels,
                          cat_feats = b_categ_feats,
                          numerical_feats = b_numer_feats)
      
    logits = result["logits"]
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 83 test articles...
  Batch    10  of     83.    Elapsed: 0:00:00.
  Batch    20  of     83.    Elapsed: 0:00:01.
  Batch    30  of     83.    Elapsed: 0:00:01.
  Batch    40  of     83.    Elapsed: 0:00:01.
  Batch    50  of     83.    Elapsed: 0:00:02.
  Batch    60  of     83.    Elapsed: 0:00:02.
  Batch    70  of     83.    Elapsed: 0:00:02.
  Batch    80  of     83.    Elapsed: 0:00:03.
    DONE.


In [None]:
pred_flat = np.argmax(predictions, axis=1).flatten()
pred_flat

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

In [None]:
predictions

[array([[ 0.00174322, -0.02498835,  0.04351984]], dtype=float32),
 array([[ 0.02438149, -0.02318301,  0.01149505]], dtype=float32),
 array([[ 0.00874922, -0.03220041,  0.05335634]], dtype=float32),
 array([[ 0.05109492, -0.0540915 ,  0.01314934]], dtype=float32),
 array([[ 0.04506254, -0.0419168 ,  0.00394178]], dtype=float32),
 array([[ 0.05673124, -0.05452375,  0.00840827]], dtype=float32),
 array([[ 0.01981907, -0.00300261,  0.02190955]], dtype=float32),
 array([[ 0.05987692, -0.06959213, -0.00100047]], dtype=float32),
 array([[ 0.02590231, -0.02636886,  0.01477152]], dtype=float32),
 array([[ 0.01179087, -0.00443309,  0.02531411]], dtype=float32),
 array([[ 0.05612709, -0.07254086,  0.0212057 ]], dtype=float32),
 array([[ 0.00581575, -0.01738445,  0.02828153]], dtype=float32),
 array([[ 0.05396029, -0.03332298, -0.03130833]], dtype=float32),
 array([[ 0.04896293, -0.03744278,  0.01004206]], dtype=float32),
 array([[ 0.05221582, -0.0538747 , -0.00391109]], dtype=float32),
 array([[ 

In [None]:
len(test_dataset)

83

In [None]:
from transformers import trainer
trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=build_compute_metrics_fn(task),
        )

In [None]:
# Combine the results across the batches.
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

In [None]:
# Our performance metric for the test set.
from sklearn.metrics import roc_auc_score

# Use the model output for label 1 as our predictions.
p1 = predictions[:,1]

# Calculate the ROC AUC.
auc = roc_auc_score(true_labels, p1)

print('Test ROC AUC: %.3f' %auc)

Test ROC AUC: 0.974


## Save Trained Model


In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
output_dir = './model_save/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')

In [None]:
import os
gdrive_path = "./drive/My Drive/BERTt1/model_save/"

# Create output directory if needed
if not os.path.exists(gdrive_path):
    os.makedirs(gdrive_path)

# Copy the model files to a directory in your Google Drive.
!cp -r ./model_save/ gdrive_path