### Sentiment analysis on the IMDb Movie Reviews dataset, performed with a bare-bones reimplementation of the NER example from Hugging Face's BERT/`transformers` package.

**Imports**

In [0]:
!pip install seqeval transformers

import numpy as np
import os
import pandas as pd
import random
import spacy
import tarfile
import time
import torch
import transformers as ppb
import urllib.request as urlr

from sklearn.metrics import classification_report
from transformers.data.processors.utils import InputExample
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange

**Download the dataset**

In [0]:
# Paths
data_url = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
data_tar_path = 'aclImdb_v1.tar.gz'
# Folder the later cells read the reviews from (same value as `data_dir` below).
data_extract_dir = 'aclImdb'

# Download and extraction are guarded separately: previously the archive was
# only unpacked in the same run that downloaded it, so an already present
# archive without an extracted folder was never unpacked.
if not os.path.isdir(data_extract_dir):
    if not os.path.exists(data_tar_path):
        urlr.urlretrieve(data_url, data_tar_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths inside the archive;
    # only use it with archives from a trusted source.
    with tarfile.open(data_tar_path) as tar:
        tar.extractall()

**Initialize BERT model**

In [0]:
# Args
model_type = 'bert-base-cased'  # identifier of the pretrained weights/vocabulary
cache_dir = 'models'            # folder handed to `from_pretrained` as its cache

# Tokenizer: loaded for the same identifier as the model below
tokenizer = ppb.BertTokenizer.from_pretrained(
    model_type,
    cache_dir=cache_dir,
)

# Model: BERT with a sequence-classification head
model = ppb.BertForSequenceClassification.from_pretrained(
    model_type,
    cache_dir=cache_dir,
)

**Method to read in single files from dataset**

In [0]:
# Root folder of the dataset; the 'train' / 'test' partition names are joined onto it later.
data_dir = 'aclImdb'
# Sub-folders read inside each partition; the folder name is also used as the label of its files.
label_subdirs = ['neg', 'pos']


def read_data_from_directory(data_dir, mode, perc=1.0):
    """Read one labelled example per file from the label sub-folders of `data_dir`.

    Modified version from transformers/examples/utils_ner.py for datasets with
    one example per file.

    Args:
        data_dir: folder that contains one sub-folder per entry of the
            module-level `label_subdirs`.
        mode: name of the partition (e.g. 'train' or 'test'); only used for the
            example guids and the progress message.
        perc: fraction of the files of every label that is kept. The files are
            shuffled with the `random` module first, so seed it for a
            repeatable subset.

    Returns:
        A list of `InputExample`s whose `label` is the name of the sub-folder
        the file was found in. Labels are kept for every mode because the
        evaluation needs them.
    """
    docs = []
    idx = 1
    for sub in label_subdirs:
        sub_dir = os.path.join(data_dir, sub)
        # os.listdir returns entries in arbitrary order; sorting first makes the
        # shuffle below depend only on the state of the random generator.
        sub_files = [os.path.join(sub_dir, f) for f in sorted(os.listdir(sub_dir))]
        cutoff = int(len(sub_files) * perc)
        random.shuffle(sub_files)
        if perc < 1.0:
            sub_files = sub_files[:cutoff]
        print('Processing {} files for {}'.format(len(sub_files), mode))
        for doc in sub_files:
            # Close the handle deterministically and do not rely on the
            # platform's default encoding.
            with open(doc, encoding='utf-8') as doc_file:
                contents = doc_file.read()
            # Replace HTML line breaks with a space so that the words on both
            # sides of the break are not glued together.
            contents = contents.replace('<br /><br />', ' ')
            # Remove title
            contents = contents.split('*******')[0]

            docs.append(InputExample(guid='{}-{}'.format(mode, idx),
                                     text_a=contents,
                                     label=sub))
            idx += 1
    return docs

**Feature converter** (from utils_ner.py)

In [0]:
# Load the spaCy pipeline 'en_core_web_sm' once; convert_examples_to_features uses it
# to split each document into tokens before the BERT tokenizer is applied to them.
spacy_nlp = spacy.load('en_core_web_sm')


class InputFeatures:
    """Holds the encoded form of one example: ids, mask, segment ids and label."""

    def __init__(self, input_ids, input_mask, segment_ids, label):
        # The arguments are stored as given; nothing is copied or validated.
        (self.input_ids,
         self.input_mask,
         self.segment_ids,
         self.label) = (input_ids, input_mask, segment_ids, label)

def convert_examples_to_features(examples,
                                 label_list,
                                 max_seq_length,
                                 tokenizer,
                                 cls_token_at_end=False,
                                 cls_token="[CLS]",
                                 cls_token_segment_id=1,
                                 sep_token="[SEP]",
                                 sep_token_extra=False,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 pad_token_label_id=-1,
                                 sequence_a_segment_id=0,
                                 mask_padding_with_zero=True):
    """Convert examples into fixed-length `InputFeatures` for sequence classification.

    Every document is split into words with the module-level spaCy tokenizer,
    each word is split into sub-word pieces with `tokenizer`, the pieces are
    truncated so that they fit into `max_seq_length` together with the special
    tokens, and the result is padded to exactly `max_seq_length`.

    `cls_token_at_end` defines the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + [CLS]
    `cls_token_segment_id` defines the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet). NOTE: the default is 1, so callers using BERT
    should pass 0 explicitly.

    Args:
        examples: sized iterable of objects with `guid`, `text_a` and `label`.
        label_list: all labels; the position of a label is its class index.
        max_seq_length: length of every returned id / mask / segment list.
        tokenizer: object providing `tokenize(word)` and
            `convert_tokens_to_ids(tokens)`.
        cls_token, sep_token: special tokens added around the document.
        sep_token_extra: add a second separator (RoBERTa convention).
        pad_on_left: pad in front of the sequence instead of behind it.
        pad_token, pad_token_segment_id: values used for padding positions.
        pad_token_label_id: unused here; kept for signature compatibility.
        sequence_a_segment_id: segment id of the document tokens and separators.
        mask_padding_with_zero: if True real tokens get mask 1 and padding 0,
            otherwise the two values are swapped.

    Returns:
        A list with one `InputFeatures` per example.

    Raises:
        KeyError: if an example's label is not contained in `label_list`.
        AssertionError: if an encoded example does not end up with exactly
            `max_seq_length` entries.
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    # Loop-invariant values, computed once instead of once per example.
    # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
    special_tokens_count = 3 if sep_token_extra else 2
    max_content_tokens = max_seq_length - special_tokens_count
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 100 == 0:
            print("Writing example {} of {}".format(ex_index, len(examples)))

        label = label_map[example.label]

        # Only the token texts are needed, so run just spaCy's tokenizer
        # (make_doc) instead of the whole pipeline.
        tokens = []
        for word in spacy_nlp.make_doc(example.text_a):
            tokens.extend(tokenizer.tokenize(word.text))
            # Everything beyond the limit would be cut off below anyway, so
            # stop tokenizing as soon as the limit is reached.
            if len(tokens) >= max_content_tokens:
                break
        if len(tokens) > max_content_tokens:
            tokens = tokens[:max_content_tokens]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # By default the mask has 1 for real tokens and 0 for padding tokens.
        # Only real tokens are attended to.
        input_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([pad_mask_value] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
        else:
            input_ids += ([pad_token] * padding_length)
            input_mask += ([pad_mask_value] * padding_length)
            segment_ids += ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if ex_index < 3:
            # print() does not interpolate logger-style "%s" arguments, so the
            # values are formatted explicitly.
            print("*** Example ***")
            print("guid: {}".format(example.guid))
            print("tokens: {}".format(" ".join([str(x) for x in tokens])))
            print("input_ids: {}".format(" ".join([str(x) for x in input_ids])))
            print("input_mask: {}".format(" ".join([str(x) for x in input_mask])))
            print("segment_ids: {}".format(" ".join([str(x) for x in segment_ids])))
            print("label: {}".format(label))

        features.append(
                InputFeatures(input_ids,
                              input_mask,
                              segment_ids,
                              label))

    return features

**Prepare training and test set**

In [0]:
max_seq_length = 128
labels = ['neg', 'pos']
sample_fraction = .02  # share of the files of every label that is loaded per partition
random_seed = 42
train_batch_size = 32
eval_batch_size = 8
all_data = []  # List of DataLoaders: index 0 is 'train', index 1 is 'test'

# Seed the generators that drive the file shuffling (random) and the
# RandomSampler (torch) so that repeated runs draw the same subset and order.
random.seed(random_seed)
torch.manual_seed(random_seed)

for subset in ['train', 'test']:
    data_partition = read_data_from_directory(os.path.join(data_dir, subset), subset, perc=sample_fraction)
    # The converter's default segment id for [CLS] is 1, but its docstring
    # states that BERT needs 0, so pass it explicitly.
    features = convert_examples_to_features(data_partition, labels, max_seq_length, tokenizer,
                                            cls_token_segment_id=0)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels)

    if subset == 'train':
        # Training batches are drawn in random order.
        train_sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=train_batch_size)
    else:
        # Evaluation keeps the dataset order.
        eval_sampler = SequentialSampler(dataset)
        dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=eval_batch_size)

    all_data.append(dataloader)

**Evaluation**

In [0]:
# Evaluation function
def evaluate(dataloader):
    """Score the module-level `model` on every batch of `dataloader`.

    Each batch is expected to provide, in this order, input ids, attention
    mask, token type ids and labels. The tensors are moved to the module-level
    `device` before the forward pass.

    Returns:
        A tuple `(results, preds)`: `results` maps "loss" (mean batch loss),
        "precision", "recall" and "f1" (the 'weighted avg' row of
        `classification_report`) to their values, `preds` holds the argmax
        class index of every evaluated example. The metrics are also printed.
    """
    model.eval()

    loss_sum = 0.0
    batch_count = 0
    gold_labels = []
    logit_rows = []

    for batch in tqdm(dataloader, desc='Evaluating'):
        batch = tuple(tensor.to(device) for tensor in batch)
        label_ids = batch[3]

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            # The first two outputs of the model are the loss and the logits.
            step_loss, logits = model(input_ids=batch[0],
                                      attention_mask=batch[1],
                                      token_type_ids=batch[2],
                                      labels=label_ids)[:2]
            loss_sum += step_loss.item()
        batch_count += 1

        logit_rows.extend(logits.detach().cpu().numpy())
        gold_labels.extend(label_ids.detach().cpu().numpy())

    mean_loss = loss_sum / batch_count
    preds = np.argmax(logit_rows, axis=1)

    weighted = classification_report(gold_labels, preds, output_dict=True)['weighted avg']

    results = {
        "loss": mean_loss,
        "precision": weighted['precision'],
        "recall": weighted['recall'],
        "f1": weighted['f1-score']}

    # Report the metrics in alphabetical order of their names.
    for metric_name, metric_value in sorted(results.items()):
        print("{} : {}".format(metric_name, str(metric_value)))

    return results, preds
    

**Training**

In [0]:
# Training params
gradient_acc_steps = 1
num_epochs = 3
# AdamW's library default is far larger than the rates normally used to
# fine-tune BERT, so the learning rate is set explicitly.
learning_rate = 5e-5
# Run the evaluation after every `eval_every_steps` optimizer updates.
# 1 evaluates after each update; raise it to reduce evaluation time.
eval_every_steps = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data
train_dataloader = all_data[0]
eval_dataloader = all_data[1]

# The batches are moved to `device` below, so the model has to live there too.
model.to(device)

# Optimizer
optimizer = ppb.AdamW(model.parameters(), lr=learning_rate)
# Total number of optimizer updates over all epochs. It is derived from the
# training loader; the linear schedule reaches a learning rate of 0 after
# `t_total` steps, so it must cover the whole run.
t_total = max(1, (len(train_dataloader) // gradient_acc_steps) * num_epochs)
scheduler = ppb.WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=t_total)

global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()

tr_it = trange(int(num_epochs), desc='Epoch', ascii=True)

for _ in tr_it:
    epoch_it = tqdm(train_dataloader, desc='Iteration', ascii=True)

    for step, batch in enumerate(epoch_it):
        # NOTE(review): presumably gives the progress bar time to render - confirm whether it is still needed.
        time.sleep(0.01)
        # evaluate() switches the model to eval mode, so re-enable training mode every step.
        model.train()
        batch = tuple(t.to(device) for t in batch)

        # Forward propagation
        inputs = {"input_ids": batch[0],
                  "attention_mask": batch[1],
                  "token_type_ids": batch[2],
                  "labels": batch[3]}
        out = model(**inputs)
        # out : tuple of loss and logits

        # Backpropagate loss
        loss = out[0]
        if gradient_acc_steps > 1:
            # Average over the accumulated batches so the gradient scale does
            # not grow with the number of accumulation steps.
            loss = loss / gradient_acc_steps
        loss.backward()
        tr_loss += loss.item()

        if (step + 1) % gradient_acc_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 3.0)

            # Update the weights first, then advance the learning rate schedule
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            # Evaluate
            if global_step % eval_every_steps == 0:
                evaluate(eval_dataloader)
                