### DistilBERT with k-fold CV

DistilBERT applied to the IMDb movie-review dataset

* max_length = 300
* FRAC = 1
* k fold cv (folds = 5)

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

# Transformers
from datasets import Dataset

import torch

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import Trainer

In [2]:
# utility functions

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    `pred` must expose `label_ids` (the true labels) and `predictions`
    (an array whose last axis is arg-maxed to obtain the predicted class).

    Returns a dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [3]:
# globals: configuration shared by the cells below
# number of training epochs for each fold
EPOCHS = 3
# original note: "changed from 5" (presumably refers to EPOCHS -- TODO confirm)
# number of cross-validation folds
N_FOLDS = 5

# max length for text (truncation), see eda1 NB
MAX_LENGTH = 300

# name of the pre-trained checkpoint used for both the tokenizer and the model
MODEL_CKPT = "distilbert-base-uncased"

### Prepare train and validation dataset

In [4]:
# Read every review file under basepath/{test,train}/{pos,neg} into a
# DataFrame with the columns ['text', 'target'] (1 for 'pos', 0 for 'neg').
# It takes some time... good to have a progress bar.
basepath = 'aclImdb'
# expected number of files; only used to size the progress bar
NUM_FILES = 50000

labels = {'pos': 1, 'neg': 0}

# Collect the rows in a plain list and build the DataFrame once at the end.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration (quadratic cost) and the method was removed in pandas 2.0.
records = []

with tqdm(total=NUM_FILES) as pbar:
    for split in ('test', 'train'):
        for sentiment in ('pos', 'neg'):
            path = os.path.join(basepath, split, sentiment)
            # sorted() keeps the row order deterministic across runs
            for file in sorted(os.listdir(path)):
                with open(os.path.join(path, file),
                          'r', encoding='utf-8') as infile:
                    records.append((infile.read(), labels[sentiment]))

                pbar.update(1)

df = pd.DataFrame(records, columns=['text', 'target'])

100%|██████████| 50000/50000 [01:34<00:00, 529.03it/s]


In [5]:
# quick look at the first rows (the first files read come from test/pos)
df.head()

Unnamed: 0,text,target
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [6]:
# Shuffle the rows; FRAC is the fraction of the data that is kept
# (1 = use every review).
FRAC = 1

# Fixed seed: without it the row order changes on every run, and because the
# k-fold split below selects rows by position, each fold would contain
# different reviews from one run to the next.
SHUFFLE_SEED = 1432

df = df.sample(frac=FRAC, random_state=SHUFFLE_SEED)

In [7]:
# number of distinct labels in the data; passed as num_labels when the
# classification model is created
NUM_LABELS = df['target'].nunique()

### Text tokenization

In [12]:
# load the tokenizer that belongs to the MODEL_CKPT checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

In [13]:
def tokenize(batch):
    """Tokenize the "text" entries of `batch` with the module-level tokenizer.

    Padding and truncation are enabled, and sequences are cut at
    MAX_LENGTH tokens.
    """
    encoding_options = dict(padding=True, truncation=True, max_length=MAX_LENGTH)
    return tokenizer(batch["text"], **encoding_options)

### k-fold training

In [14]:
# k-fold cross-validation: for every fold a fresh pre-trained model is
# fine-tuned on the training part and scored on the validation part.
# Trainer expects a column called label
USED_COLUMNS = ['text', 'target']

SEED = 1432

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# to do training on GPU, if available (I'm using a P100)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# batch sizes are the same for every fold, so they are set once
batch_size = 32
batch_size_eval = 8

# running mean of the validation accuracy over the folds
avg_acc_score = 0.

for i, (train_idx, valid_idx) in enumerate(kf.split(df)):
    print()
    print("Processing fold:", i + 1)

    # here we split the DataFrame, using the (positional) indexes for the fold
    train_df = df.iloc[train_idx][USED_COLUMNS]
    valid_df = df.iloc[valid_idx][USED_COLUMNS]

    print(f"There are {train_df.shape[0]} samples in train set")
    print(f"There are {valid_df.shape[0]} samples in valid set")

    # rename target to label
    train_df = train_df.rename(columns={"target": "label"})
    valid_df = valid_df.rename(columns={"target": "label"})

    # drop the shuffled pandas index so it does not end up as an extra column
    ds_train = Dataset.from_pandas(train_df.reset_index(drop=True))
    ds_valid = Dataset.from_pandas(valid_df.reset_index(drop=True))

    # batch_size=None: the whole split is tokenized as a single batch
    ds_train_encoded = ds_train.map(tokenize, batched=True, batch_size=None)
    ds_valid_encoded = ds_valid.map(tokenize, batched=True, batch_size=None)

    # start every fold from the pre-trained weights,
    # and prepare the training on GPU (if available)
    model = (AutoModelForSequenceClassification
             .from_pretrained(MODEL_CKPT, num_labels=NUM_LABELS).to(device))

    # log the training loss roughly once per epoch
    logging_steps = len(ds_train_encoded) // batch_size

    # One checkpoint directory per fold. A single shared directory makes each
    # fold overwrite the checkpoints written by the previous one (and the old
    # "-finetuned-tweets" suffix did not match the IMDb data used here).
    model_name = f"{MODEL_CKPT}-finetuned-imdb-fold{i}"

    training_args = TrainingArguments(output_dir=model_name,
                                      num_train_epochs=EPOCHS,
                                      learning_rate=2e-5,
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size_eval,
                                      weight_decay=0.01,
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch",
                                      disable_tqdm=False,
                                      logging_steps=logging_steps,
                                      push_to_hub=False,
                                      log_level="error",
                                      # NOTE(review): no metric_for_best_model is
                                      # set, so the reloaded checkpoint is
                                      # presumably the one with the lowest
                                      # validation loss, not necessarily the
                                      # last epoch -- confirm
                                      load_best_model_at_end=True,
                                      # disable wandb logging
                                      report_to="none"
                                      )

    trainer = Trainer(model=model, args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=ds_train_encoded,
                      eval_dataset=ds_valid_encoded,
                      tokenizer=tokenizer)
    trainer.train()

    # save the model (label with the zero-based fold id)
    SAVED_MODEL = f"saved_models/fold{i}"

    trainer.save_model(f'{SAVED_MODEL}')

    # accuracy of the reloaded model on this fold's validation set
    acc = trainer.predict(ds_valid_encoded).metrics['test_accuracy']

    # accumulate the mean accuracy across folds
    avg_acc_score += acc/N_FOLDS


Processing fold: 1
There are 40000 samples in train set
There are 10000 samples in valid set


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.257,0.212051,0.9138,0.913709
2,0.1539,0.206692,0.9251,0.925087
3,0.0956,0.262592,0.9237,0.9237



Processing fold: 2
There are 40000 samples in train set
There are 10000 samples in valid set


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2611,0.200948,0.921,0.920964
2,0.154,0.202859,0.9267,0.926684
3,0.0985,0.250743,0.9277,0.927699



Processing fold: 3
There are 40000 samples in train set
There are 10000 samples in valid set


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2556,0.205512,0.9207,0.920699
2,0.153,0.230111,0.9221,0.922087
3,0.0958,0.255869,0.9249,0.9249



Processing fold: 4
There are 40000 samples in train set
There are 10000 samples in valid set


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2574,0.220832,0.9164,0.91633
2,0.1549,0.238321,0.9156,0.915458
3,0.0971,0.262709,0.9254,0.925396



Processing fold: 5
There are 40000 samples in train set
There are 10000 samples in valid set


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2588,0.21094,0.9171,0.917043
2,0.1531,0.216229,0.9231,0.923084
3,0.095,0.251844,0.9267,0.926698


In [15]:
# mean validation accuracy across the N_FOLDS folds, rounded to 4 decimals
print(f"avg acc score is: {round(avg_acc_score, 4)}")

avg acc score is: 0.9201


### Final remarks: 

The result obtained with DistilBERT is the best so far:

ACC = 0.92 (mean validation accuracy over the 5 folds)

an improvement over the ACC = 0.866 obtained with tf-idf.