Download libraries in colab

In [1]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.5 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.2.1-py3-none-any.whl (342 kB)
[K     |████████████████████████████████| 342 kB 36.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 2.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 23.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K  

Install Hugginface transformers library, its respective BERT-based tokenizer, and pre-trained model

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=11)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Upload train and development data

In [3]:
from google.colab import files
files.upload()

Saving devel_labels.csv to devel_labels.csv
Saving devel_sentences.csv to devel_sentences.csv
Saving train_labels.csv to train_labels.csv
Saving train_sentences.csv to train_sentences.csv


{'devel_labels.csv': b'name,L1\r\ndevel_0001.wav,GER\r\ndevel_0002.wav,HIN\r\ndevel_0003.wav,KOR\r\ndevel_0004.wav,TEL\r\ndevel_0005.wav,ARA\r\ndevel_0006.wav,ARA\r\ndevel_0007.wav,TEL\r\ndevel_0008.wav,TEL\r\ndevel_0009.wav,TUR\r\ndevel_0010.wav,TUR\r\ndevel_0011.wav,ITA\r\ndevel_0012.wav,JPN\r\ndevel_0013.wav,TUR\r\ndevel_0014.wav,FRE\r\ndevel_0015.wav,KOR\r\ndevel_0016.wav,HIN\r\ndevel_0017.wav,CHI\r\ndevel_0018.wav,ARA\r\ndevel_0019.wav,TEL\r\ndevel_0020.wav,ITA\r\ndevel_0021.wav,KOR\r\ndevel_0022.wav,KOR\r\ndevel_0023.wav,KOR\r\ndevel_0024.wav,KOR\r\ndevel_0025.wav,SPA\r\ndevel_0026.wav,ARA\r\ndevel_0027.wav,JPN\r\ndevel_0028.wav,JPN\r\ndevel_0029.wav,TUR\r\ndevel_0030.wav,GER\r\ndevel_0031.wav,JPN\r\ndevel_0032.wav,ARA\r\ndevel_0033.wav,HIN\r\ndevel_0034.wav,JPN\r\ndevel_0035.wav,ARA\r\ndevel_0036.wav,CHI\r\ndevel_0037.wav,GER\r\ndevel_0038.wav,TUR\r\ndevel_0039.wav,JPN\r\ndevel_0040.wav,TUR\r\ndevel_0041.wav,TEL\r\ndevel_0042.wav,GER\r\ndevel_0043.wav,TUR\r\ndevel_0044.wav,SPA\r

Connect Google drive to colab

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Read data files

In [5]:
import pandas as pd
from sklearn import preprocessing

train_sentences = pd.read_csv("train_sentences.csv")["sentence"].values.tolist()
devel_sentences = pd.read_csv("devel_sentences.csv")["sentence"].values.tolist()

train_labels_categorical = pd.read_csv("train_labels.csv")["L1"].values
devel_labels_categorical = pd.read_csv("devel_labels.csv")["L1"].values

label_encoder = preprocessing.LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels_categorical).tolist()
devel_labels = label_encoder.transform(devel_labels_categorical).tolist()

Tokenize the sentences

In [6]:
train_encodings = tokenizer(train_sentences, padding=True, truncation=True, return_tensors="pt")
devel_encodings = tokenizer(devel_sentences, padding=True, truncation=True, return_tensors="pt")

Convert encodings and labels to readable format

In [7]:
import torch

class DSNLI(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DSNLI(train_encodings, train_labels)
devel_dataset = DSNLI(devel_encodings, devel_labels)

Use GPU if available

In [8]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

torch.cuda.empty_cache()

model = model.to(device)

print(f"Using device: {device}")

Using device: cuda:0


Set training hyperparameters

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="results", 
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
)

Set evaluation metric

In [10]:
import numpy as np
from datasets import load_metric

metric = load_metric("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="macro")

Downloading builder script:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

Set training components

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=devel_dataset,
    compute_metrics=compute_metrics,
)

Start training

In [12]:
trainer.train()

***** Running training *****
  Num examples = 3300
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1040
  if __name__ == '__main__':


Epoch,Training Loss,Validation Loss,Recall
1,No log,2.126105,0.297813
2,No log,1.890294,0.332731
3,No log,1.878754,0.394732
4,No log,1.767818,0.405927
5,1.513100,1.870492,0.420534
6,1.513100,2.062961,0.411007
7,1.513100,2.254435,0.420735
8,1.513100,2.429422,0.425977
9,1.513100,2.532377,0.432806
10,0.139300,2.548814,0.43591


***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 965
  Batch size = 32
  if __name__ == '__main__':
***** R

TrainOutput(global_step=1040, training_loss=0.7951788840958706, metrics={'train_runtime': 2143.1442, 'train_samples_per_second': 15.398, 'train_steps_per_second': 0.485, 'total_flos': 3374980319502000.0, 'train_loss': 0.7951788840958706, 'epoch': 10.0})

Move results folder to drive

In [13]:
!mv "/content/results" "/content/drive/My Drive/"