## Import and preprocessing

In [3]:
# imports for preprocessing
import numpy as np
import nltk
import csv
# importing the Stemming function from nltk library
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet')
from collections import Counter
# from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import WordPunctTokenizer
import regex

# imports for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertModel, BertTokenizerFast
from sklearn.decomposition import PCA
from collections import defaultdict
import wandb
from prettytable import PrettyTable

# imports for modeling
import torch
from transformers import Trainer, TrainingArguments
import pandas as pd
from sklearn.metrics import f1_score
from transformers import BertForSequenceClassification, BertTokenizer, AutoConfig ,TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import optuna
import os
from pathlib import Path


# imports for pruning
import copy
import torch.nn.utils.prune as prune
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\levan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
home_dir = os.getcwd()
output_dir = Path(home_dir, 'results')

In [5]:
home_dir

'c:\\Users\\levan\\Documents\\מסמכים של ניר\\אוניברסיטת תל אביב\\2022-2023\\נושאים מתקדמים בלמידה עמוקה\\פרויקט\\news-dl-project'

## Prepare the data

In [6]:
balanced_df = pd.read_csv('balanced_df.csv')

In [7]:
# if we run on cpu
# balanced_df = balanced_df.iloc[:100]

In [9]:
headlines = balanced_df['text']
categories = balanced_df['updated_category']

# encode the labels to number
label_encoder = LabelEncoder()
balanced_df['label'] = label_encoder.fit_transform(categories.values)

In [10]:
X = headlines.values
y = balanced_df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_df = pd.DataFrame({'headlines':X_train, 'labels':y_train})
test_df = pd.DataFrame({'headlines':X_test, 'labels':y_test})

train_df.to_csv('train_df.csv', index = False)
test_df.to_csv('test_df.csv', index = False)

data_files = {
'train':'train_df.csv',
'test':'test_df.csv'
}

datasets = load_dataset("csv", data_files=data_files)

Downloading and preparing dataset csv/default to C:/Users/levan/.cache/huggingface/datasets/csv/default-7b22cbae0e9605fe/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 1999.19it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 40.00it/s]
                                                                     

Dataset csv downloaded and prepared to C:/Users/levan/.cache/huggingface/datasets/csv/default-7b22cbae0e9605fe/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 32.78it/s]


In [11]:
datasets

DatasetDict({
    train: Dataset({
        features: ['headlines', 'labels'],
        num_rows: 148771
    })
    test: Dataset({
        features: ['headlines', 'labels'],
        num_rows: 37193
    })
})

## Define the model

In [12]:
class Model():
    def __init__(self, model_name, data_set, num_classes):
        self.model_name=model_name
        self.num_classes = num_classes
        self.model= BertForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_classes, return_dict=True).to(device)
        self.dataset=data_set


    def tokenize(self, token_args):
        self.tokenizer =  BertTokenizer.from_pretrained(self.model_name)
        self.tokenized_dataset = self.dataset.map(self.tokenizer, input_columns='headlines', fn_kwargs=token_args)
        self.tokenized_dataset.set_format('torch')

    def metric_fn(self, predictions):
        preds = predictions.predictions.argmax(axis=1)
        labels = predictions.label_ids
        return {'f1': f1_score(labels, preds, average='weighted')}


    def train(self, train_args):
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_dataset['train'],
            eval_dataset=self.tokenized_dataset['test'],
            compute_metrics=self.metric_fn)

        trainer.train()


    def hyper_parameters_search(self, train_args):
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_dataset['train'],
            eval_dataset=self.tokenized_dataset['test'],
            model_init=self.model_init,
            compute_metrics=self.metric_fn)

        best_run = trainer.hyperparameter_search(direction="maximize", hp_space=self.optuna_hp_space,n_trials=10)
        chosen_hyperparameters = best_run.hyperparameters
        wandb.finish()
        print(f'{self.model_name} chosen hyperparameters:')
        print(chosen_hyperparameters)


    def model_init(self):
        return self.model

    def optuna_hp_space(self,trial):
        return {"learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
                "num_train_epochs": trial.suggest_categorical("num_train_epochs", [7]),
                "seed": trial.suggest_categorical("seed", [9]),
                "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16]),
                "gradient_accumulation_steps":trial.suggest_int("gradient_accumulation_steps",1,5),
                "warmup_steps":trial.suggest_int("warmup_steps",1,300),
                "weight_decay":trial.suggest_float("weight_decay",1e-3,1e-1),
                "per_device_eval_batch_size":trial.suggest_categorical("per_device_eval_batch_size",[16])}


    def print_model(self):
        x = PrettyTable()
        x.field_names  = ["Layer Name", "Sum of Weights"]
        total_sum = 0
        for name, module in self.model.named_modules():
            if isinstance(module, torch.nn.Linear):
                x.add_row([name, round(module.weight.data.abs().sum().item(),3)])
                total_sum+=module.weight.data.abs().sum().item()
        x.add_row(['Total Sum of Weights', round(total_sum,3)])
        print(x)

    def print_model_size(self):
        torch.save(self.model.state_dict(), Path(home_dir,"tmp.pt"))
        print("%.2f MB" %(os.path.getsize(Path(home_dir,"tmp.pt"))/1e6))

    def pruning(self, amount):
        for name, module in self.model.named_modules():
            if isinstance(module,torch.nn.Linear):
                prune.l1_unstructured(module, name='weight', amount=amount)

    def quantize_model(self):
        self.model.qconfig = torch.quantization.default_qconfig
        self.model = torch.quantization.prepare(self.model)
        self.model = torch.quantization.convert(self.model)

    def evaluate(self, train_args):
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_dataset['train'],
            eval_dataset=self.tokenized_dataset['test'],
            compute_metrics=self.metric_fn)

        predictions = trainer.predict(self.tokenized_dataset['test'])
        result_dict = self.metric_fn(predictions)
        for k,v in result_dict.items():
            print(f'{k} value: {v}')

    def kl(self,hidden_dim,num_layers,num_heads,epochs, alpha_teacher, lr, loss_func):
        embedding_matrix = self.model.bert.embeddings.word_embeddings.weight
        embedding_dim = embedding_matrix.size(1)
        vocab_size = embedding_matrix.size(0)
        Student = Student_Calssifier(vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, self.num_classes)
        Student.embedding.weight.data.copy_(embedding_matrix)
        Teacher = copy.deepcopy(self.model).to(device)
        Student.to(device)
        data = self.tokenized_dataset['train']

        Student = kl_training(Student, Teacher, epochs, alpha_teacher, lr, loss_func, data)
        return Student

    def save_model(self, model_name):
        torch.save(self.model.state_dict(), f'{model_name}.pt')



# <u>Model 1 - BERT Base Uncased<u>
## Model 1  Definition and Tokenization:

In [13]:
pip install requests==2.27.1

Collecting requests==2.27.1
  Using cached requests-2.27.1-py2.py3-none-any.whl (63 kB)
Collecting urllib3<1.27,>=1.21.1 (from requests==2.27.1)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
                                              0.0/143.1 kB ? eta -:--:--
     --------                                30.7/143.1 kB 1.4 MB/s eta 0:00:01
     ------------------                    71.7/143.1 kB 653.6 kB/s eta 0:00:01
     ----------------------------         112.6/143.1 kB 819.2 kB/s eta 0:00:01
     ------------------------------------ 143.1/143.1 kB 851.7 kB/s eta 0:00:00
Collecting charset-normalizer~=2.0.0 (from requests==2.27.1)
  Using cached charset_normalizer-2.0.12-py3-none-any.whl (39 kB)
Installing collected packages: urllib3, charset-normalizer, requests
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.3
    Uninstalling urllib3-2.0.3:
      Successfully uninstalled urllib3-2.0.3
  Attempting uninstall: charset-normalizer
    Fou

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Python310\\Lib\\site-packages\\~harset_normalizer\\md.cp310-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [14]:
os.environ['CURL_CA_BUNDLE'] = ''

In [15]:
model_name_1 = "bert-base-uncased"
num_of_classes = 62
bert_model = Model(model_name_1, datasets, num_of_classes)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [16]:
token_args = {"max_length": 64, "truncation": True, "padding": "max_length"}
bert_model.tokenize(token_args)

                                                                     

## Model 1 Hyperparameter Search

In [None]:
wandb.init(project="DeepLearning")

train_args = TrainingArguments(output_dir=output_dir,
                             overwrite_output_dir=True,
                             greater_is_better=True,
                             evaluation_strategy='epoch',
                             do_train=True,
                             logging_strategy='epoch',
                             save_strategy='no',
                             report_to='wandb')

[34m[1mwandb[0m: Currently logged in as: [33mliyag[0m ([33mdelta_lxr[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
bert_model.hyper_parameters_search(train_args)

[32m[I 2023-06-12 23:56:45,096][0m A new study created in memory with name: no-name-5b015cbb-7d7d-4411-8219-748edfad20d2[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01718333333362049, max=1.0)…

Epoch,Training Loss,Validation Loss,F1
1,3.0128,2.324755,0.385335
2,2.1628,1.966571,0.473018
3,1.8741,1.819094,0.513033
4,1.7027,1.74053,0.531638
5,1.5823,1.689133,0.551586
6,1.5058,1.658887,0.560356
7,1.461,1.650897,0.56254


[32m[I 2023-06-13 01:19:22,168][0m Trial 0 finished with value: 0.5625395808513218 and parameters: {'learning_rate': 4.183323918010571e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'warmup_steps': 115, 'weight_decay': 0.08914088722040585, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 0.5625395808513218.[0m


0,1
eval/f1,▁▄▆▇███
eval/loss,█▄▃▂▁▁▁
eval/runtime,▁▇█████
eval/samples_per_second,█▂▁▁▁▁▁
eval/steps_per_second,█▂▁▁▁▁▁
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▄▃▂▂▁▁
train/total_flos,▁

0,1
eval/f1,0.56254
eval/loss,1.6509
eval/runtime,37.1272
eval/samples_per_second,1001.96
eval/steps_per_second,62.623
train/epoch,7.0
train/global_step,65100.0
train/learning_rate,0.0
train/loss,1.461
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,1.5271,1.546883,0.591352
2,1.1122,1.42408,0.635877
3,0.815,1.39703,0.658872
4,0.6003,1.433538,0.669496
5,0.4342,1.475755,0.67999
6,0.3179,1.537445,0.685465
7,0.243,1.563465,0.686887


[32m[I 2023-06-13 02:28:20,853][0m Trial 1 finished with value: 0.6868869998350791 and parameters: {'learning_rate': 2.6741215081649235e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'warmup_steps': 109, 'weight_decay': 0.09151740119576889, 'per_device_eval_batch_size': 16}. Best is trial 1 with value: 0.6868869998350791.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▄▆▇▇██
eval/loss,▇▂▁▃▄▇█
eval/runtime,▅█▁▆▁█▆
eval/samples_per_second,▄▁█▃█▁▃
eval/steps_per_second,▄▁█▃█▁▃
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▆▄▃▂▁▁
train/total_flos,▁

0,1
eval/f1,0.68689
eval/loss,1.56347
eval/runtime,37.4174
eval/samples_per_second,994.191
eval/steps_per_second,62.137
train/epoch,7.0
train/global_step,32550.0
train/learning_rate,0.0
train/loss,0.243
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,0.2045,1.660191,0.68383
2,0.1353,1.756216,0.683789
3,0.0849,1.877569,0.684486
4,0.0553,1.97617,0.684927
5,0.0368,2.073597,0.689748
6,0.0288,2.131151,0.689641
7,0.039,2.142794,0.688738


[32m[I 2023-06-13 03:29:23,919][0m Trial 2 finished with value: 0.6887378135934601 and parameters: {'learning_rate': 1.749735397672745e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 97, 'weight_decay': 0.016675364906410122, 'per_device_eval_batch_size': 16}. Best is trial 2 with value: 0.6887378135934601.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▁▂▂██▇
eval/loss,▁▂▄▆▇██
eval/runtime,▂▃█▃▃▁▁
eval/samples_per_second,▇▆▁▆▆██
eval/steps_per_second,▇▆▁▆▆██
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▅▃▂▁▁▁
train/total_flos,▁

0,1
eval/f1,0.68874
eval/loss,2.14279
eval/runtime,37.2199
eval/samples_per_second,999.467
eval/steps_per_second,62.467
train/epoch,7.0
train/global_step,16275.0
train/learning_rate,0.0
train/loss,0.039
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,0.0083,2.179108,0.692447
2,0.0067,2.209583,0.690939
3,0.0045,2.240933,0.691621
4,0.0034,2.274277,0.691419
5,0.0028,2.306708,0.690071
6,0.0028,2.327006,0.69072
7,0.013,2.331071,0.690011


[32m[I 2023-06-13 04:33:05,149][0m Trial 3 finished with value: 0.6900108258517192 and parameters: {'learning_rate': 1.947733969457909e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 146, 'weight_decay': 0.08874974883831581, 'per_device_eval_batch_size': 16}. Best is trial 3 with value: 0.6900108258517192.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,█▄▆▅▁▃▁
eval/loss,▁▂▄▅▇██
eval/runtime,█▆▁▅▆█▁
eval/samples_per_second,▁▃█▄▃▁█
eval/steps_per_second,▁▃█▄▃▁█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▅▄▂▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.69001
eval/loss,2.33107
eval/runtime,37.2921
eval/samples_per_second,997.53
eval/steps_per_second,62.346
train/epoch,7.0
train/global_step,21700.0
train/learning_rate,0.0
train/loss,0.013
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,0.0112,2.548951,0.68447
2,0.0128,2.616982,0.683974
3,0.0098,2.711909,0.68586
4,0.007,2.735688,0.691193
5,0.0051,2.815058,0.689085
6,0.0055,2.829425,0.68918
7,0.018,2.835057,0.689578


[32m[I 2023-06-13 05:34:08,352][0m Trial 4 finished with value: 0.6895778609936061 and parameters: {'learning_rate': 1.5942436644489087e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 131, 'weight_decay': 0.0747548214676366, 'per_device_eval_batch_size': 16}. Best is trial 3 with value: 0.6900108258517192.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▁▃█▆▆▆
eval/loss,▁▃▅▆███
eval/runtime,█▆▅▆▁▂▇
eval/samples_per_second,▁▃▄▃█▇▂
eval/steps_per_second,▁▃▄▃█▇▂
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▄▅▄▂▁▁█
train/total_flos,▁

0,1
eval/f1,0.68958
eval/loss,2.83506
eval/runtime,37.7768
eval/samples_per_second,984.732
eval/steps_per_second,61.546
train/epoch,7.0
train/global_step,16275.0
train/learning_rate,0.0
train/loss,0.018
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,0.0011,2.836236,0.694702
2,0.0005,2.860836,0.690769
3,0.0005,2.871124,0.692341
4,0.0005,2.887267,0.692364
5,0.0002,2.913437,0.690063
6,0.0002,2.922227,0.691025
7,0.0065,2.926505,0.690831


[32m[I 2023-06-13 06:37:47,665][0m Trial 5 finished with value: 0.6908305705677684 and parameters: {'learning_rate': 1.8325249831728925e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 181, 'weight_decay': 0.014021528948180751, 'per_device_eval_batch_size': 16}. Best is trial 5 with value: 0.6908305705677684.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,█▂▄▄▁▂▂
eval/loss,▁▃▄▅▇██
eval/runtime,█▁▃█▇▁▅
eval/samples_per_second,▁█▆▁▂█▄
eval/steps_per_second,▁█▆▁▂█▄
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▂▁▁▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.69083
eval/loss,2.92651
eval/runtime,37.6288
eval/samples_per_second,988.606
eval/steps_per_second,61.788
train/epoch,7.0
train/global_step,21700.0
train/learning_rate,0.0
train/loss,0.0065
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.017200000000108653, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,0.0801,3.163306,0.674801


[32m[I 2023-06-13 06:49:57,561][0m Trial 6 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.6748
eval/loss,3.16331
eval/runtime,38.0542
eval/samples_per_second,977.553
eval/steps_per_second,61.097
train/epoch,1.0
train/global_step,9300.0
train/learning_rate,2e-05
train/loss,0.0801


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01691666666592937, max=1.0)…

Epoch,Training Loss,Validation Loss,F1
1,0.048,3.053157,0.683086


[32m[I 2023-06-13 06:59:16,410][0m Trial 7 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.68309
eval/loss,3.05316
eval/runtime,37.8928
eval/samples_per_second,981.716
eval/steps_per_second,61.357
train/epoch,1.0
train/global_step,3100.0
train/learning_rate,1e-05
train/loss,0.048


Epoch,Training Loss,Validation Loss,F1
1,0.0076,3.086411,0.686275


[32m[I 2023-06-13 07:08:33,818][0m Trial 8 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.68628
eval/loss,3.08641
eval/runtime,37.5366
eval/samples_per_second,991.033
eval/steps_per_second,61.94
train/epoch,1.0
train/global_step,3100.0
train/learning_rate,1e-05
train/loss,0.0076


Epoch,Training Loss,Validation Loss,F1
1,0.0017,3.164068,0.683753


[32m[I 2023-06-13 07:17:28,779][0m Trial 9 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁
train/loss,▁

0,1
eval/f1,0.68375
eval/loss,3.16407
eval/runtime,37.5049
eval/samples_per_second,991.869
eval/steps_per_second,61.992
train/epoch,1.0
train/global_step,2325.0
train/learning_rate,1e-05
train/loss,0.0017


bert-base-uncased chosen hyperparameters:
{'learning_rate': 1.8325249831728925e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 181, 'weight_decay': 0.014021528948180751, 'per_device_eval_batch_size': 16}


## Model 1 Train on best hyperparameters

In [None]:
best_args = TrainingArguments(output_dir=output_dir,
                         overwrite_output_dir=True,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         seed = 9,
                         learning_rate=1.8325249831728925e-06,
                         weight_decay=0.014021528948180751,
                         greater_is_better=True,
                         evaluation_strategy='epoch',
                         do_train=True,
                         num_train_epochs=15,
                         gradient_accumulation_steps=3,
                         logging_strategy='epoch',
                         warmup_steps=181,
                         report_to='wandb')

eval_args = TrainingArguments(output_dir=output_dir,
                         overwrite_output_dir=True,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                          seed = 9,
                         learning_rate=1.8325249831728925e-06,
                         weight_decay=0.014021528948180751,
                         greater_is_better=True,
                         evaluation_strategy='epoch',
                         do_train=False,
                         gradient_accumulation_steps=3,
                         logging_strategy='epoch',
                         warmup_steps=181,
                         report_to='wandb')

bert_model.train(best_args)
# save model's state dict
bert_model.save_model('bert_regular')



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
#bert_model.save_model('bert_regular')
torch.save(bert_model.model.state_dict(),"bert_regular.pt")

## Best model summary:

In [None]:
bert_model.model.load_state_dict(torch.load(Path(home_dir, 'bert_regular.pt')))
bert_model.evaluate(eval_args)
bert_model.print_model_size()
bert_model.print_model()

f1 value: 0.6869010703006206
438.19 MB
+----------------------------------------------+----------------+
|                  Layer Name                  | Sum of Weights |
+----------------------------------------------+----------------+
|  bert.encoder.layer.0.attention.self.query   |    18635.98    |
|   bert.encoder.layer.0.attention.self.key    |   18326.658    |
|  bert.encoder.layer.0.attention.self.value   |   12523.816    |
| bert.encoder.layer.0.attention.output.dense  |   12118.736    |
|   bert.encoder.layer.0.intermediate.dense    |    65533.0     |
|      bert.encoder.layer.0.output.dense       |   61711.406    |
|  bert.encoder.layer.1.attention.self.query   |   18492.051    |
|   bert.encoder.layer.1.attention.self.key    |   18417.031    |
|  bert.encoder.layer.1.attention.self.value   |   12342.176    |
| bert.encoder.layer.1.attention.output.dense  |   11883.429    |
|   bert.encoder.layer.1.intermediate.dense    |   68518.859    |
|      bert.encoder.layer.1.output.de

## a. Model 1 Pruning

In [None]:
pruned_bert = Model(model_name_1, datasets, num_of_classes)
pruned_bert.tokenize(token_args)

# Load the model's best parameters
pruned_bert.model.load_state_dict(torch.load(Path(home_dir, 'bert_regular.pt')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [None]:
# Prune and train one epoch
pruned_bert.pruning(0.2)
train_one_epoch_args = TrainingArguments(output_dir=output_dir,
                             overwrite_output_dir=True,
                             greater_is_better=True,
                             evaluation_strategy='epoch',
                             do_train=True,
                             logging_strategy='epoch',
                             num_train_epochs=1,
                             save_strategy='no')

pruned_bert.train(train_one_epoch_args)
# save model's state dict
pruned_bert.save_model('bert_pruning')



Epoch,Training Loss,Validation Loss,F1
1,0.6558,2.059624,0.677904


In [None]:
# Pruned model summary
pruned_bert.evaluate(eval_args)
pruned_bert.print_model_size()
pruned_bert.print_model()

f1 value: 0.677904232049942
780.50 MB
+----------------------------------------------+----------------+
|                  Layer Name                  | Sum of Weights |
+----------------------------------------------+----------------+
|  bert.encoder.layer.0.attention.self.query   |   18088.371    |
|   bert.encoder.layer.0.attention.self.key    |    17797.49    |
|  bert.encoder.layer.0.attention.self.value   |   12151.385    |
| bert.encoder.layer.0.attention.output.dense  |   11763.169    |
|   bert.encoder.layer.0.intermediate.dense    |   63676.031    |
|      bert.encoder.layer.0.output.dense       |   59927.969    |
|  bert.encoder.layer.1.attention.self.query   |   17953.551    |
|   bert.encoder.layer.1.attention.self.key    |   17884.939    |
|  bert.encoder.layer.1.attention.self.value   |   11963.402    |
| bert.encoder.layer.1.attention.output.dense  |   11522.994    |
|   bert.encoder.layer.1.intermediate.dense    |   66579.664    |
|      bert.encoder.layer.1.output.den

## b. Model 1 Quantization

In [19]:
# Load the quantized model

quantization_bert = Model(model_name_1, datasets, num_of_classes)
quantization_bert.tokenize(token_args)
quantization_bert.model.load_state_dict(torch.load(Path(home_dir, 'bert_regular.pt'), map_location=torch.device(device)))

# Train the quantized model for additional epochs
train_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    num_train_epochs=5,  # Specify the desired number of additional epochs
    save_strategy='no'
)

quantization_bert.train(train_args)

# Save the trained quantized model's state dict
quantization_bert.save_model('quantized_model_trained.pt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

  0%|          | 56/92985 [07:28<254:17:16,  9.85s/it]

KeyboardInterrupt: 

## c. Model 1 Knowledge-Distillation

## Help function for model distilation

In [None]:
class Student_Calssifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_heads, num_classes, dropout=0.1):
        super(Student_Calssifier, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_layers = nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim, dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)  # Adjust shape for transformer
        transformer_output = self.transformer_encoder(embedded)
        transformer_output = transformer_output.permute(1, 0, 2)  # Adjust shape back to (batch_size, seq_length, embedding_dim)
        logits = self.fc(transformer_output[:, -1, :])  # Take the last hidden state
        return logits.squeeze()


In [None]:
def kl_training(Student, Teacher, epochs, alpha_teacher, lr, loss_func, data):
    optimizer = torch.optim.Adam(Student.parameters(), lr=lr)
    count = 0
    optimizer.zero_grad()
    losses = list()

    for epoch in range(epochs):
        for index, i in enumerate(data):
            loss_e = 0
            count += 1
            inputs = i['input_ids']
            y_index = i['labels']
            attention_mask = torch.tensor(i['attention_mask']).to(device)
            inputs = torch.tensor(inputs).to(device)
            # Calculate loss with respect to teacher logits
            teacher_logits = Teacher(input_ids=inputs.unsqueeze(-1), attention_mask=attention_mask.unsqueeze(-1))[0]
            outputs = Student(inputs.unsqueeze(-1))
            loss_teacher = loss_func(outputs, teacher_logits)

            # Calculate loss with respect to ground truth
            temp_array = np.zeros(num_classes)
            temp_array[y_index] = 1
            target = torch.from_numpy(temp_array).unsqueeze(0).to(device)
            loss_ground_truth = loss_func(outputs, target)

            loss = alpha_teacher * loss_teacher + (1 - alpha_teacher) * loss_ground_truth
            loss_e += loss.item()
            if count == 64:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                count = 0
            if index%1000 == 0:
                print(index)
        losses.append(loss_e)
        print(f'Epoch {epoch} was finished with train loss = {loss_e}')
    return Student

In [None]:
hidden_dim = 128
num_layers = 2
num_heads = 2
num_classes = 62
epochs = 5
alpha_teacher = 0.8
lr=0.001
loss_func = nn.MSELoss()


In [None]:
kl_bert = Model(model_name_1, datasets, num_of_classes)
kl_bert.tokenize(token_args)

# Load the model's best parameters
kl_bert.model.load_state_dict(torch.load(Path(home_dir, 'bert_regular.pt')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [None]:
bert_student = kl_bert.kl(hidden_dim,num_layers,num_heads,epochs, alpha_teacher, lr, loss_func)

  attention_mask = torch.tensor(i['attention_mask']).to(device)
  inputs = torch.tensor(inputs).to(device)
  return F.mse_loss(input, target, reduction=self.reduction)


0


RuntimeError: Found dtype Double but expected Float

In [None]:
student_model = Model(model_name_1, datasets, num_of_classes)
student_model.tokenize(token_args)
student_model.model = bert_student

# save model's state dict
student_model.save_model('bert_student')

In [None]:
# Student model summary
student_model.evaluate(eval_args)
student_model.print_model_size()
student_model.print_model()

# <u>Model 2 - ROBERTA Base<u>
## Model 2 Definition and Tokenization:

In [None]:
model_name_2 = "roberta-base"
num_of_classes = 62
roberta = Model(model_name_2, datasets, num_of_classes)
token_args = {"max_length": 64, "truncation": True, "padding": "max_length"}
roberta.tokenize(token_args)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/148800 [00:00<?, ? examples/s]

Map:   0%|          | 0/37200 [00:00<?, ? examples/s]

## Model 2 Hyperparameter Search

In [None]:
wandb.init(project="DeepLearning")

train_args = TrainingArguments(output_dir=output_dir,
                             overwrite_output_dir=True,
                             greater_is_better=True,
                             evaluation_strategy='epoch',
                             do_train=True,
                             logging_strategy='epoch',
                             save_strategy='no',
                             report_to='wandb')



[34m[1mwandb[0m: Currently logged in as: [33mliyag[0m ([33mdelta_lxr[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
roberta.hyper_parameters_search(train_args)

[32m[I 2023-06-15 18:24:12,873][0m A new study created in memory with name: no-name-341e8ab4-bff7-482f-a97a-b3ee3a305697[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666672124, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,3.4943,2.952349,0.237611
2,2.8676,2.5995,0.314514
3,2.6246,2.439085,0.35411
4,2.4879,2.346071,0.36819
5,2.401,2.289408,0.385501
6,2.3519,2.260303,0.38999
7,2.3254,2.249121,0.392198


[32m[I 2023-06-15 19:33:30,864][0m Trial 0 finished with value: 0.392197507549085 and parameters: {'learning_rate': 1.0906450785722703e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'warmup_steps': 298, 'weight_decay': 0.02419408513272277, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 0.392197507549085.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▄▆▇███
eval/loss,█▄▃▂▁▁▁
eval/runtime,▂▂▁▅█▅▁
eval/samples_per_second,▇▇█▄▁▄█
eval/steps_per_second,▇▇█▄▁▄█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▄▃▂▁▁▁
train/total_flos,▁

0,1
eval/f1,0.3922
eval/loss,2.24912
eval/runtime,30.2551
eval/samples_per_second,1229.544
eval/steps_per_second,76.847
train/epoch,7.0
train/global_step,65100.0
train/learning_rate,0.0
train/loss,2.3254
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,2.277,2.171113,0.414297
2,2.1675,2.098306,0.436412
3,2.1002,2.053305,0.450292
4,2.0514,2.0186,0.454959
5,2.0131,1.991603,0.466067
6,1.9944,1.978873,0.467591
7,1.9846,1.972843,0.470023


[32m[I 2023-06-15 20:25:36,941][0m Trial 1 finished with value: 0.4700229900169208 and parameters: {'learning_rate': 1.7866264609909754e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 161, 'weight_decay': 0.05205396216297007, 'per_device_eval_batch_size': 16}. Best is trial 1 with value: 0.4700229900169208.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▄▆▆███
eval/loss,█▅▄▃▂▁▁
eval/runtime,▁▅▆▅▅█▆
eval/samples_per_second,█▄▃▄▄▁▃
eval/steps_per_second,█▄▃▄▄▁▃
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▅▄▃▂▁▁
train/total_flos,▁

0,1
eval/f1,0.47002
eval/loss,1.97284
eval/runtime,30.4985
eval/samples_per_second,1219.732
eval/steps_per_second,76.233
train/epoch,7.0
train/global_step,21700.0
train/learning_rate,0.0
train/loss,1.9846
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,1.9363,1.867773,0.499651
2,1.7318,1.744479,0.537473
3,1.5881,1.664634,0.560429
4,1.4795,1.629033,0.569478
5,1.3945,1.58503,0.583015
6,1.3346,1.562527,0.590499
7,1.2974,1.553933,0.592215


[32m[I 2023-06-15 21:17:39,893][0m Trial 2 finished with value: 0.5922154691110901 and parameters: {'learning_rate': 8.42671234623641e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 42, 'weight_decay': 0.0964488402496755, 'per_device_eval_batch_size': 16}. Best is trial 2 with value: 0.5922154691110901.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▄▆▆▇██
eval/loss,█▅▃▃▂▁▁
eval/runtime,▁▆▄▃█▅▄
eval/samples_per_second,█▃▅▆▁▄▅
eval/steps_per_second,█▃▅▆▁▄▅
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▆▄▃▂▁▁
train/total_flos,▁

0,1
eval/f1,0.59222
eval/loss,1.55393
eval/runtime,30.4683
eval/samples_per_second,1220.941
eval/steps_per_second,76.309
train/epoch,7.0
train/global_step,21700.0
train/learning_rate,0.0
train/loss,1.2974
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,1.4325,1.63676,0.576789
2,1.2101,1.541181,0.602562
3,0.9931,1.455965,0.638115
4,0.8131,1.465411,0.650149
5,0.6617,1.456136,0.663546
6,0.5428,1.464734,0.673369
7,0.4554,1.4795,0.675929


[32m[I 2023-06-15 22:08:20,439][0m Trial 3 finished with value: 0.6759294417959156 and parameters: {'learning_rate': 3.874103388061133e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 241, 'weight_decay': 0.03309433065761276, 'per_device_eval_batch_size': 16}. Best is trial 3 with value: 0.6759294417959156.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▁▃▅▆▇██
eval/loss,█▄▁▁▁▁▂
eval/runtime,▁▇▃▄▅▂█
eval/samples_per_second,█▂▆▅▄▇▁
eval/steps_per_second,█▂▆▅▄▇▁
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▆▅▄▂▂▁
train/total_flos,▁

0,1
eval/f1,0.67593
eval/loss,1.4795
eval/runtime,30.7146
eval/samples_per_second,1211.152
eval/steps_per_second,75.697
train/epoch,7.0
train/global_step,16275.0
train/learning_rate,0.0
train/loss,0.4554
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,0.3264,1.553888,0.678504
2,0.2464,1.640012,0.675519
3,0.1805,1.707828,0.677402
4,0.1355,1.781306,0.677316
5,0.1105,1.834053,0.682261
6,0.1061,1.863115,0.683826
7,0.1588,1.871103,0.682751


[32m[I 2023-06-15 22:59:09,386][0m Trial 4 finished with value: 0.6827507978534815 and parameters: {'learning_rate': 1.2388692006618435e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 253, 'weight_decay': 0.06542097187746301, 'per_device_eval_batch_size': 16}. Best is trial 4 with value: 0.6827507978534815.[0m


0,1
eval/f1,▄▁▃▃▇█▇
eval/loss,▁▃▄▆▇██
eval/runtime,▆▁█▅▅▅▁
eval/samples_per_second,▃█▁▄▄▄█
eval/steps_per_second,▃█▁▄▄▄█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,█▅▃▂▁▁▃
train/total_flos,▁

0,1
eval/f1,0.68275
eval/loss,1.8711
eval/runtime,30.3733
eval/samples_per_second,1224.762
eval/steps_per_second,76.548
train/epoch,7.0
train/global_step,16275.0
train/learning_rate,0.0
train/loss,0.1588
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666656966, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,0.0853,2.09757,0.671165
2,0.0767,2.144343,0.675061
3,0.061,2.258623,0.673073
4,0.0473,2.325688,0.678121
5,0.041,2.364275,0.680753
6,0.0453,2.390846,0.680737
7,0.1052,2.37534,0.682636


[32m[I 2023-06-15 23:49:10,172][0m Trial 5 finished with value: 0.6826355819059462 and parameters: {'learning_rate': 2.080479240207874e-05, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 5, 'warmup_steps': 199, 'weight_decay': 0.01846062685291469, 'per_device_eval_batch_size': 16}. Best is trial 4 with value: 0.6827507978534815.[0m


0,1
eval/f1,▁▃▂▅▇▇█
eval/loss,▁▂▅▆▇██
eval/runtime,██▇▆▄▅▁
eval/samples_per_second,▁▁▂▃▅▄█
eval/steps_per_second,▁▁▂▃▅▄█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▆▅▃▂▁▁█
train/total_flos,▁

0,1
eval/f1,0.68264
eval/loss,2.37534
eval/runtime,30.3368
eval/samples_per_second,1226.233
eval/steps_per_second,76.64
train/epoch,7.0
train/global_step,13020.0
train/learning_rate,0.0
train/loss,0.1052
train/total_flos,3.4275514226688e+16


Epoch,Training Loss,Validation Loss,F1
1,0.0087,2.445317,0.684123
2,0.0061,2.508686,0.685239
3,0.0038,2.544679,0.683121
4,0.0032,2.577409,0.68615
5,0.0027,2.605425,0.684594
6,0.0037,2.626765,0.684006
7,0.0468,2.619718,0.683562


[32m[I 2023-06-16 00:39:03,685][0m Trial 6 finished with value: 0.6835624435402132 and parameters: {'learning_rate': 3.3600626494738905e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 5, 'warmup_steps': 21, 'weight_decay': 0.048934306251516936, 'per_device_eval_batch_size': 16}. Best is trial 6 with value: 0.6835624435402132.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▃▆▁█▄▃▂
eval/loss,▁▃▅▆▇██
eval/runtime,█▆▅▇▇█▁
eval/samples_per_second,▁▃▄▂▂▁█
eval/steps_per_second,▁▃▄▂▂▁█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▄▃▂▁
train/loss,▂▂▁▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.68356
eval/loss,2.61972
eval/runtime,30.3474
eval/samples_per_second,1225.805
eval/steps_per_second,76.613
train/epoch,7.0
train/global_step,13020.0
train/learning_rate,0.0
train/loss,0.0468
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333327028, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,0.0016,2.767132,0.683031
2,0.0017,2.851578,0.682564
3,0.0012,2.916034,0.68046
4,0.001,2.954668,0.683656
5,0.0008,3.0054,0.682499
6,0.0018,3.017778,0.682107
7,0.046,3.006298,0.681568


[32m[I 2023-06-16 01:35:53,713][0m Trial 7 finished with value: 0.681568347118281 and parameters: {'learning_rate': 6.157357545353556e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 202, 'weight_decay': 0.054449053387397736, 'per_device_eval_batch_size': 16}. Best is trial 6 with value: 0.6835624435402132.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,▇▆▁█▅▅▃
eval/loss,▁▃▅▆███
eval/runtime,▁▁███▇▇
eval/samples_per_second,██▁▁▁▂▂
eval/steps_per_second,██▁▁▁▂▂
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▁▁▁▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.68157
eval/loss,3.0063
eval/runtime,36.6567
eval/samples_per_second,1014.82
eval/steps_per_second,63.426
train/epoch,7.0
train/global_step,16275.0
train/learning_rate,0.0
train/loss,0.046
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Epoch,Training Loss,Validation Loss,F1
1,0.0005,3.078765,0.685397
2,0.0004,3.131454,0.683675
3,0.0002,3.140082,0.685281
4,0.0002,3.173237,0.684737
5,0.0002,3.202909,0.684679
6,0.0003,3.209505,0.682769
7,0.0383,3.205823,0.682474


[32m[I 2023-06-16 02:39:40,225][0m Trial 8 finished with value: 0.6824744700781978 and parameters: {'learning_rate': 3.0654403316623304e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 127, 'weight_decay': 0.08548994314004767, 'per_device_eval_batch_size': 16}. Best is trial 6 with value: 0.6835624435402132.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1,█▄█▆▆▂▁
eval/loss,▁▄▄▆███
eval/runtime,█▆▁▄▆▇▁
eval/samples_per_second,▁▃█▅▃▂█
eval/steps_per_second,▁▃█▅▃▂█
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▁▁▁▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.68247
eval/loss,3.20582
eval/runtime,36.725
eval/samples_per_second,1012.935
eval/steps_per_second,63.308
train/epoch,7.0
train/global_step,21700.0
train/learning_rate,0.0
train/loss,0.0383
train/total_flos,3.4275514226688e+16


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Epoch,Training Loss,Validation Loss,F1
1,0.0001,3.257234,0.686035
2,0.0001,3.297982,0.686063
3,0.0,3.319265,0.685849
4,0.0,3.345392,0.685508
5,0.0,3.381916,0.684831
6,0.0001,3.388909,0.683458
7,0.034,3.381931,0.682824


[32m[I 2023-06-16 03:49:02,896][0m Trial 9 finished with value: 0.6828236101795916 and parameters: {'learning_rate': 1.093274712731568e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 2, 'warmup_steps': 140, 'weight_decay': 0.07056041638210678, 'per_device_eval_batch_size': 16}. Best is trial 6 with value: 0.6835624435402132.[0m


0,1
eval/f1,███▇▅▂▁
eval/loss,▁▃▄▆███
eval/runtime,█▃▇▂▆▁▇
eval/samples_per_second,▁▆▂▇▃█▂
eval/steps_per_second,▁▆▂▇▃█▂
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███
train/global_step,▁▁▂▂▃▃▅▅▆▆▇▇███
train/learning_rate,█▇▆▅▃▂▁
train/loss,▁▁▁▁▁▁█
train/total_flos,▁

0,1
eval/f1,0.68282
eval/loss,3.38193
eval/runtime,37.159
eval/samples_per_second,1001.104
eval/steps_per_second,62.569
train/epoch,7.0
train/global_step,32550.0
train/learning_rate,0.0
train/loss,0.034
train/total_flos,3.4275514226688e+16


roberta-base chosen hyperparameters:
{'learning_rate': 3.3600626494738905e-06, 'num_train_epochs': 7, 'seed': 9, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 5, 'warmup_steps': 21, 'weight_decay': 0.048934306251516936, 'per_device_eval_batch_size': 16}


## Model 2 Train on best hyperparameters

In [None]:
best_args = TrainingArguments(output_dir=output_dir,
                         overwrite_output_dir=True,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         seed = 9,
                         learning_rate=7.600882468235104e-06,
                         weight_decay=0.0511022637864972,
                         greater_is_better=True,
                         evaluation_strategy='epoch',
                         do_train=True,
                         num_train_epochs=20,
                         gradient_accumulation_steps=1,
                         logging_strategy='epoch',
                         warmup_steps=188,
                         report_to='wandb')

eval_args = TrainingArguments(output_dir=output_dir,
                         overwrite_output_dir=True,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                          seed = 9,
                         learning_rate=7.600882468235104e-06,
                         weight_decay=0.0511022637864972,
                         greater_is_better=True,
                         evaluation_strategy='epoch',
                         do_train=False,
                         gradient_accumulation_steps=1,
                         logging_strategy='epoch',
                         warmup_steps=188,
                         report_to='wandb')

roberta.train(best_args)

# save model's state dict
roberta.save_model('roberta_regular')



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Epoch,Training Loss,Validation Loss,F1
1,0.0117,3.663433,0.67686
2,0.0212,3.76952,0.674983
3,0.0283,3.862758,0.673112
4,0.0359,3.957393,0.676091
5,0.0494,4.000326,0.676022
6,0.072,3.980752,0.673859
7,0.1848,3.727416,0.674466
8,0.282,3.490899,0.676872


RuntimeError: [enforce fail at C:\cb\pytorch_1000000000000\work\caffe2\serialize\inline_container.cc:337] . unexpected pos 860796096 vs 860795984

## Best model summary:

In [None]:
roberta.model.load_state_dict(torch.load(Path(home_dir, 'best_roberta.py')))
roberta.evaluate(eval_args)
roberta.print_model_size()
roberta.print_model()

## a. Model 2 Pruning

In [None]:
pruned_roberta = Model(model_name_2, datasets, num_of_classes)
pruned_roberta.tokenize(token_args)

# Load the model's best parameters
pruned_roberta.model.load_state_dict(torch.load(Path(home_dir, 'roberta_regular.py')))

In [None]:
# Prune and train one epoch
pruned_bert.pruning(0.2)
train_one_epoch_args = TrainingArguments(output_dir=output_dir,
                             overwrite_output_dir=True,
                             greater_is_better=True,
                             evaluation_strategy='epoch',
                             do_train=True,
                             logging_strategy='epoch',
                             num_train_epochs=1,
                             save_strategy='no')

pruned_roberta.train(train_one_epoch_args)
pruned_roberta.save_model('roberta_pruning')

In [None]:
# Pruned model summary
pruned_roberta.evaluate(eval_args)
pruned_roberta.print_model_size()
pruned_roberta.print_model()

##b.Model 2 Quantization

In [None]:
# Load the quantized model

quantization_bert = Model(model_name_1, datasets, num_of_classes)
quantization_bert.tokenize(token_args)
quantization_bert.model.load_state_dict(torch.load(Path(home_dir, 'roberta_regular.pt')))

# Train the quantized model for additional epochs
train_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    num_train_epochs=5,  # Specify the desired number of additional epochs
    save_strategy='no'
)

quantization_bert.train(train_args)

# Save the trained quantized model's state dict
quantization_bert.save_model('quantized_reberta.pt')

## c. Model 2 Knowledge-Distillation

In [None]:
hidden_dim = 128
num_layers = 2
num_heads = 2
num_classes = 62
epochs = 1
alpha_teacher = 0.8
lr=0.001
loss_func = nn.MSELoss()

In [None]:
kl_roberta = Model(model_name_2, datasets, num_of_classes)
kl_roberta.tokenize(token_args)

# Load the model's best parameters
kl_roberta.model.load_state_dict(torch.load(Path(home_dir, 'best_roberta.py')))

In [None]:
roberta_student = kl_roberta.kl(hidden_dim,num_layers,num_heads,epochs, alpha_teacher, lr, loss_func)

In [None]:
roberta_student_model = Model(model_name_1, datasets, num_of_classes)
roberta_student_model.tokenize(token_args)
roberta_student_model.model = roberta_student

roberta_student_model.save_model('roberta_student')

In [None]:
# Student model summary
roberta_student_model.evaluate(eval_args)
roberta_student_model.print_model_size()
roberta_student_model.print_model()