In [None]:
# Loading Libraries

In [None]:
# Install the notebook's dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter, unlike a bare `!pip`).
%pip install -q datasets
%pip install --upgrade transformers
# The version specifier must be quoted: unquoted, the shell treats `>` as an
# output redirect, so pip installs an unconstrained `accelerate` and writes
# its output to a file named "=0.20.1".
%pip install -q "accelerate>=0.20.1"
%pip install --quiet nlpaug

In [None]:
import torch
import datasets
from datasets import load_dataset,Dataset
import transformers
from transformers import TrainingArguments
import warnings
import pandas as pd
warnings.filterwarnings("ignore")

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.flow as nafc
from nlpaug.util import Action

In [None]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

In [None]:
# Load the "carblacac/twitter-sentiment-analysis" dataset via `datasets.load_dataset`.
ds=load_dataset("carblacac/twitter-sentiment-analysis")

In [None]:
# Rename the target column from 'feeling' to 'label' (the column name the rest
# of the notebook reads). The guard makes the cell idempotent: re-running it
# after the rename is a no-op instead of failing on the missing 'feeling' column.
if 'feeling' in ds['train'].column_names:
    ds=ds.rename_column('feeling','label')
ds

In [None]:
import os

from nlpaug.util.file.download import DownloadUtil

# GloVe vector file consumed by the word-embedding augmenter below.
GLOVE_PATH = 'glove.6B.300d.txt'

# Only fetch the glove.6B archive when the vector file is not already in the
# working directory, so re-running the cell does not repeat the download.
if not os.path.exists(GLOVE_PATH):
    DownloadUtil.download_glove(dest_dir = '.', model_name = 'glove.6B')

# Word-level augmenter configured to substitute words using the GloVe embeddings.
aug = naw.WordEmbsAug(
  model_type = 'glove',
  model_path = GLOVE_PATH,
  action = "substitute")

In [None]:
from tqdm import tqdm
def augment_data(aug_strategy,n,train_df):
    """Return ``train_df`` with augmented copies of its tweets appended.

    Args:
        aug_strategy: Name of the augmentation strategy. Only ``'glove'`` is
            implemented (it uses the module-level ``aug`` augmenter); any
            other value appends no rows.
        n: Number of augmented variants requested from the augmenter per tweet.
        train_df: DataFrame with ``'text'`` and ``'label'`` columns.

    Returns:
        A new DataFrame holding the original rows followed by the augmented
        rows; every augmented text keeps the label of the tweet it came from.
        The indices of both parts are concatenated as-is (not reset).
    """
    text=train_df['text'].values
    label=train_df['label'].values
    augmented_rows = []

    if aug_strategy == 'glove':
        # Iterate over the values positionally. Indexing these positional
        # arrays with the frame's index *labels* fails (or silently picks the
        # wrong row) as soon as the index is not 0..len-1, e.g. after
        # filtering or shuffling the frame.
        for tweet, tweet_label in tqdm(zip(text, label), total=len(train_df), desc="completed"):
            variants = aug.augment(tweet, n = n)
            # Older nlpaug releases may return a bare string rather than a
            # list when a single variant is requested; normalise so we never
            # iterate over the characters of a string.
            if isinstance(variants, str):
                variants = [variants]
            augmented_rows.extend((variant, tweet_label) for variant in variants)

    df_augmented_data = pd.DataFrame(augmented_rows, columns = ['text', 'label'])
    train_df_augmented = pd.concat([train_df, df_augmented_data], axis = 0)

    return train_df_augmented

In [None]:
# Switch the train split to pandas output (in place), then keep its first
# 500 rows as a DataFrame for augmentation.
ds['train'].set_format(type='pandas')
train_df = ds['train'][:500]
print(train_df.head())

In [None]:
# Request two GloVe-based variants per tweet and append them to the training frame.
train_df_augmented = augment_data(
    aug_strategy='glove',
    n=2,
    train_df=train_df,
)
# Row counts before and after augmentation.
len(train_df), len(train_df_augmented)

In [None]:
# Convert the augmented DataFrame into a `datasets.Dataset` for the tokenization step.
train_augmented = Dataset.from_pandas(train_df_augmented)

# Tokenizing Dataset

In [None]:
from transformers import AutoTokenizer

In [None]:
# Checkpoint identifier of the student model; its tokenizer is loaded from the same identifier.
student_name = 'huawei-noah/TinyBERT_General_4L_312D'
student_tokenizer = AutoTokenizer.from_pretrained(student_name)

In [None]:
# Display the tokenizer's `model_input_names` attribute.
student_tokenizer.model_input_names

In [None]:
def tokenize_text(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the student tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    texts = batch['text']
    return student_tokenizer(texts, truncation=True)

In [None]:
# Tokenize the augmented training set and the validation split in batches.
train_tokenized=train_augmented.map(tokenize_text,batched=True)
valid_tokenized=ds['validation'].map(tokenize_text,batched=True)
# Evaluate on a small random subset of the validation split. The shuffle is
# seeded so the same examples (and therefore comparable metrics) are used on
# every run, and the subset size is clamped so a validation split with fewer
# than 200 rows does not raise.
small_valid= valid_tokenized.shuffle(seed=42).select(range(min(200, len(valid_tokenized))))

In [None]:
# Drop the "__index_level_0__" column left over from the pandas conversion.
# The column is only removed when it is actually present, so the cell can be
# re-run and also works when the conversion produced no such column.
if "__index_level_0__" in train_tokenized.column_names:
    train_tokenized=train_tokenized.remove_columns(["__index_level_0__"])
train_tokenized

# Initializing Student Model

In [None]:
import torch
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification

student_name='huawei-noah/TinyBERT_General_4L_312D'

# Number of target classes; used for the student's classification head so the
# value is defined in exactly one place.
num_labels = 2
student_config = AutoConfig.from_pretrained(student_name, num_labels=num_labels)

In [None]:
def student_init():
  """Build a fresh student classifier from ``student_name``/``student_config`` on ``device``."""
  student = AutoModelForSequenceClassification.from_pretrained(
      student_name,
      config=student_config,
  )
  return student.to(device)

# Initializing Teacher Model

In [None]:
# Directory the teacher model is loaded from. This is an absolute path under a
# mounted Google Drive (Colab layout); adjust it when running elsewhere.
# NOTE(review): presumably a fine-tuned BERT checkpoint, judging by "save_bert" — confirm.
teacher_name='/content/drive/MyDrive/Colab_Notebooks/Knowledge_Disillation/save_bert/'

In [None]:
# Load the two-label teacher classifier from `teacher_name`, then move it to the selected device.
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_name,
    num_labels=2,
)
teacher_model = teacher_model.to(device)

# Defining Loss Function

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import Trainer

In [None]:
class KnowledgeDistillationTrainer(Trainer):
  """``Trainer`` whose loss mixes the student's own loss with a distillation term.

  The total loss is
  ``alpha * loss_ce + (1 - alpha) * T**2 * KL(teacher_T || student_T)``
  where ``loss_ce`` is the loss returned by the student's forward pass, the
  ``*_T`` distributions are softmaxes of the logits divided by the
  temperature ``T``, and ``alpha`` / ``T`` are read from ``self.args``.
  """

  def __init__(self, *args, teacher_model=None, **kwargs):
    """Forward everything to ``Trainer`` and keep a reference to the teacher.

    Args:
      teacher_model: Model whose logits the student is trained to match.
        It is only used for inference and is switched to eval mode here.
    """
    super().__init__(*args, **kwargs)
    self.teacher_model = teacher_model
    if self.teacher_model is not None:
      # The teacher only supplies targets, so keep dropout disabled.
      self.teacher_model.eval()

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the weighted sum of the student loss and the distillation loss.

    Args:
      model: The student model being trained.
      inputs: Batch dict passed as keyword arguments to both student and teacher.
      return_outputs: When true, also return the student's outputs.
      num_items_in_batch: Accepted and ignored. Newer ``Trainer`` releases
        pass it as a keyword argument, so the override must tolerate it.

    Returns:
      ``loss`` or ``(loss, outputs_student)`` depending on ``return_outputs``.
    """
    outputs_student = model(**inputs)
    loss_ce = outputs_student.loss
    logits_student = outputs_student.logits

    # The teacher is a fixed target: run it without building an autograd
    # graph so no memory is spent on, and no gradients accumulate in, its
    # parameters during the backward pass.
    with torch.no_grad():
      logits_teacher = self.teacher_model(**inputs).logits

    temperature = self.args.temperature
    loss_fct = nn.KLDivLoss(reduction="batchmean")
    # Scaling by T**2 keeps the magnitude of the soft-target term comparable
    # across temperatures.
    loss_kd = temperature ** 2 * loss_fct(
                F.log_softmax(logits_student / temperature, dim=-1),
                F.softmax(logits_teacher / temperature, dim=-1))

    # Return weighted student loss
    loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
    return (loss, outputs_student) if return_outputs else loss

# Training Model

In [None]:
import numpy as np

try:
    # `load_metric` was removed in datasets 3.0. Keep using it when the
    # installed release still provides it so results on older installs are
    # unchanged; otherwise fall back to the NumPy computation below.
    from datasets import load_metric
    accuracy_score = load_metric("accuracy",trust_remote_code=True)
except ImportError:
    accuracy_score = None


def compute_metrics(pred):
  """Compute classification accuracy for a ``(predictions, labels)`` pair.

  Args:
    pred: Pair of ``(logits, labels)``; the predicted class is the argmax of
      the logits along axis 1.

  Returns:
    A dict with a single ``"accuracy"`` entry.
  """
  logits, labels = pred
  predicted_classes = np.argmax(logits, axis=1)
  if accuracy_score is not None:
    return accuracy_score.compute(predictions=predicted_classes, references=labels)
  # Fallback when `load_metric` is unavailable: fraction of matching labels.
  return {"accuracy": float(np.mean(predicted_classes == np.asarray(labels)))}

In [None]:
class KnowledgeDistillationTrainingArguments(TrainingArguments):
  """``TrainingArguments`` plus the two distillation hyperparameters.

  Attributes:
    alpha: Weight of the student's own loss in the combined loss
      (the distillation term gets ``1 - alpha``). Defaults to 0.5.
    temperature: Divisor applied to the logits before the softmax in the
      distillation term. Defaults to 2.0.
  """

  def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
    # Everything except the two extra keywords is handled by the base class.
    super().__init__(*args, **kwargs)
    self.alpha, self.temperature = alpha, temperature

In [None]:
import dataclasses

batch_size=32
finetuned_student_ckpt="/content/drive/MyDrive/Colab_Notebooks/Knowledge_Disillation/tinybert-finetuned-sentiment/"

# Newer transformers releases name this argument `eval_strategy`, older ones
# `evaluation_strategy`. Use whichever field the installed TrainingArguments
# declares so the cell works on both.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
eval_strategy_arg = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

student_training_args = KnowledgeDistillationTrainingArguments(
    output_dir=finetuned_student_ckpt,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # NOTE(review): with the loss `alpha * loss_ce + (1 - alpha) * loss_kd`,
    # alpha=1 gives the distillation term zero weight, so the teacher has no
    # effect on training. Confirm this is intended (e.g. as a baseline run).
    alpha=1,
    weight_decay=0.01,
    disable_tqdm=False,
    # Number of full batches in the training set; clamped to at least 1 so a
    # dataset smaller than one batch does not produce logging_steps=0.
    logging_steps=max(1, len(train_tokenized)//batch_size),
    log_level='error',
    # Evaluate at the end of every epoch.
    **{eval_strategy_arg: "epoch"},
)

In [None]:
# Assemble the distillation trainer: student factory, teacher, training
# arguments, datasets, metric function and tokenizer.
tinybert_trainer = KnowledgeDistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=train_tokenized,
    eval_dataset=small_valid,
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)

In [None]:
# Run training with the configured trainer; the call's return value is displayed as the cell output.
tinybert_trainer.train()

# Save Model

In [None]:
# Save the trainer's model to the `save_tinybert` directory under the mounted Drive path.
tinybert_trainer.save_model('/content/drive/MyDrive/Colab_Notebooks/Knowledge_Disillation/save_tinybert/')