<a href="https://colab.research.google.com/github/peravali810/FinetuningLLMs/blob/main/Finetuning_DistilBERT_TinyBERT_MobileBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers accelerate datasets bertviz umap-learn seaborn openpyxl evaluate

import warnings
warnings.filterwarnings('ignore')

# Loading the dataset

In [None]:
import pandas as pd

# Excel workbook with the fake-news data, read directly from GitHub.
DATA_URL = "https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download/raw/master/fake_news.xlsx"
df = pd.read_excel(DATA_URL)

In [None]:

# Discard every row that has at least one missing field.
df = df.dropna(axis=0, how='any')

# Per-column count of remaining nulls (displayed as the cell output).
df.isna().sum()

In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Number of rows for each value of the `label` column.
df['label'].value_counts()

# Dataset Analysis

In [None]:
import matplotlib.pyplot as plt

# Per-class row counts, ordered from the rarest class to the most frequent.
labels = df['label'].value_counts(ascending=True)

# Horizontal bar chart of the class balance.
labels.plot(kind='barh')
plt.show()

In [None]:
def _approx_token_count(s):
    """Rough token estimate for a string: 1.5 tokens per whitespace-separated word."""
    return len(s.split()) * 1.5


# Store the estimates next to the raw text so they can be plotted.
df['title_tokens'] = df['title'].apply(_approx_token_count)
df['text_tokens'] = df['text'].apply(_approx_token_count)

# Side-by-side histograms: titles on the left, article bodies on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
title_ax, text_ax = ax

title_ax.hist(df['title_tokens'], bins=50, color='skyblue')
title_ax.set_title('Title Tokens')

text_ax.hist(df['text_tokens'], bins=50, color='red')
text_ax.set_title('Text Tokens')

In [None]:
# Show the frame, which now also carries the title_tokens / text_tokens columns.
df

# Creating the Train/Validation/Test Split and the DatasetDict

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test/validation membership is identical on every
# run; without it each execution reshuffles the rows and the reported metrics
# are not comparable between runs or between models.
SEED = 42

# 70% train / 30% hold-out, stratified on the label.
train, test = train_test_split(
    df, test_size=0.3, stratify=df['label'], random_state=SEED
)
# Split the hold-out 2:1, i.e. 20% test and 10% validation of the full data.
test, validation = train_test_split(
    test, test_size=1/3, stratify=test['label'], random_state=SEED
)

# Sanity check: the three split sizes next to the size of the full frame.
train.shape, test.shape, validation.shape, df.shape

In [None]:
from datasets import Dataset, DatasetDict

# Pandas frames for each split, keyed by the split name used downstream.
splits = {'train': train, 'test': test, 'validation': validation}

# Convert each frame to a Dataset, discarding the pandas index.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in splits.items()
})

dataset

# Creating data tokens (Tokenization)

In [None]:
from transformers import AutoTokenizer

In [None]:
text = "Hi, how are you ?"

# One tokenizer per candidate checkpoint.
dtokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
mtokenizer = AutoTokenizer.from_pretrained('google/mobilebert-uncased')

# `model_ckpt` is left pointing at the last checkpoint loaded in this cell.
model_ckpt = 'Intel/dynamic_tinybert'
ttokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize the same sample sentence with each tokenizer for comparison.
dtokens, mtokens, ttokens = (
    tok.tokenize(text) for tok in (dtokenizer, mtokenizer, ttokenizer)
)

In [None]:
def tokenize(batch):
    """Encode the `title` field of a batch with the DistilBERT tokenizer.

    Args:
        batch: mapping with a 'title' entry holding the texts to encode.

    Returns:
        The tokenizer output for those titles, padded and truncated.
    """
    return dtokenizer(batch['title'], truncation=True, padding=True)


# Smoke test on the first two training rows.
print(tokenize(dataset['train'][:2]))

In [None]:
# Tokenize every split. batched=True with batch_size=None hands each split to
# `tokenize` as one single batch.
enDataset = dataset.map(tokenize, batched=True, batch_size=None)

# Building the model

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch

# Class-id <-> class-name mappings stored in the model config.
id2label = {0: 'Real', 1: 'Fake'}
label2id = {name: idx for idx, name in id2label.items()}

model_ckpt = 'distilbert-base-uncased'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Checkpoint config extended with the label mappings, then the classifier itself.
config = AutoConfig.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)



In [None]:
# Inspect the configuration attached to the loaded model.
model.config

In [None]:
import inspect

from transformers import TrainingArguments

batch_size = 64
training_dir = "bert_base_train_dir"

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy`, and newer transformers releases no longer accept the old
# spelling. The install cell does not pin a version, so use whichever keyword
# the installed TrainingArguments actually exposes.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = (
    "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=training_dir,
    overwrite_output_dir=True,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    disable_tqdm=False,
    report_to="none",
    # Evaluate on the validation split at the end of every epoch.
    **{_eval_strategy_kw: "epoch"},
)

In [None]:
import evaluate
import numpy as np

# Accuracy metric object from the `evaluate` library.
accuracy = evaluate.load("accuracy")

def eval_compute_metrics(eval_pred):
  """Compute accuracy for one evaluation pass.

  Args:
    eval_pred: a (predictions, labels) pair; predictions hold one row of
      scores per example.

  Returns:
    Whatever `accuracy.compute` returns for the arg-max predictions.
  """
  scores, references = eval_pred
  # Predicted class = column index of the largest score in each row.
  predicted_ids = np.argmax(scores, axis=1)
  return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import Trainer

# Train on the tokenized train split, evaluate on the validation split, and
# score each evaluation with `eval_compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=enDataset['train'],
    eval_dataset=enDataset['validation'],
    tokenizer=dtokenizer,
    compute_metrics=eval_compute_metrics,
)

In [None]:
# Run fine-tuning with the settings held in `training_args`.
trainer.train()

# Model Evaluation

In [None]:
# Run the trained model over the tokenized test split.
OutPreds =  trainer.predict(enDataset['test'])
# Metrics reported for that prediction run (shown as the cell output).
OutPreds.metrics

In [None]:
# Predicted class = index of the highest score in each prediction row.
y_pred = OutPreds.predictions.argmax(axis=1)
# Reference labels of the test split.
y_true = enDataset['test'][:]['label']

In [None]:
from sklearn.metrics import classification_report

# Class names ordered by their integer id so they line up with the report rows.
class_names = sorted(label2id, key=label2id.get)
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

# Benchmarking all models

In [None]:

# Short model name -> Hugging Face checkpoint id for each model to benchmark.
modelDict = dict(
    distilbert='distilbert-base-uncased',
    mobilebert='google/mobilebert-uncased',
    tinybert='Intel/dynamic_tinybert',
)

def train(model):
  """Fine-tune one checkpoint from `modelDict` on the titles and score it on the test split.

  NOTE: defining this function rebinds the module-level name `train`, which
  the split cell also uses for the training DataFrame.

  Args:
    model: key into `modelDict` selecting the checkpoint to fine-tune.

  Returns:
    The `metrics` attribute of `trainer.predict` run on the tokenized test split.

  Raises:
    KeyError: if `model` is not a key of `modelDict`.
  """
  model_ckpt = modelDict[model]
  tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

  # Build the config from THIS checkpoint. Reusing the global `config` (created
  # from 'distilbert-base-uncased') would make the Auto class instantiate a
  # DistilBERT architecture for every entry, so the MobileBERT and TinyBERT
  # weights would never be loaded and the benchmark would compare nothing.
  localconfig = AutoConfig.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
  localmodel = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=localconfig).to(device)

  def localtokenize(batch):
    # Encode the titles with the tokenizer that belongs to this checkpoint.
    return tokenizer(batch['title'], padding = True, truncation = True)

  # Re-tokenize all splits, since each checkpoint has its own tokenizer.
  enDataset = dataset.map(localtokenize, batched=True, batch_size=None)

  trainer = Trainer(
    model = localmodel,
    args = training_args,
    compute_metrics = eval_compute_metrics,
    train_dataset = enDataset['train'],
    eval_dataset = enDataset['validation'],
    tokenizer = tokenizer
  )

  trainer.train()
  OutPreds =  trainer.predict(enDataset['test'])
  print(f"Training completed: {model}")
  return OutPreds.metrics


# Test-split metrics per model key.
performance = {}
# The loop variable is `model_name`, not `model`, so the fine-tuned classifier
# bound to the global `model` is not replaced by a plain string.
for model_name in modelDict:
  print(f"Training model: {model_name}")
  performance[model_name] = train(model_name)

In [None]:
# Test-split metrics collected per model key by the benchmarking loop.
performance