In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input mount, one full path per line.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Identifier of the pretrained checkpoint; reused below for the tokenizer and
# for every sequence-classification model created in this notebook.
checkpoint = "google-bert/bert-base-uncased"

In [None]:
# Load the tokenizer that belongs to `checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Smoke test: tokenize one short sentence and display the returned encoding.
tokenizer("I Love you", truncation = True)

In [None]:
from datasets import load_dataset

In [None]:
# Load the "mrpc" configuration of the "glue" dataset.
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
# Display the loaded object (the cells below index it by split name).
raw_datasets

In [None]:
# Inspect the training split.
raw_datasets['train']

In [None]:
# Look at one training example; tokenizer_set below reads its
# "sentence1" and "sentence2" fields.
raw_datasets['train'][0]

In [None]:
def tokenizer_set(sentence):
    """Tokenize the two sentences of an example (or batch of examples) as one pair.

    Args:
        sentence: Mapping with "sentence1" and "sentence2" entries. Whatever is
            stored under those keys is handed to the tokenizer unchanged.

    Returns:
        The value returned by the module-level ``tokenizer`` when called with
        both sentences and truncation enabled.
    """
    first, second = sentence["sentence1"], sentence["sentence2"]
    return tokenizer(first, second, truncation=True)

In [None]:
# Sanity check: tokenize a single training example before mapping the whole dataset.
tokenizer_set(raw_datasets['train'][0])

In [None]:
# Apply tokenizer_set through `map`; with batched=True the function receives
# batches of examples instead of one example at a time.
tokenized_datasets = raw_datasets.map(tokenizer_set,batched=True)

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
# Collator that pads the examples of a batch using `tokenizer`; it is passed to
# the Trainer and to the DataLoaders further down.
datacollator = DataCollatorWithPadding(tokenizer = tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification

In [None]:
# Sequence-classification model initialised from `checkpoint`; this instance is
# the one handed to the Trainer below.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Training arguments with a single explicit value (passed positionally); everything else keeps
# the library defaults.
# NOTE(review): "ouput_model" looks like a typo for "output_model"; it is left
# unchanged here because the string is used at runtime.
training_args = TrainingArguments("ouput_model")

In [None]:
# Display the tokenized dataset object before handing it to the Trainer.
tokenized_datasets

In [None]:
# Display the model object.
model

In [None]:
# Wire the model, arguments, train/validation splits and padding collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=datacollator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Read credentials from Kaggle's secret store instead of hard-coding them.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
# Secret stored under the "hugging_face_token" label; no other visible cell reads it.
secret_value_0 = user_secrets.get_secret("hugging_face_token")
# Secret stored under the "wandb" label; passed to wandb.login below.
secret_value_1 = user_secrets.get_secret("wandb")

In [None]:
import wandb

In [None]:
# Authenticate with Weights & Biases using the key fetched from the Kaggle secrets.
wandb.login(key=secret_value_1)


In [None]:
# Start a run named "run1" in the 'PractiseNLP' project.
wandb.init(name="run1",project='PractiseNLP')

In [None]:
# Fine-tune `model` with the Trainer configured above.
trainer.train()

In [None]:
# Run the trained model over the validation split; the result exposes
# `.predictions` and `.label_ids`, both used below.
predictions = trainer.predict(tokenized_datasets['validation'])

In [None]:
# Peek at the first three entries of the raw predictions.
predictions.predictions[:3]

In [None]:
import numpy as np
# Predicted class = position of the largest score along the last axis.
preds = np.asarray(predictions.predictions).argmax(axis=-1)

In [None]:
# Shape of the predicted-class array.
preds.shape

In [None]:
# Shape of the reference labels, for comparison with the shape shown above.
predictions.label_ids.shape

In [None]:
!pip install evaluate

In [None]:
import evaluate

In [None]:
# Load the "glue"/"mrpc" metric and display the returned object.
evaluate.load("glue", "mrpc")

In [None]:
# Score the argmax predictions against the reference labels.
metric = evaluate.load("glue", "mrpc")
results = metric.compute(predictions =preds,references = predictions.label_ids )

In [None]:
# Display the computed metric values.
results

Training manually without the `Trainer` class

In [None]:
# Drop the raw-text and index columns so only the tokenizer outputs and the
# label remain.
# NOTE(review): this cell rebinds `tokenized_datasets`, so re-running it acts on
# the already-trimmed dataset — presumably an error because the columns are
# gone; confirm.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1","sentence2","idx"])

In [None]:
# Display the dataset after the column removal.
tokenized_datasets

In [None]:
# Rename "label" to "labels"; batches are later unpacked into the model with
# `**data` and the loss is read from the output — presumably the model expects
# the `labels` keyword for that; confirm.
tokenized_datasets = tokenized_datasets.rename_column("label","labels")

In [None]:
# Switch the dataset's output format to "torch" for the DataLoaders below.
tokenized_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader
# Shuffled training batches of 8 examples, collated by `datacollator`.
train_dl = DataLoader(tokenized_datasets["train"],shuffle=True,batch_size = 8,collate_fn = datacollator)

In [None]:
# Manual iterator over the dataloader; the next cell pulls one batch from it.
iter_data = iter(train_dl)

In [None]:
# Shape of the `input_ids` entry of the next batch.
next(iter_data).input_ids.shape

In [None]:
# Set the training Arguments

# Number of passes over the training dataloader in the manual loops below.
number_epochs = 3

# Used to compute the number of steps per epoch.
# NOTE(review): the DataLoader cells hard-code batch_size = 8 instead of reading
# this variable — keep the two values in sync.
batch_size = 8 

In [None]:
import math

# Batches per epoch (the last one may be partial), then total optimizer steps.
train_rows = tokenized_datasets["train"].num_rows
steps_per_epoch = math.ceil(train_rows / batch_size)
total_steps = number_epochs * steps_per_epoch
# Cross-check against the length reported by the dataloader itself.
print(total_steps, number_epochs * len(train_dl))

In [None]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5)

In [None]:
# Build the learning schedule

from transformers import get_scheduler

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
#Define the model
from transformers import AutoModelForSequenceClassification
model_new = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model_new.to(device)


In [None]:
import torch

# Preview the learning-rate curve on a throwaway optimizer/scheduler pair.
# Stepping the real `lr_scheduler` here would consume the whole schedule and
# leave the learning rate at 0 before training has even started.
preview_optimizer = AdamW([torch.nn.Parameter(torch.zeros(1))], lr=optimizer.defaults["lr"])
preview_scheduler = get_scheduler(
    "linear",
    optimizer=preview_optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

lrs = []
for step in range(total_steps):
    # No gradients exist, so this updates nothing; it only keeps the
    # optimizer-then-scheduler call order that PyTorch expects.
    preview_optimizer.step()
    preview_scheduler.step()
    lrs.append(preview_scheduler.get_last_lr()[0])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plot the previewed learning rate per step; title and axis labels make the
# figure readable on its own, and the trailing ";" hides the return-value repr.
fig, ax = plt.subplots()
ax.plot(lrs)
ax.set(title="Learning-rate schedule", xlabel="Optimizer step", ylabel="Learning rate");

In [None]:
import torch

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the training DataLoader object that the loop below iterates over.
train_dl

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(total_steps))
loss_range = []
for epoch in range(number_epochs):
    #print("Starting Epoch:",epoch)
    total_loss = 0
    for data in train_dl:
        optimizer.zero_grad()
        data = {k:v.to(device) for k,v in data.items()}
        loss = model_new(**data).loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        total_loss += loss.item()
        progress_bar.update(1)
    loss_range.append(total_loss/)
        

In [None]:
[i/total_steps for i in loss_range]


In [None]:
from torch.utils.data import DataLoader
# Evaluation batches keep the dataset order (shuffle=False): shuffling brings
# nothing to evaluation and would make the batch inspected below differ between runs.
eval_dataloader = DataLoader(tokenized_datasets["validation"],shuffle=False,batch_size = 8,collate_fn = datacollator)

In [None]:
from accelerate import Accelerator
from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification, get_scheduler

accelerator = Accelerator()

# Start again from the pretrained checkpoint with a fresh optimizer.
model_new = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model_new.parameters(), lr=3e-5)

# Hand dataloaders, model and optimizer to Accelerate; the loop below uses the
# returned objects.
training_dl, eval_dl, model_new, optimizer = accelerator.prepare(
    train_dl, eval_dataloader, model_new, optimizer
)

# Any scheduler built earlier is tied to the previous optimizer, so stepping it
# would never change the learning rate of this one. Build a new schedule for the
# new optimizer, sized from the prepared dataloader.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=number_epochs * len(training_dl),
)

In [None]:
# Pull one batch from the Accelerate-prepared training dataloader and display it.
next(iter(training_dl))

In [None]:
from tqdm import tqdm

In [None]:
progress_bar = tqdm(range(total_steps))
# from_pretrained hands the model back in evaluation mode; enable dropout for training.
model_new.train()
for epoch in range(number_epochs):
    for data in training_dl:
        output = model_new(**data)
        loss = output.loss
        # Accelerate runs the backward pass in place of loss.backward().
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
# Return to evaluation mode so the prediction cells that follow run without
# dropout; the trailing ";" keeps the model repr out of the cell output.
model_new.eval();
        

In [None]:
# A bare iter() raises TypeError (it needs an iterable) and aborts "Run All".
# Build an iterator over the validation dataloader instead; the next cell draws
# a batch from the same dataloader.
iter(eval_dataloader)

In [None]:
# Fetch one batch from the validation dataloader and display it.
next(iter(eval_dataloader))

In [None]:
import torch

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Inference on a single validation batch: make sure dropout is off and skip
# gradient tracking, which is only needed for training.
model_new.eval()
with torch.no_grad():
    predictions_new = model_new(**next(iter(eval_dataloader)).to(device))

In [None]:
# Take the 'logits' entry of the model output for the batch.
logits = predictions_new['logits']

In [None]:
# Index of the largest logit along dim 1, i.e. the predicted class per row.
logits.argmax(dim = 1)