In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn import model_selection, metrics
import transformers
import torch

In [3]:
import warnings
# NOTE(review): this silences every warning category for the whole kernel,
# so deprecation notices raised by the libraries used below are hidden too.
# Consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

In [4]:
"""
Important steps:
* Create Dataset class
* Build Model
* Trainer - training arguments
"""

'\nImportant steps:\n* Create Dataset class\n* Build Model\n* Trainer - training arguments\n'

In [5]:
# Single place for every tunable value used by the cells below.
config = dict(
    # tokenisation length and the checkpoint to load
    max_length=360,
    model_path="microsoft/xtremedistil-l6-h256-uncased",
    # output location and optimisation hyper-parameters
    output_dir="./my-model",
    train_batch_size=64,
    valid_batch_size=64,
    learning_rate=3e-5,
    epochs=3,
    # when True, the data-loading cell subsamples the reviews to 10,000 rows
    debug=False,
)

In [6]:
class TextDataset:
    """Map-style dataset that tokenises one DataFrame row per item.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``"text"`` column (passed to the tokenizer) and a
        ``"label"`` column (wrapped in a tensor).
    tokenizer : callable, optional
        Called as ``tokenizer(text, max_length=..., truncation=True,
        padding="max_length")`` and expected to return a mapping with
        ``"input_ids"`` and ``"attention_mask"``. When omitted, the
        module-level ``tokenizer`` is looked up at item-access time.
    max_length : int, optional
        Length every sequence is truncated/padded to. When omitted,
        ``config["max_length"]`` is read at item-access time.
    """

    def __init__(self, data, tokenizer=None, max_length=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenise the row at position ``idx`` and return it as tensors.

        Returns a dict with the keys ``"input_ids"``, ``"attention_mask"``
        and ``"label"``.
        """
        # Positional lookup: the frame may carry a shuffled index after a split.
        row = self.data.iloc[idx]

        # Resolve the fallbacks lazily so `TextDataset(frame)` keeps working
        # even when the global tokenizer is created in a later cell.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        max_len = self.max_length if self.max_length is not None else config["max_length"]

        enc = tok(
            row["text"],
            max_length=max_len,
            truncation=True,
            padding="max_length",
        )

        return {
            "input_ids": torch.tensor(enc["input_ids"]),
            "attention_mask": torch.tensor(enc["attention_mask"]),
            "label": torch.tensor(row["label"]),
        }

In [7]:
# Load the IMDB reviews; "review" is renamed to "text", the column name
# that TextDataset reads.
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv").rename(columns = {"review":"text"})

id2label = {0: "negative", 1: "positive"}
label2id = {label: id_ for id_, label in id2label.items()}

# Series.map leaves NaN for any value missing from the mapping, which would
# silently turn the label column into floats. Fail loudly instead.
df["label"] = df["sentiment"].map(label2id)
unmapped = df["label"].isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected sentiment values: {sorted(df.loc[unmapped, 'sentiment'].astype(str).unique())}"
    )
df["label"] = df["label"].astype("int64")

if config["debug"]:
    print("DEBUG MODE!")
    # Cap at the frame length so a smaller input file does not make sample() raise.
    df = df.sample(min(10_000, len(df)), random_state=123)

print(df.shape)
df.head()

(50000, 3)


Unnamed: 0,text,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [8]:
# Load the tokenizer for the checkpoint named in config["model_path"].
# TextDataset.__getitem__ looks this name up as a global, so this cell must
# run before any dataset item is accessed.
tokenizer = transformers.AutoTokenizer.from_pretrained(config["model_path"])

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [9]:
# Hold out 20% of the rows for validation; the split is shuffled with a
# fixed seed and stratified on the label column.
train, valid = model_selection.train_test_split(
    df,
    stratify=df["label"],
    shuffle=True,
    test_size=0.2,
    random_state=1123,
)

In [10]:
# Wrap each split so it yields tokenised tensors on item access.
train_ds, valid_ds = TextDataset(train), TextDataset(valid)

In [11]:
# Sanity check: tokenise the first validation row and display its tensors.
valid_ds[0]

{'input_ids': tensor([  101,  1045,  2066,  4393,  5691,  1010,  1045,  2066,  1038,  1011,
          5691,  1010,  1045,  2293,  1038,  4393,  5691,  1012,  2021,  2023,
          2028,  2038,  3053,  2498,  2183,  2005,  2009,  1012,  2070,  1997,
          1996,  3772,  2003,  9202,  1010,  2926,  2011,  1017,  1997,  1996,
          3287,  5260,  1012,  1996,  2466,  2003,  2025,  3327,  5875,  1012,
          2012,  1037,  5816,  2460,  6070,  2781,  2009,  2145,  3849,  2205,
          2146,  1998,  2017,  1005,  2222,  2424,  4426,  3435,  1011,  2830,
          2075,  3243,  1037,  2978,  1012,  2045,  2024,  2019,  9643,  2843,
          1997, 18577,  1011, 11865,  4393,  4491,  1012,  2614,  4658,  1029,
          2009,  3475,  1005,  1056,  2043,  2009,  1005,  1055,  2589,  2006,
          1037,  2659,  5166,  1012,  2009,  4152, 23563,  2200,  2855,  1012,
          2045,  2003,  2070,  3576,  2668,  1998, 13638,  1010,  2498,  2000,
          2131,  7568,  2055,  1012,  2

In [12]:
# Pass the label vocabulary along with the checkpoint so the model config
# (and anything saved from it) carries "negative"/"positive" instead of
# generic label names. num_labels is derived from the same mapping so the
# classifier head size cannot drift from it.
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    config["model_path"],
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def compute_metrics(eval_data):
    """Score one evaluation pass with F1.

    ``eval_data`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the arg-max over the last axis of the logits.

    Returns a single-entry dict keyed ``"f1"``.
    """
    scores, targets = eval_data
    predicted = np.argmax(scores, axis=-1)
    f1 = metrics.f1_score(targets, predicted)
    return {"f1": f1}

In [14]:
# Training configuration, driven by the values in `config`.
# NOTE(review): load_best_model_at_end is on but no metric_for_best_model is
# given, so the library default decides which checkpoint counts as "best".
# Confirm that this is intended rather than the F1 from compute_metrics.
training_args = transformers.TrainingArguments(
    # where checkpoints and logs go
    output_dir=config["output_dir"],
    logging_dir="./logs",
    # optimisation
    learning_rate=config["learning_rate"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["valid_batch_size"],
    # evaluate and checkpoint once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # reporting
    logging_steps=10,
    report_to="none",
)

In [15]:
# Bundle the model, both datasets, the tokenizer and the metric function
# into one Trainer.
trainer = transformers.Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
)

2024-09-12 11:35:15.685755: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-12 11:35:15.685876: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-12 11:35:15.809466: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [16]:
# Run fine-tuning; training_args requests evaluation and a checkpoint at the
# end of every epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.3,0.274919,0.89686
2,0.2241,0.242634,0.907866
3,0.2036,0.233216,0.911299


TrainOutput(global_step=939, training_loss=0.28235447597198976, metrics={'train_runtime': 675.8822, 'train_samples_per_second': 177.546, 'train_steps_per_second': 1.389, 'total_flos': 1245553977600000.0, 'train_loss': 0.28235447597198976, 'epoch': 3.0})

In [17]:
# Persist the trainer's bookkeeping state.
# NOTE(review): presumably written under training_args.output_dir — confirm.
trainer.save_state()

In [18]:
# Save the fine-tuned model. No path is passed, so the destination is
# presumably training_args.output_dir — TODO confirm.
trainer.save_model()