In [2]:
%load_ext autoreload
%autoreload 2


import pandas as pd

import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification,TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
from datasets import load_dataset

from sklearn.metrics import accuracy_score, precision_recall_fscore_support


import warnings
from torchinfo import summary

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


### Text classification

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "huawei-noah/TinyBERT_General_4L_312D"
data_path = "imdb"
# Token budget per review. Every example is padded/truncated to exactly this
# length by tokenize_function, so it must not exceed the number of positions the
# encoder supports (512 for this BERT-style checkpoint); the previous value of
# 1024 produced inputs the model cannot embed.
max_length = 512

In [4]:
# Binary sentiment head: class index <-> human-readable tag.
id2label = {0: "NEG", 1: "POS"}
label2id = {tag: index for index, tag in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=id2label,
    label2id=label2id,
)


# The IMDB splits on the Hub are ordered by label, so contiguous slices such as
# train[:10000] contain reviews of a single class. Pool both splits, shuffle
# once with a fixed seed, and carve disjoint subsets out of the shuffled pool.
imdb_pool = load_dataset(data_path, split="train+test").shuffle(seed=42)

# Same subset sizes as before: 11000 train / 3000 validation / 3000 test.
train_dataset = imdb_pool.select(range(0, 11000))
val_dataset = imdb_pool.select(range(11000, 14000))
test_dataset = imdb_pool.select(range(14000, 17000))

def tokenize_function(examples):
    """Encode a batch of reviews and carry the labels along.

    Args:
        examples: A batch mapping with a 'text' entry (the raw reviews) and a
            'label' entry (their class ids).

    Returns:
        The tokenizer output for the texts, padded/truncated to the global
        ``max_length``, with the batch labels stored under 'labels'.
    """
    encoded = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    # The dataset column is called 'label'; the encoded batch exposes it as 'labels'.
    encoded['labels'] = examples['label']
    return encoded
    
# Tokenize the three splits the same way, dropping each split's original
# columns so only the tokenizer outputs and 'labels' remain.
enc_train, enc_val, enc_test = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset, test_dataset)
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 11000/11000 [00:04<00:00, 2415.95 examples/s]
Map: 100%|██████████| 3000/3000 [00:01<00:00, 2498.48 examples/s]
Map: 100%|██████████| 3000/3000 [00:01<00:00, 2404.16 examples/s]


In [None]:
# Hyper-parameters for the IMDB fine-tuning run, collected in one mapping and
# expanded into TrainingArguments below.
training_config = {
    # where checkpoints and logs are written
    "output_dir": "./output",
    "logging_dir": "./logs",
    # run both the training and the evaluation loops
    "do_train": True,
    "do_eval": True,
    # optimisation schedule
    "num_train_epochs": 10,
    "per_device_train_batch_size": 16,
    "warmup_steps": 100,
    "weight_decay": 0.01,
    # log, evaluate and checkpoint on a step cadence
    "logging_strategy": "steps",
    "logging_steps": 100,
    "evaluation_strategy": "steps",
    "save_strategy": "steps",
    # mixed precision only when a CUDA device is present
    "fp16": torch.cuda.is_available(),
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)


def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference classes) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with 'Accuracy' plus macro-averaged 'F1', 'Precision' and 'Recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    # The fourth element (support) is not reported.
    precision, recall, f1 = macro_scores[:3]
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1,
        'Precision': precision,
        'Recall': recall,
    }

# Wire the pre-trained classifier, the training arguments, the encoded
# train/validation splits and the metric callback into one Trainer.
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=enc_train,
    eval_dataset=enc_val,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_inputs)

# Fine-tune; the value returned by train() is kept for later inspection.
result = trainer.train()

In [None]:
# Evaluate the fine-tuned model on every encoded split and show the first
# five reported columns side by side, one row per split.
splits_to_score = {"train": enc_train, "val": enc_val, "test": enc_test}
q = [trainer.evaluate(eval_dataset=encoded_split) for encoded_split in splits_to_score.values()]
pd.DataFrame(q, index=list(splits_to_score)).iloc[:, :5]

In [None]:
# Persist the fine-tuned model and its tokenizer into the same directory so
# the pair can be loaded back together.
model_save_path = "MyBestIMDBModel"
trainer.save_model(output_dir=model_save_path)
tokenizer.save_pretrained(save_directory=model_save_path)

In [29]:
def get_prediction(text):
    """Classify one text, or a list of texts, with the current ``model``.

    Args:
        text: A string, or a list of strings scored together as one batch.

    Returns:
        ``(probs, label)``. ``probs`` holds the softmax class probabilities,
        one row per input text. For a single string ``label`` is a scalar
        tensor with the winning class index (so ``.item()`` works); for a list
        it holds one class index per text.
    """
    is_single_text = isinstance(text, str)
    # The encoding is moved to `device` once; the model is expected to live there too.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=250, return_tensors="pt").to(device)
    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(inputs["input_ids"], inputs["attention_mask"])
    probs = outputs[0].softmax(1)
    # Arg-max per row; a flat argmax over a batch would index the flattened
    # (batch * classes) matrix instead of naming a class.
    labels = probs.argmax(dim=1)
    return probs, (labels[0] if is_single_text else labels)

In [None]:
# Quick smoke test: put the classifier on the target device, score one short
# review, and show the predicted class index.
model.to(device)
text = "best movie "
probs_and_label = get_prediction(text)
probs_and_label[1].item()

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np


In [None]:
# 1-2. AG News topic classification: pick the checkpoint, note the number of
# target classes, then load the raw dataset and the matching tokenizer.
model_name = "distilroberta-base"
num_labels = 4  # AG News has 4 classes

dataset = load_dataset("ag_news")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 3. Tokenize function
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch to a fixed width of 128 tokens.

    Args:
        examples: A batch mapping with a "text" entry.

    Returns:
        Whatever the global ``tokenizer`` returns for those texts when asked to
        truncate and pad to ``max_length=128``.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)

# 4. Tokenize every split of the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


def _shuffled_head(split, size, seed=42):
    """Shuffle ``split`` with a fixed seed and keep its first ``size`` rows."""
    return split.shuffle(seed=seed).select(range(size))


# 5. Work on reproducible subsets: 10000 training rows, 1000 evaluation rows.
train_dataset = _shuffled_head(tokenized_datasets["train"], 10000)
eval_dataset = _shuffled_head(tokenized_datasets["test"], 1000)

In [None]:
# 6. Load the pre-trained checkpoint with a classification head sized for
# num_labels classes.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 7. Training configuration, assembled as a mapping and expanded below.
ag_news_training_config = {
    "output_dir": "./results",
    "logging_dir": "./logs",
    "logging_steps": 50,
    # optimisation schedule
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    # evaluate and checkpoint once per epoch; the best checkpoint is chosen
    # by the "accuracy" value that compute_metrics reports
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
}
training_args = TrainingArguments(**ag_news_training_config)

# 8. Define metrics function
def compute_metrics(eval_pred):
    """Turn an evaluation result into accuracy and weighted F1.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        Dict with the keys "accuracy" and "f1" (F1 averaged with
        ``average='weighted'``).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average='weighted'),
    }

# 9. Create the Trainer from the model, its training arguments, the sampled
# train/eval subsets and the metric callback.
ag_news_trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**ag_news_trainer_inputs)

In [None]:

# 10. Train the model
trainer.train()

# 11. Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# 12. Save the model, and the tokenizer next to it so the saved directory can
# be reloaded on its own (the Trainer was not given the tokenizer).
trainer.save_model("./ag_news_classifier")
tokenizer.save_pretrained("./ag_news_classifier")

# 13. Test the model on a sample text
test_text = "Apple announces new iPhone model with advanced AI capabilities"
inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
# Training may have moved the model to an accelerator; the freshly tokenized
# tensors start on the CPU, so put them wherever the model currently lives.
inputs = inputs.to(model.device)
# Inference only: disable dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=-1)
class_names = ["World", "Sports", "Business", "Sci/Tech"]
print(f"Predicted class: {class_names[prediction.item()]}")

### Text Representation

In [None]:
# Prefer CUDA whenever PyTorch can see a GPU; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [31]:
# Checkpoint used for this section. Other checkpoints tried here earlier:
#   "distilbert-base-uncased"
#   "huawei-noah/TinyBERT_General_4L_312D"
# They were assigned and immediately overwritten, so only the effective value
# is kept as code.
model_path = "microsoft/MiniLM-L12-H384-uncased"
data_path = "imdb"

In [None]:
# The Auto* classes are used here in place of the DistilBert-specific
# tokenizer/model classes, so the cell works for whichever checkpoint
# model_path names. A bare AutoModel (no classification head) was also tried
# for "microsoft/MiniLM-L12-H384-uncased".
sentiment_id2label = {0: "NEG", 1: "POS"}
sentiment_label2id = {tag: index for index, tag in sentiment_id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    id2label=sentiment_id2label,
    label2id=sentiment_label2id,
)

In [None]:
# Count the model's parameters: overall, and the subset that receives gradients.
param_sizes = [(param.numel(), param.requires_grad) for param in model.parameters()]
total_params = sum(size for size, _needs_grad in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")

In [34]:
# The IMDB splits on the Hub are ordered by label, so contiguous slices give
# label-blocked, badly skewed subsets. Pool both splits, shuffle once with a
# fixed seed, and carve disjoint subsets out of the shuffled pool.
imdb_pool = load_dataset(data_path, split="train+test").shuffle(seed=42)

# Same subset sizes as before: 22000 train / 4000 validation / 5000 test.
train_dataset = imdb_pool.select(range(0, 22000))
val_dataset = imdb_pool.select(range(22000, 26000))
test_dataset = imdb_pool.select(range(26000, 31000))

In [None]:
# Show the loaded tokenizer (the original line was a misspelled, unfinished
# attribute access that could not be parsed).
tokenizer

In [None]:
# Number of special tokens the tokenizer defines (the original referenced a
# non-existent attribute `a`, apparently an unfinished tab-completion).
[len(tokenizer.all_special_tokens)]

In [None]:
def _encode_reviews(batch, max_length=512):
    """Tokenize a batch of reviews to one fixed width.

    Padding every review to ``max_length`` (instead of to the longest text in
    each 1000-row ``map`` batch) gives all rows the same width, so rows coming
    from different map batches can be stacked into one training batch. The
    explicit cap also makes truncation effective even if the tokenizer itself
    declares no maximum length; 512 matches the position limit of the
    BERT-style checkpoints used in this notebook.

    Args:
        batch: A batch mapping with a 'text' entry.
        max_length: Width, in tokens, every encoded review is cut/padded to.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=max_length)


# One shared encoder for the three splits instead of three copies of a lambda.
enc_train = train_dataset.map(_encode_reviews, batched=True, batch_size=1000)
enc_val = val_dataset.map(_encode_reviews, batched=True, batch_size=1000)
enc_test = test_dataset.map(_encode_reviews, batched=True, batch_size=1000)

In [57]:
# Training configuration, grouped by concern and merged into a single
# TrainingArguments call.
run_io = dict(output_dir="./output", logging_dir="./logs")
run_phases = dict(do_train=True, do_eval=True)
optimisation = dict(
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
)
# Logging, evaluation and checkpointing all follow a step cadence.
cadence = dict(
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
)
training_args = TrainingArguments(
    **run_io,
    **run_phases,
    **optimisation,
    **cadence,
    fp16=torch.cuda.is_available(),  # mixed precision only with CUDA
    load_best_model_at_end=True,
)


In [58]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation.

    Args:
        pred: Object with ``label_ids`` (reference classes) and ``predictions``
            (per-class scores, reduced with an arg-max over the last axis).

    Returns:
        Dict keyed 'Accuracy', 'F1', 'Precision', 'Recall'.
    """
    targets, guesses = pred.label_ids, pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(targets, guesses, average='macro')
    metrics = {'Accuracy': accuracy_score(targets, guesses)}
    metrics.update({'F1': f1, 'Precision': precision, 'Recall': recall})
    return metrics

In [59]:
from transformers import DataCollatorWithPadding

# Collator built around the current tokenizer; it pads the examples of a batch
# to a common length when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
    # training arguments that we defined above
    args=training_args,
    # training and validation dataset
    train_dataset=enc_train,
    eval_dataset=enc_val,
    # Pad each batch with the collator built from the tokenizer. Without it the
    # Trainer uses its default collator, which cannot stack encoded rows of
    # different lengths into one tensor.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`; as the last expression of
# the cell, the value returned by train() is displayed as the cell output.
trainer.train()

In [None]:
# Scratch check: tokenize the validation split in 1000-row batches, padding
# each batch to its longest text, and display the resulting dataset.
val_dataset.map(
    lambda batch: tokenizer(batch["text"], padding=True),
    batched=True,
    batch_size=1000,
)