#### **Import Libraries**

In [1]:
# Standard library imports
import os

# Deep learning libraries
import torch
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding

# Text processing libraries
import tiktoken
import csv
import pandas as pd

# Utility libraries
import numpy as np
import random
import math
from tqdm import tqdm
from itertools import chain
from IPython.display import display, Markdown
import textwrap

# Custom libraries  
from llmft.train import EncoderTrainer, EarlyStopping
from llmft.metrics import compute_recall, compute_f1_score
from llmft.losses import FocalLoss
from llmft.utils import predict

# Visualization libraries
import seaborn as sns  # Assuming seaborn is installed

# NLP utility (assuming trics is a library/module)
from trics.nlp.utils import to_markdown

# Configure GPU usage and tokenizer parallelism
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Dataset libraries (can be grouped together)
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
import torch.nn.utils as utils

#### **Parameters**

In [2]:
seed = 1                        # Seed
sample_size = 10_000              # Sample Size
val_set_fraction = 0.25         # Fraction of sample used for validation set
lr = 2e-4                       # Optimizer learning rate
warmup_ratio = 0.25             # Fraction of training epochs used for learning rate warm up
batch_size = 32                 # Number of observations in each mini-batch
epochs = 50                     # Number of training epochs
patience = 30                    
gamma = 0.0


#### **Set Up Plotting**

In [3]:
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib import rcParams
rcParams['image.interpolation'] = 'nearest'
rcParams['image.cmap'] = 'viridis'
rcParams['axes.grid'] = False
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
plt.style.use('seaborn-v0_8-dark-palette')

from matplotlib import font_manager 
locations = './../../../styles/Newsreader'
font_files = font_manager.findSystemFonts(fontpaths=locations)
print(locations)
print(font_files[0])
for f in font_files: 
    font_manager.fontManager.addfont(f)
plt.rcParams["font.family"] = "Newsreader"

./../../../styles/Newsreader
/home/ubuntu/llmft/styles/Newsreader/static/Newsreader_36pt/Newsreader_36pt-ExtraLightItalic.ttf


#### **Set Up Device**

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


#### **Tokenizer**

In [5]:
model_id = "roberta-base"#"distilbert/distilbert-base-uncased"#
tokenizer = AutoTokenizer.from_pretrained(model_id)
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


In [6]:
imdb = load_dataset("imdb")

# Define the sample size
sample_size = 5000

# Function to sample a fixed number of observations
def sample_dataset(dataset, sample_size):
    # Ensure the sample size is not larger than the dataset
    assert sample_size <= len(dataset), "Sample size is larger than the dataset size"
    # Get a random sample of indices
    random_indices = random.sample(range(len(dataset)), sample_size)
    # Select the sampled indices from the dataset
    return dataset.select(random_indices)

# Sample the datasets
train_dataset = sample_dataset(imdb['train'], sample_size)
test_dataset = sample_dataset(imdb['test'], sample_size)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text'])


tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['text'])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [7]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3130, training_loss=0.07916956519166501, metrics={'train_runtime': 1825.7562, 'train_samples_per_second': 27.386, 'train_steps_per_second': 1.714, 'total_flos': 1.29683328076704e+16, 'train_loss': 0.07916956519166501, 'epoch': 10.0})

In [12]:
training_logs = trainer.state.log_history
for log in training_logs:
    if 'loss' in log:
        print(f"Epoch {log['epoch']}: Training Loss = {log['loss']}")
    if 'eval_loss' in log:
        print(f"Epoch {log['epoch']}: Validation Loss = {log['eval_loss']}")

Epoch 1.0: Validation Loss = 0.1630711704492569
Epoch 1.5974440894568689: Training Loss = 0.2618
Epoch 2.0: Validation Loss = 0.20399773120880127
Epoch 3.0: Validation Loss = 0.21570581197738647
Epoch 3.194888178913738: Training Loss = 0.1272
Epoch 4.0: Validation Loss = 0.30965664982795715
Epoch 4.792332268370607: Training Loss = 0.0551
Epoch 5.0: Validation Loss = 0.3033478558063507
Epoch 6.0: Validation Loss = 0.3031342625617981
Epoch 6.389776357827476: Training Loss = 0.0292
Epoch 7.0: Validation Loss = 0.3764845132827759
Epoch 7.987220447284345: Training Loss = 0.0102
Epoch 8.0: Validation Loss = 0.39434653520584106
Epoch 9.0: Validation Loss = 0.40157368779182434
Epoch 9.584664536741213: Training Loss = 0.0094
Epoch 10.0: Validation Loss = 0.404709130525589


In [None]:
 trainer.state.log_history