In [None]:
# =============================================================================
# IMPORTS
# =============================================================================
import os
# Process-wide settings: route HTTP(S) traffic through the proxy below and expose only
# CUDA device index 1 to this process.
# NOTE(review): these are assigned before the remaining imports in this cell, presumably
# so the libraries imported below see them when they initialise - confirm.
os.environ['HTTP_PROXY'] = "http://10.60.28.99:81"
os.environ['HTTPS_PROXY'] = "http://10.60.28.99:81"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import json
import numpy as np
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


# =============================================================================
# CONFIGURATION
# =============================================================================

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "my_awesome_model_3"

# Label mappings: id2label is the single source of truth, the rest is derived from it
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)

# Training hyperparameters
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
NUM_EPOCHS = 2
WEIGHT_DECAY = 0.01


# =============================================================================
# DATA LOADING FUNCTIONS
# =============================================================================
def load_from_json(filename):
    """Read a JSON array of records and return it as a Hugging Face Dataset.

    Each record must provide a 'text' and a 'label' key; the resulting
    Dataset has exactly those two columns.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Column-oriented layout expected by Dataset.from_dict
    columns = {
        'text': [record['text'] for record in records],
        'label': [record['label'] for record in records],
    }
    return Dataset.from_dict(columns)


# =============================================================================
# PREPROCESSING FUNCTIONS
# =============================================================================
def preprocess_function(examples):
    """Tokenize the "text" column of a batch with the module-level ``tokenizer``.

    Truncation is enabled; padding is not requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


def compute_metrics(eval_pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into (model outputs, reference labels); the
    predicted class is the argmax over axis 1 of the outputs.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


# =============================================================================
# MAIN EXECUTION
# =============================================================================
if __name__ == "__main__":
    # Load the train/test splits from the local JSON exports
    print("\nLoading from JSON files...")
    train_dataset = load_from_json('imdb_train.json')
    test_dataset = load_from_json('imdb_test.json')
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")

    # `tokenizer` must be a module-level name: preprocess_function reads it as a global
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Tokenize datasets
    print("Tokenizing datasets...")
    tokenized_train_imdb = train_dataset.map(preprocess_function, batched=True)
    tokenized_test_imdb = test_dataset.map(preprocess_function, batched=True)

    # Padding is deferred to the collator; `accuracy` is read as a global by compute_metrics
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")

    # Initialize model
    print("\nInitializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id
    )

    # Configure training arguments
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        deepspeed=None,
        report_to="none",  # Disable all reporting including wandb
    )

    # Initialize trainer. `processing_class` replaces the deprecated `tokenizer`
    # keyword and matches how the later cells of this notebook build their Trainer.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_imdb,
        eval_dataset=tokenized_test_imdb,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Start training
    print("\nStarting training...")
    trainer.train()

    print("\nTraining completed!")

In [1]:
# =============================================================================
# IMPORTS
# =============================================================================
import os

# Environment variables - set BEFORE the imports below. Libraries that read these
# at import time ignore later changes, so assigning them after the imports (as this
# cell previously did) is too late for them to take effect.
os.environ['HTTP_PROXY'] = "http://10.60.28.99:81"
os.environ['HTTPS_PROXY'] = "http://10.60.28.99:81"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Fix tokenizers parallelism warning
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Optional: disable oneDNN if not needed

import json
import numpy as np
import warnings
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)


# =============================================================================
# CONFIGURATION
# =============================================================================

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "my_awesome_model_2"

# Label mappings (label2id and NUM_LABELS are derived from id2label)
id2label = dict(enumerate(["NEGATIVE", "POSITIVE"]))
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)

# Training hyperparameters
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
NUM_EPOCHS = 2
WEIGHT_DECAY = 0.01


# =============================================================================
# DATA LOADING FUNCTIONS
# =============================================================================
def load_from_json(filename):
    """Build a Hugging Face Dataset from a JSON file holding a list of records.

    Every record is expected to carry 'text' and 'label' keys, which become
    the two columns of the returned Dataset.
    """
    with open(filename, 'r', encoding='utf-8') as source:
        rows = json.load(source)

    text_column = [row['text'] for row in rows]
    label_column = [row['label'] for row in rows]
    return Dataset.from_dict({'text': text_column, 'label': label_column})


# =============================================================================
# PREPROCESSING FUNCTIONS
# =============================================================================
# Note: These functions need to be defined at module level for proper serialization
def create_preprocess_function(tokenizer):
    """Return a batch-tokenizing callable bound to ``tokenizer``.

    The returned function takes a batch with a "text" entry and calls the
    tokenizer on it with truncation enabled and padding disabled.
    """
    def tokenize_batch(batch):
        """Tokenize the "text" entry of one batch."""
        texts = batch["text"]
        return tokenizer(texts, truncation=True, padding=False)

    return tokenize_batch


def create_compute_metrics(accuracy_metric):
    """Return a metrics callable bound to ``accuracy_metric``.

    The returned function unpacks (model outputs, labels), takes the argmax
    over axis 1 of the outputs and forwards both to ``accuracy_metric.compute``.
    """
    def score(eval_pred):
        """Turn raw outputs into class ids and score them against the labels."""
        logits, references = eval_pred
        predicted_ids = np.argmax(logits, axis=1)
        return accuracy_metric.compute(predictions=predicted_ids, references=references)

    return score


# =============================================================================
# MAIN EXECUTION
# =============================================================================
def main():
    """Fine-tune the classifier on the JSON train split and evaluate on the test split.

    Returns:
        tuple: ``(trainer, test_results)`` where ``test_results`` is whatever
        ``trainer.evaluate`` returned for the tokenized test split.
    """
    # --- data -----------------------------------------------------------
    print("\nLoading from JSON files...")
    train_split = load_from_json('imdb_train.json')
    test_split = load_from_json('imdb_test.json')
    print(f"Train dataset size: {len(train_split)}")
    print(f"Test dataset size: {len(test_split)}")

    # --- tokenization ---------------------------------------------------
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenize_batch = create_preprocess_function(tokenizer)

    print("Tokenizing datasets...")
    encoded_train = train_split.map(tokenize_batch, batched=True, desc="Tokenizing training data")
    encoded_test = test_split.map(tokenize_batch, batched=True, desc="Tokenizing test data")

    # --- collator + metric ----------------------------------------------
    print("\nInitializing data collator and metrics...")
    collator = DataCollatorWithPadding(tokenizer=tokenizer)
    metric = evaluate.load("accuracy")
    metrics_fn = create_compute_metrics(metric)

    # --- model ----------------------------------------------------------
    print("\nInitializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id,
    )

    # --- trainer --------------------------------------------------------
    args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",  # no external experiment trackers
        logging_steps=500,
        logging_dir=f"{OUTPUT_DIR}/logs",
        dataloader_num_workers=0,  # load batches in the main process
    )

    print("\nInitializing trainer...")
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=encoded_train,
        eval_dataset=encoded_test,
        processing_class=tokenizer,  # passed as processing_class rather than tokenizer
        data_collator=collator,
        compute_metrics=metrics_fn,
    )

    # --- train, save, evaluate -------------------------------------------
    print("\nStarting training...")
    trainer.train()

    print("\nSaving model...")
    trainer.save_model()

    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(encoded_test)
    print(f"Test results: {test_results}")

    print("\nTraining completed successfully!")
    return trainer, test_results


if __name__ == "__main__":
    trainer, results = main()

2025-11-11 21:56:02.986299: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-11 21:56:02.988048: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 21:56:03.023675: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



Loading from JSON files...
Train dataset size: 25000
Test dataset size: 25000

Initializing tokenizer...




Tokenizing datasets...


Tokenizing training data:   0%|          | 0/25000 [00:00<?, ? examples/s]

Tokenizing test data:   0%|          | 0/25000 [00:00<?, ? examples/s]


Initializing data collator and metrics...

Initializing model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Initializing trainer...
[2025-11-11 21:56:38,829] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/data01/kilm/miniconda3/envs/quocvh/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status



Starting training...


OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 0 has a total capacity of 23.50 GiB of which 31.88 MiB is free. Process 52183 has 4.00 GiB memory in use. Process 2651 has 3.20 GiB memory in use. Process 4525 has 3.10 GiB memory in use. Process 51223 has 3.75 GiB memory in use. Process 43820 has 3.74 GiB memory in use. Process 14943 has 3.74 GiB memory in use. Including non-PyTorch memory, this process has 1.90 GiB memory in use. Of the allocated memory 1.61 GiB is allocated by PyTorch, and 23.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [12]:
import codecs
def _generate_examples(filepath):
    label2id = {'ABBR':0, 'DESC':1, 'ENTY':2, 'HUM':3, 'LOC':4, 'NUM':5}
    id2label = {0:'ABBR', 1:'DESC', 2:'ENTY', 3:'HUM', 4:'LOC', 5:'NUM'}
    examples = []
    with codecs.open(filepath, "rb") as f:
        for id_, row in enumerate(f):
            # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
            label, _, text = row.replace(b"\xf0",
                                         b" ").strip().decode().partition(" ")
            coarse_label, _, fine_label = label.partition(":")
            examples.append({
                'id': id_, 
                # "label-fine": fine_label,
                "text": text,
                "label": label2id[coarse_label],
                "label-coarse": coarse_label,                
            })
    return examples 

In [18]:
# Import only the helper this cell uses, at the top of the cell, instead of a
# wildcard import placed after the first statements.
from kiki_utils.common.helpers import write_json

# Parse the raw TREC label files and export them as JSON for the training cells
train = _generate_examples("train_5500.label")
test = _generate_examples("TREC_10.label")
write_json(train, "trec_train.json")
write_json(test, "trec_test.json")

In [1]:
# =============================================================================
# IMPORTS
# =============================================================================
import os

# Environment variables - set BEFORE the imports below. Libraries that read these
# at import time ignore later changes, so assigning them after the imports (as this
# cell previously did) is too late for them to take effect.
os.environ['HTTP_PROXY'] = "http://10.60.28.99:81"
os.environ['HTTPS_PROXY'] = "http://10.60.28.99:81"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Fix tokenizers parallelism warning
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Optional: disable oneDNN if not needed

import json
import numpy as np
import warnings
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)


# =============================================================================
# CONFIGURATION
# =============================================================================

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "my_awesome_model_2"

# Label mappings for the 6 classes; position in the list is the class id
id2label = dict(enumerate(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']))
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)  # 6 labels

# Training hyperparameters
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
NUM_EPOCHS = 2
WEIGHT_DECAY = 0.01


# =============================================================================
# DATA LOADING FUNCTIONS
# =============================================================================
def load_from_json(filename):
    """Load a JSON list of records and wrap it in a Hugging Face Dataset.

    The 'text' and 'label' values of every record are collected into the
    two columns of the returned Dataset.
    """
    with open(filename, 'r', encoding='utf-8') as fp:
        entries = json.load(fp)

    return Dataset.from_dict({
        'text': [entry['text'] for entry in entries],
        'label': [entry['label'] for entry in entries],
    })


# =============================================================================
# PREPROCESSING FUNCTIONS
# =============================================================================
# Note: These functions need to be defined at module level for proper serialization
def create_preprocess_function(tokenizer):
    """Bind ``tokenizer`` into a function suitable for batched dataset mapping.

    The returned callable tokenizes the "text" entry of a batch with
    truncation on and padding off.
    """
    def _tokenize(batch):
        """Tokenize one batch of raw texts."""
        return tokenizer(
            batch["text"],
            truncation=True,
            padding=False,
        )

    return _tokenize


def create_compute_metrics(accuracy_metric):
    """Bind ``accuracy_metric`` into an evaluation callback.

    The callback receives (model outputs, labels), reduces the outputs to
    class ids with an argmax over axis 1 and returns the result of
    ``accuracy_metric.compute`` on those ids and the labels.
    """
    def _evaluate(eval_pred):
        """Compute the metric for one evaluation pass."""
        raw_outputs, gold = eval_pred
        class_ids = np.argmax(raw_outputs, axis=1)
        return accuracy_metric.compute(predictions=class_ids, references=gold)

    return _evaluate


# =============================================================================
# MAIN EXECUTION
# =============================================================================
def main():
    """Fine-tune the 6-class classifier on the TREC JSON splits and evaluate it.

    Returns:
        tuple: ``(trainer, test_results)`` where ``test_results`` is whatever
        ``trainer.evaluate`` returned for the tokenized test split.
    """
    # Data
    print("\nLoading from JSON files...")
    train_ds = load_from_json('trec_train.json')
    test_ds = load_from_json('trec_test.json')
    print(f"Train dataset size: {len(train_ds)}")
    print(f"Test dataset size: {len(test_ds)}")

    # Tokenizer and the batch-tokenizing closure built around it
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenize_fn = create_preprocess_function(tokenizer)

    print("Tokenizing datasets...")
    tokenized_train = train_ds.map(
        tokenize_fn,
        batched=True,
        desc="Tokenizing training data",
    )
    tokenized_test = test_ds.map(
        tokenize_fn,
        batched=True,
        desc="Tokenizing test data",
    )

    # Collator and metric callback
    print("\nInitializing data collator and metrics...")
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    metric_callback = create_compute_metrics(evaluate.load("accuracy"))

    # Model with a 6-way classification head
    print("\nInitializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id,
    )

    # Training configuration
    train_config = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",  # no external experiment trackers
        logging_steps=500,
        logging_dir=f"{OUTPUT_DIR}/logs",
        dataloader_num_workers=0,  # load batches in the main process
    )

    print("\nInitializing trainer...")
    trainer = Trainer(
        model=model,
        args=train_config,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        processing_class=tokenizer,  # passed as processing_class rather than tokenizer
        data_collator=padding_collator,
        compute_metrics=metric_callback,
    )

    # Train, persist, evaluate
    print("\nStarting training...")
    trainer.train()

    print("\nSaving model...")
    trainer.save_model()

    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(tokenized_test)
    print(f"Test results: {test_results}")

    print("\nTraining completed successfully!")
    return trainer, test_results


if __name__ == "__main__":
    trainer, results = main()


2025-11-11 22:18:11.573940: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-11 22:18:11.575665: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 22:18:11.610053: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



Loading from JSON files...
Train dataset size: 5452
Test dataset size: 500

Initializing tokenizer...




Tokenizing datasets...


Tokenizing training data:   0%|          | 0/5452 [00:00<?, ? examples/s]

Tokenizing test data:   0%|          | 0/500 [00:00<?, ? examples/s]


Initializing data collator and metrics...

Initializing model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Initializing trainer...
[2025-11-11 22:18:19,771] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/data01/kilm/miniconda3/envs/quocvh/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status



Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.189506,0.948
2,0.523200,0.154685,0.966



Saving model...

Evaluating on test set...


Test results: {'eval_loss': 0.15468460321426392, 'eval_accuracy': 0.966, 'eval_runtime': 0.256, 'eval_samples_per_second': 1952.923, 'eval_steps_per_second': 124.987, 'epoch': 2.0}

Training completed successfully!


In [3]:
results

{'eval_loss': 0.15468460321426392,
 'eval_accuracy': 0.966,
 'eval_runtime': 0.256,
 'eval_samples_per_second': 1952.923,
 'eval_steps_per_second': 124.987,
 'epoch': 2.0}

In [8]:
# =============================================================================
# IMPORTS
# =============================================================================
import os

# Environment variables - set BEFORE the imports below. Libraries that read these
# at import time ignore later changes, so assigning them after the imports (as this
# cell previously did) is too late for them to take effect.
os.environ['HTTP_PROXY'] = "http://10.60.28.99:81"
os.environ['HTTPS_PROXY'] = "http://10.60.28.99:81"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # Fix tokenizers parallelism warning
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Optional: disable oneDNN if not needed

import json
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score  # For manual accuracy calculation

# =============================================================================
# CONFIGURATION
# =============================================================================

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "my_awesome_model_2"
CHECKPOINT_PATH = os.path.join(OUTPUT_DIR, "checkpoint-682")  # Path to the checkpoint

# Label mappings for the 6 classes; label2id and NUM_LABELS are derived from id2label
id2label = {0: 'ABBR', 1: 'DESC', 2: 'ENTY', 3: 'HUM', 4: 'LOC', 5: 'NUM'}
label2id = {name: idx for idx, name in id2label.items()}
NUM_LABELS = len(id2label)  # Number of classes

# =============================================================================
# DATA LOADING FUNCTION
# =============================================================================
def load_from_json(filename):
    """Turn a JSON file containing a list of records into a Hugging Face Dataset.

    Only the 'text' and 'label' keys of each record are used; they form the
    two columns of the returned Dataset.
    """
    with open(filename, 'r', encoding='utf-8') as stream:
        items = json.load(stream)

    by_column = {
        'text': [entry['text'] for entry in items],
        'label': [entry['label'] for entry in items],
    }
    return Dataset.from_dict(by_column)


# =============================================================================
# MAIN EXECUTION
# =============================================================================
def main():
    """Load the fine-tuned checkpoint, run batched inference on trec_test.json and score it.

    Returns:
        The value computed by ``accuracy_score`` over the whole test set.
    """
    from torch.utils.data import DataLoader

    # Load the test dataset (trec_test.json)
    print("\nLoading from trec_test.json...")
    test_dataset = load_from_json('trec_test.json')
    print(f"Test dataset size: {len(test_dataset)}")

    # Initialize tokenizer
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Tokenize the test dataset
    print("Tokenizing test data...")
    def preprocess_function(examples):
        """Tokenize text examples (truncated to 128 tokens, left unpadded)."""
        # No padding here: padding inside a batched map pads each map chunk to its
        # own longest sequence, so rows from different chunks end up with different
        # lengths and cannot be stacked later. Padding is done per batch in collate_fn.
        return tokenizer(examples["text"], truncation=True, max_length=128)

    tokenized_test_data = test_dataset.map(
        preprocess_function,
        batched=True,
        desc="Tokenizing test data"
    )

    # Load the model from checkpoint
    print("\nLoading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT_PATH,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id
    )

    # Move model to GPU if available and make inference mode explicit
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    def collate_fn(batch):
        """Pad the variable-length rows of one batch into rectangular tensors."""
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(item['input_ids']) for item in batch],
            batch_first=True,
            padding_value=tokenizer.pad_token_id,
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(item['attention_mask']) for item in batch],
            batch_first=True,
            padding_value=0,  # padded positions are masked out
        )
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor([item['label'] for item in batch])
        }

    # Use DataLoader for batching
    test_dataloader = DataLoader(tokenized_test_data, batch_size=16, collate_fn=collate_fn)

    # Run inference on the test dataset
    print("\nPerforming inference on the test dataset...")
    predictions = []
    true_labels = []

    for batch in test_dataloader:
        # Only the model inputs need to live on the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

        predictions.extend(preds)
        # Labels are only compared on the host, so they stay on the CPU
        true_labels.extend(batch['labels'].numpy())

    # Compute accuracy manually
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    return accuracy


if __name__ == "__main__":
    accuracy = main()



Loading from trec_test.json...
Test dataset size: 500

Initializing tokenizer...
Tokenizing test data...


Tokenizing test data:   0%|          | 0/500 [00:00<?, ? examples/s]


Loading model from checkpoint...

Performing inference on the test dataset...
Accuracy: 96.60%


In [9]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Define the model and checkpoint paths
MODEL_PATH = "my_awesome_model_2/checkpoint-682"  # Adjust path to your model checkpoint
TEXT = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

# Load the pipeline for multi-class classification (6 classes)
classifier = pipeline(
    "text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

# Classify the text. truncation=True keeps inputs longer than the model's
# maximum sequence length from failing inside the model.
predictions = classifier(TEXT, truncation=True)

# Print the predictions
print(predictions)

# Optionally, if you want to use the model and tokenizer directly:
# 1. Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)

# 2. Prepare inputs (truncated for the same reason as the pipeline call above)
inputs = tokenizer(TEXT, return_tensors="pt", truncation=True)

# 3. Run inference (with no gradient computation)
with torch.no_grad():
    logits = model(**inputs).logits

# 4. Get the predicted class id and the label; argmax runs over the class axis
#    of the single-example batch
predicted_class_id = logits.argmax(dim=-1).item()
predicted_class = model.config.id2label[predicted_class_id]

print(f"Predicted class: {predicted_class} (ID: {predicted_class_id})")


Device set to use cuda:0


[{'label': 'ENTY', 'score': 0.936180830001831}]
Predicted class: ENTY (ID: 2)


In [11]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import json
import torch  # needed for the device selection below; previously relied on an earlier cell's import
from sklearn.metrics import accuracy_score

# Define the model checkpoint path
MODEL_PATH = "my_awesome_model_2/checkpoint-682"  # Update to your checkpoint path

# Load the pipeline for multi-class classification (6 classes)
print("\nLoading model and tokenizer...")
classifier = pipeline(
    "text-classification",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

# Load the test dataset from trec_test.json
def load_from_json(filename):
    """Read a JSON list of records and return ``(texts, labels)``.

    ``texts`` holds every record's 'text' value and ``labels`` every record's
    'label' value, both in file order.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    texts = [record['text'] for record in records]
    labels = [record['label'] for record in records]  # every record must carry 'label'
    return texts, labels

# Load texts and labels from trec_test.json
test_texts, true_labels = load_from_json('trec_test.json')

# Perform predictions using the pipeline
print("\nClassifying text data from trec_test.json...")

# Classify the texts
predictions = classifier(test_texts)

# Extract predicted labels (the 'label' field from the predictions)
predicted_labels = [prediction['label'] for prediction in predictions]

# Map predicted label names back to integer ids with the mapping stored in the
# loaded checkpoint's config, rather than a `label2id` global left over from
# another cell (which fails on a fresh kernel).
label_to_id = classifier.model.config.label2id

# Count matches and compute accuracy (guarding against an empty test set)
correct_predictions = sum(
    1 for true, pred in zip(true_labels, predicted_labels) if true == label_to_id[pred]
)
total_predictions = len(true_labels)
accuracy = correct_predictions / total_predictions if total_predictions else 0.0

# Print the accuracy
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Device set to use cuda:0



Loading model and tokenizer...

Classifying text data from trec_test.json...

Accuracy: 96.60%
