# Convert TREC data to JSON

In [2]:

import codecs
def _generate_examples(filepath):
    label2id = {'ABBR':0, 'DESC':1, 'ENTY':2, 'HUM':3, 'LOC':4, 'NUM':5}
    id2label = {0:'ABBR', 1:'DESC', 2:'ENTY', 3:'HUM', 4:'LOC', 5:'NUM'}
    examples = []
    with codecs.open(filepath, "rb") as f:
        for id_, row in enumerate(f):
            # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
            label, _, text = row.replace(b"\xf0",
                                         b" ").strip().decode().partition(" ")
            coarse_label, _, fine_label = label.partition(":")
            examples.append({
                'id': id_, 
                # "label-fine": fine_label,
                "text": text,
                "label": label2id[coarse_label],
                "label-coarse": coarse_label,                
            })
    return examples 

In [3]:
# Import only the helper this cell needs instead of a wildcard import, so the
# origin of `write_json` is explicit and the notebook namespace stays clean.
from kiki_utils.common.helpers import write_json

# Parse both TREC splits into lists of example dicts.
train = _generate_examples("train_5500.label")
test = _generate_examples("TREC_10.label")

# Persist each split as JSON; the training cells read these files back.
write_json(train, "trec_train.json")
write_json(test, "trec_test.json")

In [4]:
# Report how many examples were parsed for each split.
print(f"Train: {len(train)} Test: {len(test)}")

Train: 5452 Test: 500


In [71]:
# =============================================================================
# IMPORTS
# =============================================================================
import os
import json
import numpy as np
import warnings
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Hide FutureWarning messages for the rest of the session
warnings.filterwarnings(action='ignore', category=FutureWarning)


# =============================================================================
# CONFIGURATION
# =============================================================================
# Environment variables.
# NOTE(review): these are assigned after the import statements at the top of
# this cell, so they only affect code that reads the environment afterwards -
# confirm that is early enough for each setting.
os.environ.update({
    'HTTP_PROXY': "http://10.60.28.99:81",
    'HTTPS_PROXY': "http://10.60.28.99:81",
    'CUDA_VISIBLE_DEVICES': '1',
    'TOKENIZERS_PARALLELISM': 'false',  # silences the tokenizers parallelism warning
    'TF_ENABLE_ONEDNN_OPTS': '0',       # optional: turn oneDNN off when not needed
})

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "trec_model"
NUM_LABELS = 6  # one per coarse TREC class

# Label mappings for the 6 classes; label2id is the inverse of id2label
id2label = {0: 'ABBR', 1: 'DESC', 2: 'ENTY', 3: 'HUM', 4: 'LOC', 5: 'NUM'}
label2id = {name: idx for idx, name in id2label.items()}

# Training hyperparameters
LEARNING_RATE = 2e-5
BATCH_SIZE = 16
NUM_EPOCHS = 4
WEIGHT_DECAY = 0.01


# =============================================================================
# DATA LOADING FUNCTIONS
# =============================================================================
def load_from_json(filename):
    """Read a JSON list of records and build a Hugging Face Dataset from it.

    Args:
        filename: Path to a UTF-8 JSON file holding a list of objects, each
            with at least a ``text`` and a ``label`` entry.

    Returns:
        A ``Dataset`` with exactly two columns, ``text`` and ``label``; any
        other keys present in the records are dropped.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Column-oriented view of the records, one list per kept field
    columns = {
        'text': [record['text'] for record in records],
        'label': [record['label'] for record in records],
    }
    return Dataset.from_dict(columns)


# =============================================================================
# PREPROCESSING FUNCTIONS
# =============================================================================
# Note: These functions need to be defined at module level for proper serialization
def create_preprocess_function(tokenizer):
    """Return a batch-tokenizing callable bound to ``tokenizer``.

    The returned function takes a batch mapping with a ``"text"`` entry and
    calls the tokenizer on it with truncation on and padding off.
    """
    def preprocess_function(examples):
        """Tokenize the ``text`` column of one batch."""
        texts = examples["text"]
        return tokenizer(texts, truncation=True, padding=False)

    return preprocess_function


def create_compute_metrics(accuracy_metric):
    """Return a metrics callback bound to ``accuracy_metric``.

    The returned function unpacks an evaluation pair into scores and
    reference labels, takes the argmax of the scores along axis 1, and
    forwards both to ``accuracy_metric.compute``.
    """
    def compute_metrics(eval_pred):
        """Turn per-class scores into class ids and score them."""
        scores, references = eval_pred
        predicted_ids = np.argmax(scores, axis=1)
        return accuracy_metric.compute(
            predictions=predicted_ids,
            references=references,
        )

    return compute_metrics





# Training

In [72]:
# =============================================================================
# MAIN EXECUTION
# =============================================================================
# Fine-tune MODEL_NAME on the TREC JSON files and print evaluation metrics.
# Everything assigned below becomes a notebook-level name; a later cell
# displays `test_results`.
if __name__ == "__main__":
    """Main training function"""
    # Load datasets
    print("\nLoading from JSON files...")
    train_dataset = load_from_json('trec_train.json')
    test_dataset = load_from_json('trec_test.json')
    print(f"Train dataset size: {len(train_dataset)}")
    print(f"Test dataset size: {len(test_dataset)}")
    
    # Initialize tokenizer
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # Create preprocessing function
    preprocess_function = create_preprocess_function(tokenizer)
    
    # Tokenize datasets
    # NOTE(review): the `_imdb` suffix on the two names below does not match the
    # TREC data loaded above - presumably carried over from another script.
    print("Tokenizing datasets...")
    tokenized_train_imdb = train_dataset.map(
        preprocess_function, 
        batched=True,
        desc="Tokenizing training data"
    )
    tokenized_test_imdb = test_dataset.map(
        preprocess_function, 
        batched=True,
        desc="Tokenizing test data"
    )
    
    # Initialize data collator and metrics
    # Tokenization above uses padding=False, so the collator does the padding.
    print("\nInitializing data collator and metrics...")
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    accuracy = evaluate.load("accuracy")
    compute_metrics = create_compute_metrics(accuracy)
    
    # Initialize model
    print("\nInitializing model...")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=NUM_LABELS,  # Updated for 6 labels
        id2label=id2label,
        label2id=label2id
    )
    
    # Configure training arguments
    # Evaluation and checkpointing both happen once per epoch.
    # NOTE(review): load_best_model_at_end is on but no metric_for_best_model is
    # given; the Trainer is expected to fall back to eval loss - confirm against
    # the TrainingArguments documentation.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=NUM_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",  # Disable all reporting including wandb
        logging_steps=500,
        logging_dir=f"{OUTPUT_DIR}/logs",
        dataloader_num_workers=0,  # Avoid multiprocessing issues
    )
    
    # Initialize trainer - use processing_class instead of tokenizer
    # NOTE(review): the test split is passed as eval_dataset, so per-epoch
    # evaluation (and therefore best-checkpoint selection) sees test data.
    # Consider a separate validation split.
    print("\nInitializing trainer...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_imdb,
        eval_dataset=tokenized_test_imdb,
        processing_class=tokenizer,  # Use processing_class instead of tokenizer
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    # Start training
    # `train_result` is kept as a notebook-level name; this cell does not read it again.
    print("\nStarting training...")
    train_result = trainer.train()
    
    # Save the final model
    print("\nSaving model...")
    trainer.save_model()
    
    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_results = trainer.evaluate(tokenized_test_imdb)
    print(f"Test results: {test_results}")
    
    print("\nTraining completed successfully!")


Loading from JSON files...
Train dataset size: 5452
Test dataset size: 500

Initializing tokenizer...
Tokenizing datasets...


Tokenizing training data:   0%|          | 0/5452 [00:00<?, ? examples/s]

Tokenizing test data:   0%|          | 0/500 [00:00<?, ? examples/s]


Initializing data collator and metrics...

Initializing model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.



Initializing trainer...

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.201156,0.948
2,0.504300,0.145282,0.968
3,0.110500,0.156016,0.97
4,0.110500,0.151031,0.97



Saving model...

Evaluating on test set...


Test results: {'eval_loss': 0.14528198540210724, 'eval_accuracy': 0.968, 'eval_runtime': 0.2597, 'eval_samples_per_second': 1925.316, 'eval_steps_per_second': 123.22, 'epoch': 4.0}

Training completed successfully!


In [73]:
# Display the metrics dict returned by the trainer.evaluate() call in the training cell.
test_results

{'eval_loss': 0.14528198540210724,
 'eval_accuracy': 0.968,
 'eval_runtime': 0.2597,
 'eval_samples_per_second': 1925.316,
 'eval_steps_per_second': 123.22,
 'epoch': 4.0}

In [74]:
# List the contents of the training output directory (OUTPUT_DIR).
! ls trec_model

checkpoint-1023  checkpoint-682     special_tokens_map.json  training_args.bin
checkpoint-1364  config.json	    tokenizer_config.json    vocab.txt
checkpoint-341	 model.safetensors  tokenizer.json


# Load the model and run inference

In [75]:
# =============================================================================
# IMPORTS
# =============================================================================
import os
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score  # For manual accuracy calculation

# =============================================================================
# CONFIGURATION
# =============================================================================
# Environment variables.
# NOTE(review): these are assigned after the import statements at the top of
# this cell, so they only affect code that reads the environment afterwards -
# confirm that is early enough for each setting.
os.environ.update({
    'HTTP_PROXY': "http://10.60.28.99:81",
    'HTTPS_PROXY': "http://10.60.28.99:81",
    'CUDA_VISIBLE_DEVICES': '1',
    'TOKENIZERS_PARALLELISM': 'false',  # silences the tokenizers parallelism warning
    'TF_ENABLE_ONEDNN_OPTS': '0',       # optional: turn oneDNN off when not needed
})

# Model configuration
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "trec_model"
CHECKPOINT_PATH = os.path.join(OUTPUT_DIR, "checkpoint-1364")  # checkpoint directory to load
NUM_LABELS = 6  # Number of classes

# Label mappings for the 6 classes; label2id is the inverse of id2label
id2label = dict(enumerate(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']))
label2id = {name: idx for idx, name in id2label.items()}



# =============================================================================
# MAIN EXECUTION
# =============================================================================
# Load the fine-tuned checkpoint, run batched inference over the test split,
# and print the resulting accuracy.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    # Load the test dataset (trec_test.json)
    print("\nLoading from trec_test.json...")
    test_dataset = load_from_json('trec_test.json')
    print(f"Test dataset size: {len(test_dataset)}")

    # Initialize tokenizer from the checkpoint directory so it matches the
    # model being evaluated and needs no hub download (the pipeline cells in
    # this notebook load the tokenizer from checkpoint directories the same way).
    print("\nInitializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)

    # Tokenize the test dataset
    print("Tokenizing test data...")
    def preprocess_function(examples):
        """Tokenize text examples; padding is deferred to collate_fn."""
        # Padding here would pad to the longest text of each *map* batch, so
        # rows from different map batches could end up with different lengths
        # and could not be stacked by the DataLoader.
        return tokenizer(examples["text"], truncation=True, max_length=128)

    tokenized_test_data = test_dataset.map(
        preprocess_function,
        batched=True,
        desc="Tokenizing test data"
    )

    # Load the model from checkpoint
    print("\nLoading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(
        CHECKPOINT_PATH,
        num_labels=NUM_LABELS,
        id2label=id2label,
        label2id=label2id
    )

    # Move model to GPU if available and make inference mode explicit
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Define a custom collate function for DataLoader
    def collate_fn(batch):
        """Pad one DataLoader batch to its longest sequence and attach labels."""
        # Only the tokenizer outputs are passed to pad(); the raw 'text' and
        # 'label' columns cannot be turned into padded tensors.
        features = [
            {'input_ids': item['input_ids'], 'attention_mask': item['attention_mask']}
            for item in batch
        ]
        padded = tokenizer.pad(features, padding=True, return_tensors="pt")
        return {
            'input_ids': padded['input_ids'],
            'attention_mask': padded['attention_mask'],
            'labels': torch.tensor([item['label'] for item in batch])
        }

    # Use DataLoader for batching
    test_dataloader = DataLoader(tokenized_test_data, batch_size=16, collate_fn=collate_fn)

    # Run inference on the test dataset
    print("\nPerforming inference on the test dataset...")
    predictions = []
    true_labels = []

    for batch in test_dataloader:
        # Move data to the appropriate device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

    # Compute accuracy manually
    accuracy = accuracy_score(true_labels, predictions)
    print(f"Accuracy: {accuracy * 100:.2f}%")



Loading from trec_test.json...
Test dataset size: 500

Initializing tokenizer...
Tokenizing test data...


Tokenizing test data:   0%|          | 0/500 [00:00<?, ? examples/s]


Loading model from checkpoint...

Performing inference on the test dataset...
Accuracy: 97.00%


# Load the model and run inference using a pipeline

In [76]:
def load_classifier(model_path="trec_model/checkpoint-682"):
    """Build a text-classification pipeline from a saved checkpoint.

    Args:
        model_path: Directory holding both the model and tokenizer files.
            Defaults to the checkpoint this notebook used originally.

    Returns:
        A ``transformers`` text-classification pipeline, placed on GPU 0 when
        CUDA is available and on the CPU otherwise.
    """
    # Imported locally so the function works without relying on earlier cells.
    from transformers import pipeline
    import torch

    return pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
        device=0 if torch.cuda.is_available() else -1  # Use GPU if available
    )


def load_model_and_tokenizer(model_path="trec_model/checkpoint-682"):
    """Load the sequence-classification model and its tokenizer directly.

    Use this instead of the pipeline when raw logits are needed.

    Args:
        model_path: Directory holding both the model and tokenizer files.
            Defaults to the checkpoint this notebook used originally.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Imported locally so the function works without relying on earlier cells.
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer

In [77]:
def classify(TEXT, classifier):
    """Run ``classifier`` on ``TEXT`` and print whatever it returns.

    Args:
        TEXT: Input passed straight to the classifier (the notebook calls
            this with a single string and with a list of strings).
        classifier: Callable that accepts ``TEXT`` and returns predictions.

    Returns:
        None; the predictions are only printed.
    """
    print(classifier(TEXT))
    
def classify2(TEXT, model, tokenizer):
    """Minimal classifier: print the raw logits and the top label.

    NOTE: a second ``classify2`` is defined further down in this same cell and
    replaces this one once the cell has run.

    Args:
        TEXT: Input text handed to the tokenizer.
        model: Model whose output exposes ``.logits`` and whose ``config``
            carries an ``id2label`` mapping.
        tokenizer: Callable returning keyword inputs for the model.

    Returns:
        None; results are only printed.
    """
    # Tokenize into PyTorch tensors
    encoded = tokenizer(TEXT, return_tensors="pt")

    # Forward pass without gradient tracking
    with torch.no_grad():
        output = model(**encoded)
    logits = output.logits

    # Index of the highest logit and its label
    best_id = torch.argmax(logits).item()
    best_label = model.config.id2label[best_id]

    print(logits)
    print(f"Predicted class: {best_label} (ID: {best_id})")
    
import torch
import torch.nn.functional as F

def classify2(TEXT, model, tokenizer):
    """Classify ``TEXT`` and print a per-class score table.

    Args:
        TEXT: Input text to classify.
        model: Trained model; its ``config.id2label`` maps class ids to names.
        tokenizer: Tokenizer used to encode ``TEXT`` as PyTorch tensors.

    Returns:
        A dict with ``predicted_class``, ``predicted_class_id``,
        ``confidence`` (softmax probability of the predicted class),
        ``probabilities`` and ``logits`` (both NumPy arrays for the first row).
    """
    # Encode, then move every tensor to the device holding the model weights
    encoded = tokenizer(TEXT, return_tensors="pt")
    model_device = next(model.parameters()).device
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits

    # Softmax over the class dimension turns logits into probabilities
    probabilities = F.softmax(logits, dim=-1)

    labels_by_id = model.config.id2label
    predicted_class_id = logits.argmax().item()
    predicted_class = labels_by_id[predicted_class_id]
    confidence = probabilities[0][predicted_class_id].item()

    # Headline result
    rule = '=' * 60
    print(f"\nInput text: {TEXT}")
    print(f"\n{rule}")
    print(f"Predicted class: {predicted_class} (ID: {predicted_class_id})")
    print(f"Confidence: {confidence:.4f} ({confidence*100:.2f}%)")
    print(rule)

    print("\nRaw logits:")
    print(logits)

    # Score table, highest probability first
    print("\nScores for all classes:")
    print(f"{'Class':<15} {'Label':<12} {'Probability':<12} {'Percentage'}")
    print('-' * 60)

    probs_list = probabilities[0].cpu().numpy()
    for idx in probs_list.argsort()[::-1]:
        prob = probs_list[idx]
        marker = "★" if idx == predicted_class_id else " "
        print(f"{marker} Class {idx:<7} {labels_by_id[idx]:<12} {prob:<12.6f} {prob*100:>6.2f}%")

    return {
        'predicted_class': predicted_class,
        'predicted_class_id': predicted_class_id,
        'confidence': confidence,
        'probabilities': probs_list,
        'logits': logits[0].cpu().numpy(),
    }


# Alternative: More compact version
def classify2_compact(TEXT, model, tokenizer):
    """Print the predicted label and one score line per class.

    Classes are listed in id order. Nothing is returned.
    """
    # Encode, then move every tensor to the device holding the model weights
    encoded = tokenizer(TEXT, return_tensors="pt")
    model_device = next(model.parameters()).device
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    # Probabilities for the first (only) row
    scores = F.softmax(logits, dim=-1)[0]
    best_id = logits.argmax().item()
    labels_by_id = model.config.id2label

    print(f"\nText: {TEXT}")
    print(f"\nPrediction: {labels_by_id[best_id]} ({scores[best_id]:.2%})")
    print("\nAll scores:")
    for class_id, score in enumerate(scores):
        print(f"  {labels_by_id[class_id]:10s}: {score:.4f} ({score:.2%})")


# Alternative: With visualization
def classify2_visual(TEXT, model, tokenizer):
    """Print the prediction with a text bar per class, highest score first.

    Returns:
        The class probabilities for the first row as a NumPy array, in
        class-id order.
    """
    import torch.nn.functional as F

    # Encode, then move every tensor to the device holding the model weights
    encoded = tokenizer(TEXT, return_tensors="pt")
    model_device = next(model.parameters()).device
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    scores = F.softmax(logits, dim=-1)[0]
    best_id = logits.argmax().item()
    labels_by_id = model.config.id2label

    rule = '=' * 70
    print(f"\nText: {TEXT}")
    print(f"\n{rule}")
    print(f"Prediction: {labels_by_id[best_id]}")
    print(f"{rule}\n")

    # (class id, probability) pairs ordered from most to least likely
    ranked = sorted(
        enumerate(scores.cpu().numpy()),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for class_id, prob in ranked:
        bar = '█' * int(prob * 50)  # a probability of 1.0 maps to 50 characters
        marker = '→' if class_id == best_id else ' '
        print(f"{marker} {labels_by_id[class_id]:10s} {prob:6.2%} |{bar}")

    return scores.cpu().numpy()

In [78]:
# Build the pipeline-based classifier, then classify a single question.
classifier = load_classifier()
classify("Who is Newton", classifier)

Device set to use cuda:0


[{'label': 'HUM', 'score': 0.9944992065429688}]


In [79]:
# Passing a list classifies several texts in one call.
classify(["The number is 5", 
          "How many students in the class"], classifier)

[{'label': 'NUM', 'score': 0.9761038422584534}, {'label': 'NUM', 'score': 0.9943133592605591}]


In [80]:
# Load the raw model and tokenizer used by the classify2* helpers.
model, tokenizer = load_model_and_tokenizer()

In [81]:
# Detailed per-class breakdown; the commented-out lines are the alternative views.
classify2("The number is 5", model, tokenizer)
# classify2_compact("The number is 5", model, tokenizer)
# classify2_visual("The number is 5", model, tokenizer)



Input text: The number is 5

Predicted class: NUM (ID: 5)
Confidence: 0.9761 (97.61%)

Raw logits:
tensor([[-1.2781, -1.2032, -0.8557, -0.9768, -1.0826,  4.2516]])

Scores for all classes:
Class           Label        Probability  Percentage
------------------------------------------------------------
★ Class 5       NUM          0.976104      97.61%
  Class 2       ENTY         0.005908       0.59%
  Class 3       HUM          0.005234       0.52%
  Class 4       LOC          0.004709       0.47%
  Class 1       DESC         0.004174       0.42%
  Class 0       ABBR         0.003872       0.39%


{'predicted_class': 'NUM',
 'predicted_class_id': 5,
 'confidence': 0.9761038422584534,
 'probabilities': array([0.00387231, 0.00417374, 0.00590769, 0.00523388, 0.00470855,
        0.97610384], dtype=float32),
 'logits': array([-1.2781407 , -1.2031807 , -0.8557379 , -0.97684044, -1.0826124 ,
         4.251576  ], dtype=float32)}

In [84]:
def evaluate_test(model_path="trec_model/checkpoint-1364", test_file="trec_test.json"):
    """Classify every text in a test file with a pipeline and print accuracy.

    Args:
        model_path: Checkpoint directory holding both model and tokenizer
            files. Defaults to the checkpoint this notebook used originally.
        test_file: JSON file readable by ``load_from_json``. Defaults to the
            notebook's test split.

    Returns:
        None; the accuracy is printed as a percentage.
    """
    # Imported locally so the function does not depend on which earlier cells
    # happened to import torch.
    from transformers import pipeline
    import torch

    # Load the pipeline for multi-class classification (6 classes)
    print("\nLoading model and tokenizer...")
    classifier = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
        device=0 if torch.cuda.is_available() else -1  # Use GPU if available
    )
    dataset = load_from_json(test_file)

    # Materialize both columns as plain lists before handing them on
    test_texts = list(dataset["text"])
    true_labels = list(dataset["label"])

    # Perform predictions using the pipeline
    print(f"\nClassifying text data from {test_file}...")
    predictions = classifier(test_texts)

    # The pipeline reports label names; map them back to ids with the mapping
    # stored in the checkpoint's own config rather than a notebook global.
    checkpoint_label2id = classifier.model.config.label2id
    correct_predictions = sum(
        true == checkpoint_label2id[prediction['label']]
        for true, prediction in zip(true_labels, predictions)
    )
    total_predictions = len(true_labels)
    accuracy = correct_predictions / total_predictions

    # Print the accuracy
    print(f"\nAccuracy: {accuracy * 100:.2f}%")


In [85]:
# Run the pipeline-based evaluation over the test file.
evaluate_test()

Device set to use cuda:0



Loading model and tokenizer...

Classifying text data from trec_test.json...

Accuracy: 97.00%
