In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from tqdm import tqdm  # Import tqdm for progress bar

# Fine-tuned checkpoint directory; the tokenizer and the model are both loaded from it.
checkpoint_path = "./intfloat_best_model_dataaug/checkpoint-2820"

# Load the tokenizer and the sequence-classification model from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model.eval()  # Switch the model to evaluation mode before inference

# Load the unlabeled test set and discard the "Usage" column.
df = pd.read_csv("data/test_without_labels.csv")
df = df.drop(columns='Usage')

# The tokenizer only accepts strings, but pandas reads empty CSV cells as NaN
# (a float). Replace missing texts with "" and coerce everything else to str so
# that a single blank row cannot abort the whole run. df["Text"] itself is left
# untouched; only the list handed to the tokenizer is cleaned.
texts = df["Text"].fillna("").astype(str).tolist()

# Tokenize all texts at once: truncated to at most 100 tokens, padded to a
# common length, returned as PyTorch tensors.
inputs = tokenizer(texts, truncation=True, padding=True, max_length=100, return_tensors="pt")

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Wrap the token ids and attention masks so they can be served in batches of 128.
dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
dataloader = DataLoader(dataset, batch_size=128)

# Batched inference with a progress bar. Gradients are not needed, so they are
# disabled for the whole loop.
predictions = []
with torch.no_grad():
    for batch in tqdm(dataloader, desc="🔍 Running Inference", unit="batch"):
        # Each batch is (input_ids, attention_mask); move both to the model's device.
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class id is the index of the largest logit in each row.
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(preds)

# Attach the predicted class ids to the frame (one per input row, in order).
df["Label"] = predictions

print("✅ Inference complete! Predictions saved to df 🎯")


In [None]:
import pandas as pd
import json

# JSON file that holds the label mappings, including the "id2label" table.
PATH_MAPPING = "data/mapping/label_mappings_dataaug.json"

# JSON text is UTF-8; state the encoding explicitly so label names decode the
# same way regardless of the platform's default encoding.
with open(PATH_MAPPING, "r", encoding="utf-8") as f:
    mappings = json.load(f)

# JSON object keys are always strings, so convert them back to the integer
# class ids that df["Label"] holds.
id2label = {int(k): v for k, v in mappings["id2label"].items()}

# Series.map silently turns every value without a dictionary entry into NaN.
# That happens for an id missing from the mapping, and also when this cell is
# run a second time (the column then holds label names, not ids), which would
# wipe out all labels. Fail loudly before overwriting the column instead.
unmapped = ~df["Label"].isin(list(id2label))
if unmapped.any():
    example = df.loc[unmapped, "Label"].iloc[0]
    raise ValueError(
        f"{int(unmapped.sum())} value(s) in df['Label'] are not class ids from "
        f"{PATH_MAPPING} (e.g. {example!r}). If this cell has already been run, "
        "re-run the inference cell to restore the integer predictions."
    )

# Replace each class id with its label name.
df["Label"] = df["Label"].map(id2label)

In [None]:
# Preview the first 10 rows of df (shown as the cell's last expression).
df.head(10)

In [None]:
# Optional sanity check, off by default: set the flag to True to count the rows
# whose "Label" differs from the one in data/test_with_labels.csv.
# NOTE(review): the flag name is misspelled ("CHEKC"); kept as-is so any other
# cell that reads it keeps working.
CHEKC_OTHER_DF = False
if CHEKC_OTHER_DF:
    df2 = pd.read_csv("data/test_with_labels.csv")
    # Row-wise comparison of the two label columns; True marks a mismatch.
    label_mismatch = df["Label"] != df2["Label"]
    num_differences = label_mismatch.sum()
    print(f"🔍 Number of different labels: {num_differences}")

In [None]:
# Build the two-column submission frame: a sequential ID starting at 1,
# followed by the label. Every other column is dropped.
df = df.assign(ID=range(1, len(df) + 1))[["ID", "Label"]]
df.head()

In [None]:
import os

# Name of the submission CSV to create.
PATH_FILE = "second_submit.csv"

# Never overwrite an existing submission: report it and leave the file alone.
# Both messages are built from PATH_FILE so they stay correct if the name changes.
if os.path.exists(PATH_FILE):
    print(f"Error: File '{PATH_FILE}' already exists.")
else:
    # index=False keeps the DataFrame index out of the file.
    df.to_csv(PATH_FILE, index=False)
    print(f"✅ Submission file saved as {PATH_FILE} 📄")

In [None]:
# Read the submission file back from disk and show its (rows, columns) shape.
ok_df = pd.read_csv(filepath_or_buffer=PATH_FILE)
ok_df.shape