In [1]:
import pandas as pd
from transformers import AutoTokenizer # Requires the 'transformers' library
import os
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- 1. USER SETTINGS ---
# Checkpoint name of the tokenizer used in this notebook.
MODEL_NAME = 'bert-base-uncased'
# Token budget per example (BERT models usually have a max sequence length of 512).
MAX_LENGTH = 512

# Names of the columns read from the processed CSV files.
TEXT_COL = 'text'
TITLE_COL = 'title'
LABEL_COL = 'label'
CATEGORY_ENCODED_COL = 'category_encoded'

# Locations of the processed train / validation / test splits.
PROCESSED_FOLDER = '../processed_data1/'
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.join(PROCESSED_FOLDER, csv_name)
    for csv_name in (
        'train_data_final.csv',
        'val_data_final.csv',
        'test_data_final.csv',
    )
)

# Read each split into its own DataFrame.
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

In [3]:
# --- 2. Initialize BERT Tokenizer ---
# Load the tokenizer that belongs to MODEL_NAME.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
except Exception as e:
    # Re-raise instead of calling exit(): inside a notebook exit() can shut the
    # kernel down and hides the traceback, whereas a raised error stops the run
    # at this cell with the original cause attached.
    # The 'transformers' import has already succeeded by the time this cell runs,
    # so a failure here points at the model name or at fetching the tokenizer
    # files, not at a missing package.
    raise RuntimeError(
        f"Error loading tokenizer: {e}. Check that '{MODEL_NAME}' is a valid "
        "model name and that its tokenizer files can be downloaded or are cached locally."
    ) from e

print(f"Tokenizer for {MODEL_NAME} loaded successfully.")

Tokenizer for bert-base-uncased loaded successfully.


In [4]:
def tokenize_data(df, tokenizer, max_len):
    """Tokenize the title/text pairs of ``df`` and attach the label tensors.

    Args:
        df: DataFrame containing the columns named by TITLE_COL, TEXT_COL,
            LABEL_COL and CATEGORY_ENCODED_COL.
        tokenizer: Callable tokenizer; it receives the list of titles and the
            list of texts as its first and second positional arguments.
        max_len: Length every sequence is truncated / padded to.

    Returns:
        The object returned by ``tokenizer`` (PyTorch tensors were requested),
        with two extra entries: ``'labels'`` and ``'category_labels'``, both
        ``torch.long`` tensors with one value per row of ``df``.

    Raises:
        ValueError: If the label or category column contains missing values.
    """
    # Fail before the (expensive) tokenization: a NaN cannot be represented in
    # an integer tensor, so casting it would not produce a meaningful class id.
    for target_col in (LABEL_COL, CATEGORY_ENCODED_COL):
        if df[target_col].isna().any():
            raise ValueError(
                f"Column '{target_col}' contains missing values; cannot build label tensor."
            )

    # Missing titles/texts become empty strings. A bare astype(str) would turn
    # them into the literal words "nan"/"None", which would then be tokenized
    # as if they were real content.
    titles = df[TITLE_COL].fillna("").astype(str).tolist()
    texts = df[TEXT_COL].fillna("").astype(str).tolist()

    encodings = tokenizer(
        titles,
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='pt'      # Return PyTorch tensors
    )

    # add labels and category labels
    encodings['labels'] = torch.tensor(df[LABEL_COL].tolist(), dtype=torch.long)
    encodings['category_labels'] = torch.tensor(df[CATEGORY_ENCODED_COL].tolist(), dtype=torch.long)

    return encodings

In [5]:
#--- 4. Apply Tokenization to all Splits ---
# Run tokenize_data over the train, validation and test frames in turn.
print("\nStarting Tokenization...")
train_encodings = tokenize_data(df=train_df, tokenizer=tokenizer, max_len=MAX_LENGTH)
val_encodings = tokenize_data(df=val_df, tokenizer=tokenizer, max_len=MAX_LENGTH)
test_encodings = tokenize_data(df=test_df, tokenizer=tokenizer, max_len=MAX_LENGTH)
print("Tokenization Complete.")

# Report the shape of the input-id tensor of every split.
for report_line in (
    f"Training Input ID Shape: {train_encodings['input_ids'].shape}",
    f"Validation Input ID Shape: {val_encodings['input_ids'].shape}",
    f"Testing Input ID Shape: {test_encodings['input_ids'].shape}",
):
    print(report_line)


Starting Tokenization...
Tokenization Complete.
Training Input ID Shape: torch.Size([13000, 512])
Validation Input ID Shape: torch.Size([3000, 512])
Testing Input ID Shape: torch.Size([4000, 512])


In [6]:
# --- 5. Save the Tokenized Data (Optional, but good practice) ---
# Write each split's encodings to its own .pt file so a later step can load
# them without re-running tokenization.
# (os and torch are already imported in the notebook's first cell.)

# Create the output folder. exist_ok=True keeps this cell re-runnable and
# avoids the check-then-create gap of os.path.exists() followed by os.makedirs().
tokenized_folder = '../tokenized_data/'
os.makedirs(tokenized_folder, exist_ok=True)

# NOTE(review): the objects are saved exactly as tokenize_data returned them.
# If they are tokenizer-specific container objects rather than plain dicts,
# loading them may require the same library to be installed — confirm against
# the code that calls torch.load on these files.
for file_name, split_encodings in (
    ('train_encodings.pt', train_encodings),
    ('val_encodings.pt', val_encodings),
    ('test_encodings.pt', test_encodings),
):
    torch.save(split_encodings, os.path.join(tokenized_folder, file_name))

print("\n--- SUCCESS ---")
print("Tokenized data (PyTorch Tensors) saved to 'tokenized_data/' folder.")


--- SUCCESS ---
Tokenized data (PyTorch Tensors) saved to 'tokenized_data/' folder.
