In [2]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
import numpy as np
import time

In [1]:
!pip install pandas scikit-learn torch transformers

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# prepare data
import pandas as pd
import ast

df = pd.read_csv(r"E:\Sem 7 Project\Datasets\Preprocessed Datasets\output_part_1.csv")
df['question_text'] = df['question_text'].apply(ast.literal_eval).apply(lambda x: ' '.join(x))
df['topic'] = df['topic'].apply(ast.literal_eval).apply(lambda x: x[0] if len(x) > 0 else '')
print(df[['question_text', 'topic']].head())

                                       question_text      topic
0  organ choos concern eye stage three point desp...    quantum
1  certainli turn marriag consid environ school p...     climat
2                       marriag best film dream wish  psycholog
3  guess soon hair compani letter fund mission me...    histori
4  know say congress view wear amount money succe...     machin


In [10]:
# encode labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['topic'])
num_labels = len(le.classes_)
print(df[['topic', 'label']].drop_duplicates().sort_values('label'))

               topic  label
6           artifici      0
28        blockchain      1
10            career      2
1             climat      3
5         cybersecur      4
39              data      5
17             digit      6
9               educ      7
22  entrepreneurship      8
7        environment      9
69            health     10
3            histori     11
4             machin     12
31            person     13
2          psycholog     14
0            quantum     15
20             robot     16
18          sociolog     17
8              space     18
14               web     19


In [11]:
# train
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['question_text'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(len(train_texts), len(val_texts))

160 40


In [12]:
# tokenize text
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts, max_length=64):
    return tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

train_encodings = tokenize(train_texts)
val_encodings = tokenize(val_texts)
print(train_encodings.keys())

  from .autonotebook import tqdm as notebook_tqdm


KeysView({'input_ids': tensor([[  101, 17935,  3775,  ...,     0,     0,     0],
        [  101,  3813,  2529,  ...,     0,     0,     0],
        [  101,  2025,  2594,  ...,     0,     0,     0],
        ...,
        [  101,  2530,  3519,  ...,     0,     0,     0],
        [  101,  2345,  5770,  ...,     0,     0,     0],
        [  101,  3693,  3095,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])})


In [13]:
# datasets object
import torch

class BertDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = BertDataset(train_encodings, train_labels)
val_dataset = BertDataset(val_encodings, val_labels)
print(len(train_dataset), len(val_dataset))

160 40


In [15]:
!pip install --upgrade transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.3 MB 6.4 MB/s eta 0:00:02
   ----- ---------------------------------- 1.6/11.3 MB 3.8 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.3 MB 4.1 MB/s eta 0:00:03
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/11.3 MB 4.1 MB/s eta 0:00:02
   -----


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
# model training
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,3.0856
20,3.0247
30,2.9608
40,2.9046
50,2.8102
60,2.8304


TrainOutput(global_step=60, training_loss=2.9360512097676597, metrics={'train_runtime': 390.6977, 'train_samples_per_second': 1.229, 'train_steps_per_second': 0.154, 'total_flos': 15789214679040.0, 'train_loss': 2.9360512097676597, 'epoch': 3.0})

In [20]:
# model training - 2
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,  # Try 10–20
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,3.0116
20,3.0898
30,2.9946
40,2.9725
50,2.9174
60,2.9823
70,2.8339
80,2.8503
90,2.6066
100,2.6508


TrainOutput(global_step=300, training_loss=1.9493124421437582, metrics={'train_runtime': 3212.7479, 'train_samples_per_second': 0.747, 'train_steps_per_second': 0.093, 'total_flos': 78946073395200.0, 'train_loss': 1.9493124421437582, 'epoch': 15.0})

In [None]:
# evaluation - 1
print(trainer.evaluate())



{'eval_loss': 2.983961582183838, 'eval_runtime': 12.5713, 'eval_samples_per_second': 3.182, 'eval_steps_per_second': 0.398, 'epoch': 3.0}


In [21]:
# evaluation - 2
print(trainer.evaluate())



{'eval_loss': 3.2296111583709717, 'eval_runtime': 23.7273, 'eval_samples_per_second': 1.686, 'eval_steps_per_second': 0.211, 'epoch': 15.0}


In [None]:
# Prediction
            # approach - 1
# test_texts = ["example question text here"]
# test_encodings = tokenize(test_texts)
# with torch.no_grad():
#     outputs = model(**{k: v for k, v in test_encodings.items()})
#     probs = torch.softmax(outputs.logits, dim=1)
#     pred_label = probs.argmax(dim=1).item()
#     pred_topic = le.inverse_transform([pred_label])[0]
#     print("Predicted topic:", pred_topic)


            # approach - 2
# Example: test_texts = ["Your first question here", "Your second question here"]
# test_texts = ["organ choos concern eye stage three point despit senior serv citizen idea u"]
test_texts = ["marriag best film dream wish"]
# Tokenize input (make sure to use the same tokenizer and max_length as in training)
test_encodings = tokenizer(
    test_texts,
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors='pt'
)

# Set model to evaluation mode
model.eval()

with torch.no_grad():
    outputs = model(**{k: v for k, v in test_encodings.items()})
    probs = torch.softmax(outputs.logits, dim=1)
    pred_labels = probs.argmax(dim=1).cpu().numpy()
    pred_topics = le.inverse_transform(pred_labels)

for question, topic in zip(test_texts, pred_topics):
    print(f"Question: {question}")
    print(f"Predicted topic: {topic}\n")

Question: marriag best film dream wish
Predicted topic: person

