<a href="https://colab.research.google.com/github/nikkizhou/ML/blob/main/MT_Nikki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT for question type classification

In [None]:
# Install the two Hugging Face packages imported later in this notebook
# (`datasets` and `transformers`).
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases.
%pip install datasets
%pip install transformers

#### 1–2. Load, Label, and Tokenize the Dataset

In [88]:
from transformers import DataCollatorWithPadding,AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import pandas as pd
import torch

# 1. Get and reorganize dataframe
excel_file = 'Categorized_mocks.xlsx'
# header=1: the column names are taken from the second row of the sheet.
df = pd.read_excel(excel_file, header=1)
# Keep only the first 18 columns.
# NOTE(review): presumably ID, Question, Response plus the 15 label columns -- confirm against the sheet.
df = df.iloc[:, :18]
# Replace missing cells with 0, but only from the fourth row and fourth column onward.
df.iloc[3:,3:] = df.iloc[3:,3:].fillna(0)
df = df.drop(df.index[0])  # Remove the Open-Closed row

# 2. Process labels
# Label columns scanned by find_first_label; a column's position in this list is used as its class id.
label_columns = ['R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5','R2_6']
# Coerce every label cell to a number; unparseable values become NaN and are then set to 0.
df[label_columns] = df[label_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

def find_first_label(row, columns=None):
    """Return the class id of the first label column whose value is 1.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row or a dict).
        columns: Ordered label column names to scan. Defaults to the
            notebook-level ``label_columns`` list.

    Returns:
        The position within ``columns`` of the first column equal to 1, or
        ``None`` when no column is set.
    """
    if columns is None:
        columns = label_columns
    # enumerate provides the position directly, avoiding a list.index() rescan per hit.
    for position, col in enumerate(columns):
        if row[col] == 1:
            return position
    return None  # Return None if no 1 is found

# Derive one class id per row from the label columns (None when no column is 1).
df['labels'] = df.apply(find_first_label, axis=1)
# Keep only labelled rows and the two columns used for training. Selecting with
# a single .loc call and taking an explicit copy gives an independent frame, so
# the dtype assignment below does not write into a slice of the previous frame
# (which triggers pandas' SettingWithCopyWarning).
df = df.loc[df['labels'].notna(), ['Question', 'labels']].copy()
df['labels'] = df['labels'].astype(int)
print(df.columns)

# 3. Load dataset
# Round-trip through a CSV file; the loaded object exposes the rows under the 'train' split.
csv_file = 'temp_dataset.csv'
df.to_csv(csv_file, index=False)
dataset = load_dataset('csv', data_files=csv_file)

print(dataset['train'].column_names)

Index(['Question', 'labels'], dtype='object')


Generating train split: 0 examples [00:00, ? examples/s]

['Question', 'labels']


In [90]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from itertools import chain
import torch

# Combine Categories
# Fine-grained codes, in label order:
# 'R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5','R2_6'
category_groups = {
    'invitation': ['R2-1'],
    'directive': ['R2_2B', 'R2_2D', 'R2_2SD'],
    'option-posing': ['R2_3', 'R2_3YN', 'R2_OP'],
    'suggestive': ['R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV'],
}
for category, member_columns in category_groups.items():
    if set(member_columns).issubset(df.columns):
        # The raw indicator columns are still present: sum them per row.
        df[category] = df[member_columns].sum(axis=1)
    else:
        # The loading cell reduces df to ['Question', 'labels'], so on a fresh
        # run the raw columns are gone and indexing them raises KeyError.
        # Derive the flag from the class id stored in 'labels' instead.
        member_ids = [label_columns.index(col) for col in member_columns]
        df[category] = df['labels'].isin(member_ids).astype(int)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_questions(examples):
    """Tokenize one batch of questions with the notebook-level ``tokenizer``.

    Args:
        examples: Batch dict passed by ``dataset.map(..., batched=True)``; its
            'Question' entry is a list with one item per row.

    Returns:
        The tokenizer output for the batch, with truncation enabled.
    """
    # A missing question may arrive as None after the CSV round-trip; replace it
    # with an empty string so the tokenizer always receives str inputs.
    questions = [q if q is not None else '' for q in examples['Question']]
    # No padding here: the DataCollatorWithPadding used by the DataLoaders and
    # the Trainer pads each training batch to its own longest sequence.
    return tokenizer(questions, truncation=True)

# 1. Tokenize the questions
tokenize_datasets = dataset.map(tokenize_questions, batched=True)
# The raw text column is dropped so only the tokenizer outputs and labels remain.
processed_datasets = tokenize_datasets.remove_columns(['Question'])

print("Column names: "+ str(processed_datasets['train'].column_names))

# Preview at most 50 examples; min() avoids an IndexError on a smaller dataset.
preview_count = min(50, len(processed_datasets['train']))
for i in range(preview_count):
    question = dataset['train'][i]['Question']
    label = processed_datasets['train'][i]['labels']
    print(f"Question: {question}, Label: {label}, Type:{type(label)}")


Column names: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Question: JESSICA TELL ME WHAT YOU'VE COME TO TALK TO ME ABOUT TODAY, Label: 0, Type:<class 'int'>
Question: YES ABOUT CHURCH CAN YOU TELL ME WHEN YOU LAST WENT TO CHURCH EVERYTHING THAT HAPPENED THAT DAY, Label: 5, Type:<class 'int'>
Question: START FROM THE BEGINNING, Label: 0, Type:<class 'int'>
Question: YEAH, Label: 13, Type:<class 'int'>
Question: AH-HA, Label: 13, Type:<class 'int'>
Question: YEAH, Label: 13, Type:<class 'int'>
Question: OKAY, Label: 13, Type:<class 'int'>
Question: OKAY AND THEN WHAT DID YOU DO, Label: 1, Type:<class 'int'>
Question: YEAH, Label: 13, Type:<class 'int'>
Question: HM-HM, Label: 13, Type:<class 'int'>
Question: OKAY SO TELL ME ABOUT THAT PART, Label: 2, Type:<class 'int'>
Question: YEAH, Label: 13, Type:<class 'int'>
Question: OKAY, Label: 13, Type:<class 'int'>
Question: OKAY AND WHAT HAPPENED NEXT, Label: 1, Type:<class 'int'>
Question: YEAH, Label: 13, Type:<class 'int'>


#### Alternative: custom `Dataset` class (failed attempt, kept commented out for reference)

In [4]:

# !pip install transformers datasets pandas torch
# import pandas as pd
# from datasets import load_dataset, Dataset
# from torch.utils.data import DataLoader
# from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# # Step 1: Load the dataset
# excel_file = 'Categorized_mocks.xlsx'
# df = pd.read_excel(excel_file, header=1)
# df = df.iloc[:, :18]
# csv_file = 'temp_dataset.csv'
# df.to_csv(csv_file, index=False)
# dataset = load_dataset('csv', data_files=csv_file)

# # Step 2: Define the custom dataset
# class QuestionTypeDataset(Dataset):
#     def __init__(self, dataframe):
#         self.dataframe = dataframe
#         self.label_columns = ['R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3',
#                               'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL',
#                               'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV',
#                               'R2_5', 'R2_6']

#     def process_label(self,item):
#       for col in self.label_columns:
#           if item[col] == 1:
#             return self.label_columns.index(col)
#       return -1  # Return -1 for no label found

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         item = self.dataframe[idx] # get item from dataset
#         question = item['Question']
#         label = self.process_label(item)

#         return {'question': question, # Return the question and label
#                 'label': label}

# # Step 3: Tokenization
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# def tokenize_function(examples):
#   return tokenizer(examples['question'], padding=True, truncation=True, max_length=128)

# # Step 4: Split the dataset into training and test sets
# dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# # Step 5: Tokenize the dataset
# tokenized_dataset = dataset.map(tokenize_function, batched=True,remove_columns=['ID', 'Response'])

# # Step 6: Convert to pytorch Dataset
# train_dataset = QuestionTypeDataset(tokenized_dataset['train'])
# test_dataset = QuestionTypeDataset(tokenized_dataset['test']) # Create a test dataset

# # Step 7: Define the BERT model
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_dataset.label_columns))

# # Step 8: Setup Trainer
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     num_train_epochs=3,
# )

# # Step 9: Define data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Use DataCollatorWithPadding

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset, # Add test dataset to the trainer
#     data_collator=data_collator, # Use data collator for padding
# )

# # Step 10: Train the model
# trainer.train()

In [None]:
# from torch.utils.data import Dataset

# class QuestionTypeDataset(Dataset):

#     def __init__(self, data, tokenizer):
#         self.data = data
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         # print(self.data )
#         #  Dataset({
#         #     features: ['ID', 'Question', 'Response', 'R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5', 'R2_6'],
#         #     num_rows: 2196
#         # })
#         # print(self.data[idx] )
#         # {'ID': 'NA0040-PC', 'Question': 'CAN YOU REMEMBER IF IT WAS IF HIS HAIR WAS LONG OR SHORT', 'Response': 'Short', 'R2-1': 0, 'R2_2B': '0', 'R2_2D': 0, 'R2_2SD': 0, 'R2_3': 0, 'R2_3YN': 1, 'R2_OP': 0, 'R2_4QG': 0, 'R2_4QL': 0, 'R2_4QP': 0, 'R2_4QR': 0, 'R2_4QI': 0, 'R2_4QV': 0, 'R2_5': 0, 'R2_6': 0}

#         item = self.data[idx]
#         label = self.get_label(item)

#         # Tokenize the text
#         question = self.tokenize_questions(item['Question'])

#         # Create a dictionary with input_ids and labels
#         return {'question':question, 'label':label}
#         # return {'input_ids':item['input_ids'],
#         #         'attention_mask':item['attention_mask'],
#         #         'label': self.process_label(item)}


#     def tokenize_questions(self,question):
#         question = question if question is not None else ''
#         # valid_question = [q if q is not None else '' for q in questions]
#         return tokenizer(question, padding=True, truncation=True)


#     def get_label(self,item):
#         for col in label_columns:
#             if item[col] is not None and item[col]=='' and int(item[col]) == 1:
#               return col
#         return ''


# train_dataset = QuestionTypeDataset(dataset['train'], tokenizer)
# eval_dataset = QuestionTypeDataset(dataset['test'], tokenizer)


#### 3. Prepare DataLoader
Convert the tokenized dataset into a PyTorch DataLoader for training and evaluation.

In [91]:
from torch.utils.data import DataLoader

# 1. Split the dataset into training and test sets
# Split from `tokenize_datasets` (which is never reassigned) rather than from
# `processed_datasets` itself: re-splitting `processed_datasets['train']` in
# place would shrink the training split further on every re-run of this cell.
processed_datasets = (
    tokenize_datasets['train']
    .remove_columns(['Question'])
    .train_test_split(test_size=0.2, seed=42)
)

# 2. Get data_collator
# Pads every batch to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Get train_dataloader and eval_dataloader
train_dataloader = DataLoader(processed_datasets['train'], shuffle=True, batch_size=4, collate_fn=data_collator)
eval_dataloader = DataLoader(processed_datasets['test'], batch_size=4, collate_fn=data_collator)


#### 4. Set Up the Optimizer and Learning Rate Scheduler

In [92]:
from transformers import AdamW, get_scheduler
import torch.optim as optim

model_name = "bert-base-uncased"
# Size the classification head from the label list so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_columns))

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
# One scheduler step per training batch over all epochs.
num_training_steps = num_epochs * len(train_dataloader)
# Linear decay of the learning rate to zero, without warmup.
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 5. Training Loop

##### 5.1 Training Method 1
Uses the Hugging Face `Trainer` API: more abstraction and easier to use.
Gradient accumulation, FP16 training, warmup steps, and learning-rate scheduling are handled automatically by setting options in `TrainingArguments`.
It also supports distributed training (multi-GPU or TPU) with minimal extra code.

In [93]:
import transformers

# Fine-tune with the Hugging Face Trainer. Effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 16 per device.
trainer = transformers.Trainer(
    model=model,
    train_dataset=processed_datasets['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=200,  # overrides num_train_epochs
        learning_rate=2e-4,
        # Enable mixed precision only when a CUDA GPU is available, so the
        # cell also runs on machines without one.
        fp16=torch.cuda.is_available(),
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
# Integer class ids in 'labels': one class per example.
model.config.problem_type = "single_label_classification"
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 2.7772, 'grad_norm': 7.573460578918457, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 2.8166, 'grad_norm': 8.805163383483887, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}
{'loss': 2.8177, 'grad_norm': 7.14774227142334, 'learning_rate': 6e-06, 'epoch': 0.02}
{'loss': 2.7193, 'grad_norm': 8.024197578430176, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.03}
{'loss': 2.6928, 'grad_norm': 9.03019905090332, 'learning_rate': 1e-05, 'epoch': 0.04}
{'loss': 2.7144, 'grad_norm': 7.009313583374023, 'learning_rate': 1.2e-05, 'epoch': 0.04}
{'loss': 2.5889, 'grad_norm': 9.484980583190918, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.05}
{'loss': 2.7085, 'grad_norm': 7.110121726989746, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.06}
{'loss': 2.6835, 'grad_norm': 8.49590015411377, 'learning_rate': 1.8e-05, 'epoch': 0.07}
{'loss': 2.7413, 'grad_norm': 9.887998580932617, 'learning_rate': 2e-05, 'epoch': 0.07}
{'loss': 2.5998, 'grad_norm': 7.8470

TrainOutput(global_step=200, training_loss=1.1620804461091756, metrics={'train_runtime': 2509.4355, 'train_samples_per_second': 1.275, 'train_steps_per_second': 0.08, 'total_flos': 178985740059072.0, 'train_loss': 1.1620804461091756, 'epoch': 1.4732965009208103})

##### 5.2 Training Method 2
A hand-written PyTorch training loop: more flexibility and control.
It makes it easier to implement custom features such as custom loss functions, dynamic learning rates, or non-standard update schedules.

In [32]:
from tqdm.auto import tqdm
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_columns))
model.to(device)
model.config.problem_type = "single_label_classification"

# Option 2: Accumulate Gradients
gradient_accumulation_steps = 8 # Example

# The optimizer and scheduler have to be built for the model created in this
# cell. Reusing the ones from the setup cell would step the parameters of the
# previous model object and leave this one untrained.
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
# Number of optimizer updates per epoch: ceil(batches / accumulation steps).
updates_per_epoch = -(-len(train_dataloader) // gradient_accumulation_steps)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0,
    num_training_steps=num_epochs * updates_per_epoch
)

# The progress bar advances once per batch, not once per optimizer update.
total_batches = num_epochs * len(train_dataloader)
progress_bar = tqdm(range(total_batches))

optimizer.zero_grad()
for epoch in range(num_epochs):
    model.train()
    batches_in_epoch = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps # Normalize loss
        loss.backward()

        # Update every gradient_accumulation_steps batches, and also on the
        # last batch of the epoch so leftover gradients are not carried over.
        is_last_batch = (i + 1) == batches_in_epoch
        if (i + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1629 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`Question` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

#### 6. Evaluation Loop

In [94]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Put the model on the same device the batches are moved to below.
model.to(device)
model.eval()
correct = 0
total = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

# Guard against an empty evaluation set instead of dividing by zero.
if total:
    print(f"Accuracy: {correct / total:.4f}")
else:
    print("Accuracy: n/a (evaluation set is empty)")


Accuracy: 0.7901
