<a href="https://colab.research.google.com/github/nikkizhou/ML/blob/main/MT_Nikki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT for question type classification

In [None]:
%pip install datasets
%pip install transformers

#### 2. Tokenize the Dataset

In [1]:
from transformers import DataCollatorWithPadding,AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import pandas as pd

# 1. Get and reorgnize dataframe
excel_file = 'Categorized_mocks.xlsx'
df = pd.read_excel(excel_file, header=1)
df = df.iloc[:, :18]
df.iloc[3:,3:] = df.iloc[3:,3:].fillna(0)

# 2. load dataset
csv_file = 'temp_dataset.csv'
df.to_csv(csv_file, index=False)
dataset = load_dataset('csv', data_files=csv_file)
dataset['train'] = dataset['train'].select(range(1, len(dataset['train']))) # Remove the Open-Closed row

print(dataset['train'].column_names)




Generating train split: 0 examples [00:00, ? examples/s]

['ID', 'Question', 'Response', 'R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5', 'R2_6']


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from itertools import chain
import torch

# Combine Categories
# df['invitation'] = df[['R2_2B', 'R2_3YN', 'R2_4QP']].sum(axis=1)
# df['directive'] = df[['R2_2B', 'R2_3YN', 'R2_4QP']].sum(axis=1)
# df['option-posing'] = df[['R2_2B', 'R2_3YN', 'R2_4QP']].sum(axis=1)
# df['suggestive'] = df[['R2_2B', 'R2_3YN', 'R2_4QP']].sum(axis=1)

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
label_columns = ['R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5','R2_6']

def tokenize_questions(examples):
    valid_questions = [q if q is not None else '' for q in examples['Question']]
    return tokenizer(valid_questions, padding=True, truncation=True)

# single label
def get_label(examples):
    label = -1
    for col in label_columns:
      try:
        if int(examples[col]) == 1:
          label = label_columns.index(col)
      except (ValueError, TypeError):
        print(f"Error processing column: {col}, column value: {examples[col]}, Question: {examples['Question']}")
        label = -1
    # if label == -1:
    #     print(f"No label found for Question: {examples['Question']}")
    examples['labels'] = torch.tensor(label)
    return examples

# multi label
# def encode_labels(examples):
#     labels = []
#     for col in label_columns:
#         try:
#             labels.append(int(examples[col]))  # Try to parse to int
#         except (ValueError, TypeError):
#             labels.append(0)  # Set to 0 if parsing fails
#     examples['labels'] = labels
#     return examples


# 1. Tokenize the questions
tokenized_dataset = dataset.map(tokenize_questions, batched=True)
# 2. Process labels
processed_datasets = tokenized_dataset.map(get_label, batched=False)

# 3. Remove unnecessary columns
other_columns_to_remove = ['ID', 'Response','Question']
# other_columns_to_remove = ['ID', 'Response']
processed_datasets = processed_datasets.remove_columns(label_columns + other_columns_to_remove)

# 4. Remove rows with no label
print(str(len(processed_datasets['train'])) + " rows including unlabled rows")
processed_datasets = processed_datasets.filter(lambda example: example['labels'] != -1)

print(str(len(processed_datasets['train']))+ " rows with labled rows")
print("Column names: "+ str(processed_datasets['train'].column_names))
print(processed_datasets['train']['labels'][:100])

# Many questions shows label 13, multilabel problem
# HM-HM shows label 1,2,5,13
for i in range(50):
    print(f"Question: {dataset['train'][i]['Question']}, Label: {processed_datasets['train'][i]['labels']}")




Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Map:   0%|          | 0/2745 [00:00<?, ? examples/s]

Error processing column: R2_2B, column value:  , Question: OKAY THEN WHAT HAPPENED NEXT
2745 rows including unlabled rows


Filter:   0%|          | 0/2745 [00:00<?, ? examples/s]

2715 rows with labled rows
Column names: ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
[0, 13, 0, 13, 13, 13, 13, 1, 13, 13, 2, 13, 13, 1, 13, 1, 13, 13, 1, 13, 1, 13, 3, 5, 4, 13, 5, 4, 5, 5, 5, 0, 13, 5, 5, 13, 13, 0, 10, 0, 13, 13, 2, 13, 1, 13, 1, 13, 1, 1, 13, 13, 13, 2, 4, 4, 2, 5, 3, 1, 5, 4, 1, 5, 3, 5, 13, 4, 4, 13, 3, 5, 5, 13, 5, 5, 13, 0, 0, 13, 5, 2, 0, 13, 4, 13, 4, 13, 13, 4, 13, 2, 13, 2, 13, 1, 13, 1, 13, 1]
Question: JESSICA TELL ME WHAT YOU'VE COME TO TALK TO ME ABOUT TODAY, Label: 0
Question: YES ABOUT CHURCH CAN YOU TELL ME WHEN YOU LAST WENT TO CHURCH EVERYTHING THAT HAPPENED THAT DAY, Label: 13
Question: START FROM THE BEGINNING, Label: 0
Question: YEAH, Label: 13
Question: AH-HA, Label: 13
Question: YEAH, Label: 13
Question: OKAY THEN WHAT HAPPENED NEXT, Label: 13
Question: OKAY, Label: 1
Question: OKAY AND THEN WHAT DID YOU DO, Label: 13
Question: YEAH, Label: 13
Question: HM-HM, Label: 2
Question: OKAY SO TELL ME ABOUT THAT PART, Label: 13
Questi

#### Using Dataset (failed)

In [4]:

# !pip install transformers datasets pandas torch
# import pandas as pd
# from datasets import load_dataset, Dataset
# from torch.utils.data import DataLoader
# from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# # Step 1: Load the dataset
# excel_file = 'Categorized_mocks.xlsx'
# df = pd.read_excel(excel_file, header=1)
# df = df.iloc[:, :18]
# csv_file = 'temp_dataset.csv'
# df.to_csv(csv_file, index=False)
# dataset = load_dataset('csv', data_files=csv_file)

# # Step 2: Define the custom dataset
# class QuestionTypeDataset(Dataset):
#     def __init__(self, dataframe):
#         self.dataframe = dataframe
#         self.label_columns = ['R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3',
#                               'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL',
#                               'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV',
#                               'R2_5', 'R2_6']

#     def process_label(self,item):
#       for col in self.label_columns:
#           if item[col] == 1:
#             return self.label_columns.index(col)
#       return -1  # Return -1 for no label found

#     def __len__(self):
#         return len(self.dataframe)

#     def __getitem__(self, idx):
#         item = self.dataframe[idx] # get item from dataset
#         question = item['Question']
#         label = self.process_label(item)

#         return {'question': question, # Return the question and label
#                 'label': label}

# # Step 3: Tokenization
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# def tokenize_function(examples):
#   return tokenizer(examples['question'], padding=True, truncation=True, max_length=128)

# # Step 4: Split the dataset into training and test sets
# dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

# # Step 5: Tokenize the dataset
# tokenized_dataset = dataset.map(tokenize_function, batched=True,remove_columns=['ID', 'Response'])

# # Step 6: Convert to pytorch Dataset
# train_dataset = QuestionTypeDataset(tokenized_dataset['train'])
# test_dataset = QuestionTypeDataset(tokenized_dataset['test']) # Create a test dataset

# # Step 7: Define the BERT model
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(train_dataset.label_columns))

# # Step 8: Setup Trainer
# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     num_train_epochs=3,
# )

# # Step 9: Define data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # Use DataCollatorWithPadding

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset, # Add test dataset to the trainer
#     data_collator=data_collator, # Use data collator for padding
# )

# # Step 10: Train the model
# trainer.train()

In [None]:
# from torch.utils.data import Dataset

# class QuestionTypeDataset(Dataset):

#     def __init__(self, data, tokenizer):
#         self.data = data
#         self.tokenizer = tokenizer

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         # print(self.data )
#         #  Dataset({
#         #     features: ['ID', 'Question', 'Response', 'R2-1', 'R2_2B', 'R2_2D', 'R2_2SD', 'R2_3', 'R2_3YN', 'R2_OP', 'R2_4QG', 'R2_4QL', 'R2_4QP', 'R2_4QR', 'R2_4QI', 'R2_4QV', 'R2_5', 'R2_6'],
#         #     num_rows: 2196
#         # })
#         # print(self.data[idx] )
#         # {'ID': 'NA0040-PC', 'Question': 'CAN YOU REMEMBER IF IT WAS IF HIS HAIR WAS LONG OR SHORT', 'Response': 'Short', 'R2-1': 0, 'R2_2B': '0', 'R2_2D': 0, 'R2_2SD': 0, 'R2_3': 0, 'R2_3YN': 1, 'R2_OP': 0, 'R2_4QG': 0, 'R2_4QL': 0, 'R2_4QP': 0, 'R2_4QR': 0, 'R2_4QI': 0, 'R2_4QV': 0, 'R2_5': 0, 'R2_6': 0}

#         item = self.data[idx]
#         label = self.get_label(item)

#         # Tokenize the text
#         question = self.tokenize_questions(item['Question'])

#         # Create a dictionary with input_ids and labels
#         return {'question':question, 'label':label}
#         # return {'input_ids':item['input_ids'],
#         #         'attention_mask':item['attention_mask'],
#         #         'label': self.process_label(item)}


#     def tokenize_questions(self,question):
#         question = question if question is not None else ''
#         # valid_question = [q if q is not None else '' for q in questions]
#         return tokenizer(question, padding=True, truncation=True)


#     def get_label(self,item):
#         for col in label_columns:
#             if item[col] is not None and item[col]=='' and int(item[col]) == 1:
#               return col
#         return ''


# train_dataset = QuestionTypeDataset(dataset['train'], tokenizer)
# eval_dataset = QuestionTypeDataset(dataset['test'], tokenizer)


#### 3. Prepare DataLoader
Convert the tokenized dataset into a PyTorch DataLoader for training and evaluation.

In [5]:
from torch.utils.data import DataLoader

# 1. Split the dataset into training and test sets
processed_datasets = processed_datasets['train'].train_test_split(test_size=0.2, seed=42)

# 2. Get data_collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 3. Get train_dataloader and eval_dataloader
train_dataloader = DataLoader(processed_datasets['train'], shuffle=True, batch_size=4, collate_fn=data_collator)
eval_dataloader = DataLoader(processed_datasets['test'], batch_size=4, collate_fn=data_collator)


#### 4. Set Up the Optimizer and Learning Rate Scheduler

In [7]:
from transformers import AdamW, get_scheduler
import torch.optim as optim

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=15)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 5. Training Loop

##### 5.1 Training Method 1

In [8]:
import transformers


trainer = transformers.Trainer(
    model=model,
    train_dataset=processed_datasets['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        max_steps=200,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir='outputs'
    ),
    data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
model.config.problem_type = "single_label_classification"
trainer.train()

max_steps is given, it will override any value given in num_train_epochs


  0%|          | 0/200 [00:00<?, ?it/s]

{'loss': 2.667, 'grad_norm': 8.667946815490723, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.01}
{'loss': 2.6866, 'grad_norm': 11.326203346252441, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}
{'loss': 2.6993, 'grad_norm': 6.997357368469238, 'learning_rate': 6e-06, 'epoch': 0.02}
{'loss': 2.5443, 'grad_norm': 8.685699462890625, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.03}
{'loss': 2.583, 'grad_norm': 9.736553192138672, 'learning_rate': 1e-05, 'epoch': 0.04}
{'loss': 2.6942, 'grad_norm': 5.9877424240112305, 'learning_rate': 1.2e-05, 'epoch': 0.04}
{'loss': 2.5588, 'grad_norm': 8.175559043884277, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.05}
{'loss': 2.6223, 'grad_norm': 7.304416656494141, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.06}
{'loss': 2.6514, 'grad_norm': 7.839000701904297, 'learning_rate': 1.8e-05, 'epoch': 0.07}
{'loss': 2.7698, 'grad_norm': 9.896292686462402, 'learning_rate': 2e-05, 'epoch': 0.07}
{'loss': 2.545, 'grad_norm': 8.04

TrainOutput(global_step=200, training_loss=1.2303097104281187, metrics={'train_runtime': 1694.4367, 'train_samples_per_second': 1.889, 'train_steps_per_second': 0.118, 'total_flos': 178739044653312.0, 'train_loss': 1.2303097104281187, 'epoch': 1.4732965009208103})

##### 5.2 Training Method 2

In [1]:
from tqdm.auto import tqdm
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=15)
model.to(device)
model.config.problem_type = "single_label_classification"

progress_bar = tqdm(range(num_training_steps))

# Option 1:
# for epoch in range(num_epochs):
#     model.train()
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()

#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)


# Option 2: Accumulate Gradients
gradient_accumulation_steps = 8 # Example

for epoch in range(num_epochs):
    model.train()
    for i, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps # Normalize loss
        loss.backward()

        if (i+1) % gradient_accumulation_steps == 0: # Only update gradients every gradient_accumulation_steps
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        progress_bar.update(1)



NameError: name 'AutoModelForSequenceClassification' is not defined

#### 6. Evaluation Loop

In [9]:
model.eval()
correct = 0
total = 0
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        labels = batch["labels"]
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

print(f"Accuracy: {correct / total:.4f}")


NameError: name 'device' is not defined