# Question Type Classifier Training

This notebook was derived from the Hugging Face example here: https://huggingface.co/transformers/custom_datasets.html

In [None]:
!pip install transformers
!pip install pandas
!pip install sklearn
!pip install datasets

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 37.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 1.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 30.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In [None]:
import torch
import pandas as pd
from transformers import DistilBertTokenizerFast     #docs: https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
print("done")

done


In [None]:
# Mount Google Drive inside the Colab VM, then load the questions CSV from it.
# Background reading: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from google.colab import drive

drive_mount_point = '/content/xdrive'
drive.mount(drive_mount_point, force_remount=True)

questions_csv = "/content/xdrive/MyDrive/qcat_huggingface/data_4.csv"
df = pd.read_csv(questions_csv)

Mounted at /content/xdrive


In [None]:
# FILTER THE DATA

# Column holding the class label to train on. It is declared first so that every
# step below (null filtering, label extraction) works on the same column.
labelColumn = 'best_guess_label'
#labelColumn = 'parent_label'

# Trim out all rows without a label in the selected column. Filtering on
# labelColumn (instead of a hard-coded 'best_guess_label') keeps NaN labels out
# of the label set when the other label column is selected.
df = df[df[labelColumn].notnull()]

# remove classes that have fewer instances than some threshold
# threshold = 5
# col = 'best_guess_label'
# counts = df[col].value_counts()
# df = df.loc[df[col].isin(counts[counts > threshold].index), :]

labeledData = df

# FILTER: keep just the selected labels
# keep_list = ['qualitative_property_retrieval', 'opinion']
# labeledData = labeledData.loc[df['best_guess_label'].isin(keep_list)]

# FILTER: leave out the selected labels
#filter_out_list = ["opinion", "causal_explanation"]
#labeledData = labeledData.loc[~df['best_guess_label'].isin(filter_out_list)]

# get lists of text and labels
all_questions = labeledData["question"].tolist()
all_labels = labeledData[labelColumn].tolist()  # labels come from whichever column labelColumn selects

# sanity check: one label per question, then a quick summary of the label set
assert len(all_questions) == len(all_labels)
print(set(all_labels))
print("NUM CLASSES: " , len(set(all_labels)))
print("NUM QUESTIONS: ", len(all_questions))
labeledData.head(20)

{'set_difference', 'boolean_or', 'boolean_and', 'boolean_retrieval', 'set_retrieval', 'range', 'counting', 'opinion', 'average', 'set_property_satisfaction', 'standard_deviation', 'set_intersection', 'causal_explanation', 'mode', 'set_union', 'qualitative_property_retrieval', 'numeric_comparison', 'mathematical_comparison', 'qualitative_property_multihop_retrieval', 'qualitative_comparison', 'numeric_retrieval', 'correlation', 'superlative', 'arithmetic', 'definitional', 'datetime_comparison', 'median', 'datetime_retrieval'}
NUM CLASSES:  28
NUM QUESTIONS:  858


Unnamed: 0,info,question,best_guess_label,parent_label
0,break_0,what flights are available tomorrow from denve...,set_property_satisfaction,comparison
1,break_1,show me the afternoon flights from washington ...,set_property_satisfaction,comparison
2,break_2,show me the flights from atlanta to baltimore,set_property_satisfaction,comparison
3,break_3,i want a flight from houston to memphis on tue...,qualitative_property_retrieval,retrieval
4,break_4,what are the cheapest one way flights from atl...,superlative,comparison
5,break_5,what ground transportation is available from t...,qualitative_property_multihop_retrieval,retrieval
6,break_6,flight information from san francisco to pitts...,qualitative_property_retrieval,retrieval
7,break_7,what flights are available from san francisco ...,set_property_satisfaction,comparison
8,break_9,i'm traveling from boston to atlanta and i'd l...,qualitative_property_multihop_retrieval,retrieval
9,break_11,what does ff mean,definitional,definitional


In [None]:
# Class balance: number of rows carrying each label in the selected label column.
label_counts = labeledData[labelColumn].value_counts()
label_counts

qualitative_property_retrieval             105
opinion                                     57
superlative                                 56
boolean_retrieval                           45
set_property_satisfaction                   43
causal_explanation                          40
definitional                                38
numeric_retrieval                           37
numeric_comparison                          35
qualitative_comparison                      32
boolean_and                                 30
boolean_or                                  30
datetime_retrieval                          28
arithmetic                                  27
set_union                                   25
set_retrieval                               24
qualitative_property_multihop_retrieval     23
set_intersection                            21
set_difference                              20
datetime_comparison                         20
average                                     20
mode         

In [None]:
# Convert the label strings to integers so PyTorch can use them

# Build the {label: int} mapping from the *sorted* unique labels. Iterating a bare
# set of strings has no guaranteed order (string hashing is randomised per
# interpreter process), so the ids could change between kernel restarts and stop
# matching a model that was trained and saved in an earlier session.
unique_labels = sorted(set(all_labels))
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}
int_to_label = {idx: label for label, idx in label_to_int.items()}  # inverse mapping, used later to decode predictions

# Encode every question's label with the mapping above (same order as all_questions).
integer_labels = [label_to_int[label] for label in all_labels]
assert len(integer_labels) == len(all_labels)

print(label_to_int)

{'set_difference': 0, 'boolean_or': 1, 'boolean_and': 2, 'boolean_retrieval': 3, 'set_retrieval': 4, 'range': 5, 'counting': 6, 'opinion': 7, 'average': 8, 'set_property_satisfaction': 9, 'standard_deviation': 10, 'set_intersection': 11, 'causal_explanation': 12, 'mode': 13, 'set_union': 14, 'qualitative_property_retrieval': 15, 'numeric_comparison': 16, 'mathematical_comparison': 17, 'qualitative_property_multihop_retrieval': 18, 'qualitative_comparison': 19, 'numeric_retrieval': 20, 'correlation': 21, 'superlative': 22, 'arithmetic': 23, 'definitional': 24, 'datetime_comparison': 25, 'median': 26, 'datetime_retrieval': 27}


In [None]:
# Split the dataset into train/val/test sets
from sklearn.model_selection import train_test_split

# Stage 1: hold out 30% of the questions (stratified by label); the other 70% is the training set.
X_train, X_almost_test, y_train, y_almost_test = train_test_split(
    all_questions,
    integer_labels,
    test_size=0.3,
    random_state=0,
    stratify=integer_labels,
)

# Stage 2: cut the held-out 30% in half (again stratified) to get the validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_almost_test,
    y_almost_test,
    test_size=0.5,
    random_state=0,
    stratify=y_almost_test,
)

In [None]:
# Load the pretrained tokenizer and encode each data split with it
# model_name = "bert-base-uncased"
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# Encode the data: identical tokenizer settings (truncation + padding) for all three splits
train_encodings, valid_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (X_train, X_valid, X_test)
)

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10c3c92122b827d92eb2d34ce94ee79ba486c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accef

In [None]:
# Build the dataset object
class ATCDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping (anything with ``.items()``) from field name to a
            per-example indexable sequence.
        labels: per-example labels, indexable by position.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label stored under ``'labels'``."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a dataset: training, validation, test (in that order).
train_dataset, val_dataset, test_dataset = (
    ATCDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (valid_encodings, y_valid),
        (test_encodings, y_test),
    )
)

In [None]:
# Train the model: hyperparameters for the Trainer (TrainingArguments is from transformers)
# NOTE(review): the training log in this notebook reports 760 total optimization steps,
# so a 500-step warmup spans most of the run — confirm that this is intended.
trainer_settings = dict(
    output_dir='./results',            # output directory
    num_train_epochs=20,               # total number of training epochs
    per_device_train_batch_size=16,    # batch size per device during training
    per_device_eval_batch_size=64,     # batch size for evaluation
    warmup_steps=500,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                 # strength of weight decay (original: 0.01)
    logging_dir='./logs',              # directory for storing logs
    logging_steps=100,
    evaluation_strategy="epoch",
)
training_args = TrainingArguments(**trainer_settings)
training_args.device

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


device(type='cuda', index=0)

In [None]:
# metrics
import numpy as np
from datasets import load_metric
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a (logits, labels) pair.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the integer class ids.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of examples whose
        arg-max class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy is computed directly with numpy. The earlier version called
    # load_metric() three times on every evaluation pass even though the
    # precision and recall metrics were never used in the returned dict.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    # Per-class precision/recall can be added to the returned dict here if needed.
    return {"accuracy": accuracy}

In [None]:
from transformers import AutoConfig
# Start from the pretrained checkpoint's config and attach both label mappings to it,
# so the classifier's config carries label names alongside the integer ids.
config = AutoConfig.from_pretrained(model_name) # https://stackoverflow.com/questions/66148641/changing-config-and-loading-hugging-face-model-fine-tuned-on-a-downstream-task#:~:text=bert%2Dbase%2Dcased%22-,config%20%3D%20AutoConfig,-.from_pretrained(pretrained_model_name)%0A%0Aid2label
config.label2id = label_to_int
config.id2label = int_to_label

# Alternative that only sets the number of labels (no label names in the config):
# model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=len(label_to_int))
model = DistilBertForSequenceClassification.from_pretrained(model_name, config=config)


# The Trainer trains on train_dataset, evaluates on val_dataset and reports
# whatever compute_metrics returns.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)
print("done")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a

done


In [None]:
# Fine-tune the model using the settings in training_args (which set evaluation_strategy="epoch").
trainer.train()

***** Running training *****
  Num examples = 600
  Num Epochs = 20
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 760


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,3.313643,0.046512
2,No log,3.215835,0.124031
3,3.280100,3.077688,0.124031
4,3.280100,2.889818,0.139535
5,3.280100,2.60777,0.294574
6,2.855200,2.313662,0.434109
7,2.855200,1.996064,0.550388
8,1.935500,1.673851,0.604651
9,1.935500,1.44391,0.620155
10,1.935500,1.252007,0.689922


***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
***** Running Evaluation *****
  Num examples = 129
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/check

TrainOutput(global_step=760, training_loss=1.2050063129318387, metrics={'train_runtime': 128.447, 'train_samples_per_second': 93.424, 'train_steps_per_second': 5.917, 'total_flos': 108715047840000.0, 'train_loss': 1.2050063129318387, 'epoch': 20.0})

In [None]:
# Evaluate model on the VALIDATION dataset
# (no dataset is passed here; val_dataset was supplied to the Trainer as eval_dataset)
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 129
  Batch size = 64


{'epoch': 20.0,
 'eval_accuracy': 0.7131782945736435,
 'eval_loss': 1.3198504447937012,
 'eval_runtime': 0.9724,
 'eval_samples_per_second': 132.665,
 'eval_steps_per_second': 3.085}

In [None]:
# Evaluate the model on the TEST dataset
# The returned object's .metrics attribute is displayed in the next cell.
output = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 129
  Batch size = 64


In [None]:
# Show the metrics attached to the test-set prediction output.
output.metrics

{'test_accuracy': 0.8062015503875969,
 'test_loss': 0.854979395866394,
 'test_runtime': 0.9337,
 'test_samples_per_second': 138.158,
 'test_steps_per_second': 3.213}

In [None]:
# ANALYZE OUTPUTS
# Ideas:
# - see which classes perform well/poorly (export to CSV and inspect it; also compute the % correct per class)
# - train for longer (more epochs)
# - train on fewer classes (just our favorites)
# - also run Naive Bayes again for comparison

# *** IMPORTANT!!
# It's possible that our classes are too similar to distinguish.
# - Train on the parent classes first, then train a separate classifier for each parent class.
# - Do this; it's a good idea.

In [None]:
# Predict labels for the VALIDATION set, add them as a new column next to the
# true labels, then export the table as a new csv.
all_encodings = tokenizer(all_questions, truncation=True, padding=True)
whole_dataset =  ATCDataset(all_encodings, integer_labels)  #FOR TESTING ON ALL DATA

# predictionOutput = trainer.predict(whole_dataset) # this is a PredictionOutput object

# https://stackoverflow.com/a/69374378
from scipy.special import softmax
from numpy import argmax

# whole_dataset
prediction_logits = trainer.predict(val_dataset).predictions # get raw model outputs from PredictionOutput object

# you can softmax the logits first, but it makes no difference when producing a single label
predictions = argmax(prediction_logits, axis=1)
predicted_labels = [int_to_label[p] for p in predictions]

# Decode the true validation labels back to strings and build the results table.
# The true-label column is named after labelColumn so that the comparison below
# (and later per-label reports) keep working whichever label column is selected;
# a hard-coded 'best_guess_label' here raises KeyError once labelColumn changes.
val_labels_str = [int_to_label[p] for p in y_valid]
new_df = pd.DataFrame(list(zip(X_valid, val_labels_str)),
                      columns=['questions', labelColumn])

new_df['predictions'] = predicted_labels
# https://datascience.stackexchange.com/a/30993
new_df['correct'] = new_df['predictions'] == new_df[labelColumn]
new_df['correct_viz'] = new_df['correct'].map({True: '✅', False: '❌'})

# save predictions as a new csv file
new_df.to_csv('/content/xdrive/MyDrive/qcat_huggingface/predictions_debug.csv', index=False)

***** Running Prediction *****
  Num examples = 129
  Batch size = 64


In [None]:
# Spot-check 10 randomly sampled rows of the predictions table.
new_df.sample(10)

Unnamed: 0,questions,best_guess_label,predictions,correct,correct_viz
55,How many total yards did JaMarcus Russell's co...,arithmetic,numeric_retrieval,False,❌
40,what did emily bronte die of?,qualitative_property_retrieval,qualitative_property_retrieval,True,✅
19,what is the biggest mountain?,superlative,superlative,True,✅
31,show me the flights from cleveland to memphis,set_property_satisfaction,set_property_satisfaction,True,✅
114,Which are the best engineering fields?,opinion,opinion,True,✅
56,all members of the royal family,set_retrieval,set_retrieval,True,✅
69,who is richard feynman,definitional,qualitative_property_retrieval,False,❌
105,How long can raw and cooked sausage last refri...,numeric_retrieval,counting,False,❌
81,When can I expect my Cognizant confirmation mail?,datetime_retrieval,datetime_retrieval,True,✅
26,which states are south of the iowa line but no...,set_intersection,set_union,False,❌


In [None]:
# Print a report: for each question type, what fraction of its rows in new_df were predicted correctly?

print("____________% of validation dataset labeled correctly____________")
for label in sorted(set(all_labels)):   # sorted -> stable, reproducible row order
    cur_df = new_df.loc[new_df[labelColumn] == label]  # rows whose true label is this type
    total = len(cur_df)
    # Summing the boolean column counts the True values and is simply 0 when
    # there are none, so no try/except (previously a bare `except:`) is needed.
    num_correct = int(cur_df["correct"].sum())
    # A label can have no rows in new_df at all; report "n/a" instead of dividing by zero.
    fraction = round(num_correct / total, 2) if total else "n/a"

    print("{: <40} {: >10} {: >10}".format(label, f"{num_correct}/{total}", fraction))


____________% of validation dataset labeled correctly____________
correlation                                     0/1        0.0
causal_explanation                              4/6       0.67
boolean_and                                     2/5        0.4
mode                                            3/3        1.0
superlative                                     7/8       0.88
opinion                                         4/9       0.44
set_difference                                  1/1        1.0
set_retrieval                                   1/2        0.5
datetime_retrieval                              3/4       0.75
datetime_comparison                             2/3       0.67
median                                          2/2        1.0
definitional                                    2/3       0.67
boolean_retrieval                               7/7        1.0
qualitative_property_retrieval                13/16       0.81
qualitative_property_multihop_retrieval         2/3 

In [None]:
# Save the model (and its tokenizer) to Google Drive

path = "/content/xdrive/MyDrive/qcat_huggingface/"
model.save_pretrained(path + "pretrained_model/")
# Save the tokenizer next to the model weights so the saved directory is
# self-contained and can be reloaded without the in-memory tokenizer
# (see https://stackoverflow.com/a/64552678).
tokenizer.save_pretrained(path + "pretrained_model/")
trainer.save_model(path + "pretrained_trainer/")

Configuration saved in /content/xdrive/MyDrive/qcat_huggingface/pretrained_model/config.json
Model weights saved in /content/xdrive/MyDrive/qcat_huggingface/pretrained_model/pytorch_model.bin
Saving model checkpoint to /content/xdrive/MyDrive/qcat_huggingface/pretrained_trainer/
Configuration saved in /content/xdrive/MyDrive/qcat_huggingface/pretrained_trainer/config.json
Model weights saved in /content/xdrive/MyDrive/qcat_huggingface/pretrained_trainer/pytorch_model.bin


In [None]:
# load the saved model and test some input string
# https://github.com/huggingface/transformers/issues/7849#issuecomment-709995286
from transformers import TextClassificationPipeline, BertConfig

# load config file: https://huggingface.co/transformers/v2.9.1/main_classes/configuration.html
# https://huggingface.co/transformers/v2.9.1/main_classes/configuration.html#transformers.PretrainedConfig.from_pretrained

# load model
# The saved config.json already stores the id2label/label2id mappings used at
# training time, so they are not passed again here. Overriding them with this
# session's in-memory dicts would mislabel predictions whenever those dicts
# differ from the ones the model was trained with.
model = DistilBertForSequenceClassification.from_pretrained(path+"pretrained_model/", local_files_only=True)

# classifier pipeline
# https://discuss.huggingface.co/t/i-have-trained-my-classifier-now-how-do-i-do-predictions/3625/2


pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
# outputs a list of dicts like [[{'label': 'NEGATIVE', 'score': 0.0001223755971295759},  {'label': 'POSITIVE', 'score': 0.9998776316642761}]]


loading configuration file /content/xdrive/MyDrive/qcat_huggingface/pretrained_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "correlation",
    "1": "causal_explanation",
    "2": "boolean_and",
    "3": "mode",
    "4": "superlative",
    "5": "opinion",
    "6": "set_difference",
    "7": "set_retrieval",
    "8": "datetime_retrieval",
    "9": "datetime_comparison",
    "10": "median",
    "11": "definitional",
    "12": "boolean_retrieval",
    "13": "qualitative_property_retrieval",
    "14": "qualitative_property_multihop_retrieval",
    "15": "range",
    "16": "numeric_comparison",
    "17": "numeric_retrieval",
    "18": "arithmetic",
    "19": "mathematical_comparison",
    "20": "set_property_satisfaction",
    "21": "counting",
 

In [None]:
# Classify a single question. The pipeline was built with return_all_scores=True,
# so the result lists a score for every label rather than only the top one.
answers = pipe("when was Elon Musk born?")
answers

# prediction_logits = pipe("I love this movie!")

# # you can softmax the logits first, but it makes no difference when producing a single label
# prediction = argmax(prediction_logits, axis=1)
# predicted_label = int_to_label[p]
# predicted_label

[[{'label': 'correlation', 'score': 0.00019069426343776286},
  {'label': 'causal_explanation', 'score': 0.0006956280558370054},
  {'label': 'boolean_and', 'score': 0.00017970586486626416},
  {'label': 'mode', 'score': 0.00011937258386751637},
  {'label': 'superlative', 'score': 0.00047503397217951715},
  {'label': 'opinion', 'score': 0.00020326283993199468},
  {'label': 'set_difference', 'score': 0.0001147850343841128},
  {'label': 'set_retrieval', 'score': 0.00012628895638044924},
  {'label': 'datetime_retrieval', 'score': 0.9931279420852661},
  {'label': 'datetime_comparison', 'score': 0.00048031439655460417},
  {'label': 'median', 'score': 0.00015297251229640096},
  {'label': 'definitional', 'score': 0.00013775295519735664},
  {'label': 'boolean_retrieval', 'score': 0.0002684938081074506},
  {'label': 'qualitative_property_retrieval', 'score': 0.0007812971016392112},
  {'label': 'qualitative_property_multihop_retrieval',
   'score': 0.0003690363955684006},
  {'label': 'range', 'scor