In [17]:
import os

# Set before the third-party imports below so the flag is already in the
# environment when those libraries load.
# NOTE(review): presumably a workaround for a duplicate-OpenMP-runtime abort —
# confirm it is still required on the target machine.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from sklearn.model_selection import train_test_split
from datasets import load_dataset
from datasets import load_metric
import pandas as pd
import numpy as np
import ipywidgets
import evaluate
import torch

# Root folder of the input data, resolved against the current working directory:
# the notebook has to be started from the folder that contains `data/`.
file_path = f'{os.getcwd()}/data'

from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

In [33]:
# Hub id of the checkpoint whose tokenizer is loaded here.
MODEL_CHECKPOINT = 'lexlms/roberta-base-uncased'

# NOTE(review): `lower=True` is passed through as an extra keyword argument;
# confirm that this tokenizer actually honours it (it may be silently ignored).
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, lower=True)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is called with ``padding='max_length'`` and
    ``truncation=True``, and whatever it returns is handed back unchanged.

    Args:
        examples: mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for ``examples["text"]``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# One JSON file per split; passing a {split name: path} mapping makes
# `load_dataset` return a DatasetDict with matching "train" / "test" entries.
# The two files have to exist already (the original note says the split is
# produced beforehand with sklearn's train_test_split).
data_files = dict(
    train=f'{file_path}/json/QAZoningTrain.json',
    test=f'{file_path}/json/QAZoningTest.json',
)
print(data_files)

QA_dataset = load_dataset('json', data_files=data_files)
print(QA_dataset)

Downloading: 100%|██████████| 385/385 [00:00<00:00, 516kB/s]
Downloading: 100%|██████████| 2.18M/2.18M [00:00<00:00, 18.3MB/s]
Downloading: 100%|██████████| 167/167 [00:00<00:00, 249kB/s]
loading file tokenizer.json from cache at /home/jesusaur/.cache/huggingface/hub/models--lexlms--roberta-base-uncased/snapshots/098511e0b42988cf6b882d3828feab9f58f0e0b7/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/jesusaur/.cache/huggingface/hub/models--lexlms--roberta-base-uncased/snapshots/098511e0b42988cf6b882d3828feab9f58f0e0b7/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/jesusaur/.cache/huggingface/hub/models--lexlms--roberta-base-uncased/snapshots/098511e0b42988cf6b882d3828feab9f58f0e0b7/tokenizer_config.json
Using custom data configuration default-f147e92c9891d68a
Found cached dataset json (/home/jesusaur/.cache/huggingface/datasets/json/default-f147e92c9891d68a/0.0.0/e6070c77f18f01a5ad

{'train': '/data/user/home/jesusaur/cs662-qa-land-dev-law-sys/programs/data/json/QAZoningTrain.json', 'test': '/data/user/home/jesusaur/cs662-qa-land-dev-law-sys/programs/data/json/QAZoningTest.json'}


100%|██████████| 2/2 [00:00<00:00, 903.36it/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 955
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 107
    })
})





In [37]:
# Tokenize every split; `batched=True` hands `preprocess_function` a batch of
# examples per call instead of one example at a time.
tokenized_data = QA_dataset.map(preprocess_function, batched=True)

# Sequence-classification model with 48 output classes.
# NOTE(review): the answer mapping defined later in this notebook uses ids 1-47,
# so class 0 looks unused — confirm against the label column of the data.
model = AutoModelForSequenceClassification.from_pretrained('lexlms/roberta-base-uncased', num_labels=48)

# Metrics consumed by `compute_metrics`: metric1 -> F1, metric2 -> accuracy.
metric1 = evaluate.load('f1')
metric2 = evaluate.load('accuracy')

# All training options in one place, expanded into TrainingArguments below.
# NOTE(review): evaluation and saving are both set to "epoch", so `eval_steps`
# is presumably not used — confirm against the TrainingArguments documentation.
training_options = dict(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    do_train=True,
    do_eval=True,
    learning_rate=1e-5,
    logging_steps=50,
    eval_steps=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=25,
    weight_decay=0.001,
)
training_args = TrainingArguments(**training_options)

def compute_metrics(eval_pred):
    """Convert an evaluation result into accuracy and macro-averaged F1.

    Args:
        eval_pred: two-item sequence ``(logits, labels)``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    class_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score along the last axis.
    predicted_labels = np.argmax(class_scores, axis=-1)
    shared_inputs = {"predictions": predicted_labels, "references": gold_labels}

    # metric1 is the F1 metric, metric2 the accuracy metric (module-level objects).
    f1_result = metric1.compute(**shared_inputs, average='macro')
    accuracy_result = metric2.compute(**shared_inputs)

    return {"accuracy": accuracy_result['accuracy'], "f1": f1_result['f1']}

# Wire model, arguments, data splits and metric callback into one Trainer.
# The "test" split doubles as the evaluation set used during and after training.
train_split = tokenized_data["train"]
eval_split = tokenized_data["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

Loading cached processed dataset at /home/jesusaur/.cache/huggingface/datasets/json/default-f147e92c9891d68a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-7145b3aacefc0b95.arrow
Loading cached processed dataset at /home/jesusaur/.cache/huggingface/datasets/json/default-f147e92c9891d68a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-55903fc0f41c026c.arrow
loading configuration file config.json from cache at /home/jesusaur/.cache/huggingface/hub/models--lexlms--roberta-base-uncased/snapshots/098511e0b42988cf6b882d3828feab9f58f0e0b7/config.json
Model config RobertaConfig {
  "_name_or_path": "lexlms/roberta-base-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 1,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"

In [38]:
# Fine-tune the model with the Trainer configured above. The call is left as the
# last expression so the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 955
  Num Epochs = 25
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3000
  Number of trainable parameters = 124479792


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.9135,1.480998,0.607477,0.066866
2,1.2562,1.216446,0.672897,0.10779
3,1.0999,1.140747,0.691589,0.153503
4,0.9899,1.078076,0.719626,0.173363
5,0.9173,0.973338,0.757009,0.269557
6,0.8183,0.84029,0.831776,0.400444
7,0.6754,0.735515,0.841121,0.398667
8,0.6392,0.638531,0.850467,0.427619
9,0.5368,0.563233,0.88785,0.507692
10,0.4439,0.511818,0.897196,0.529012


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 107
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-120
Configuration saved in test_trainer/checkpoint-120/config.json
Model weights saved in test_trainer/checkpoint-120/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 107
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-240
Configuration saved in test_trainer/checkpoint-240/config.json
Model weights saved in test_t

Configuration saved in test_trainer/checkpoint-1920/config.json
Model weights saved in test_trainer/checkpoint-1920/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 107
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-2040
Configuration saved in test_trainer/checkpoint-2040/config.json
Model weights saved in test_trainer/checkpoint-2040/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 107
  Batch size = 8
Saving mod

TrainOutput(global_step=3000, training_loss=0.5392564558982849, metrics={'train_runtime': 1410.4466, 'train_samples_per_second': 16.927, 'train_steps_per_second': 2.127, 'total_flos': 6284370917376000.0, 'train_loss': 0.5392564558982849, 'epoch': 25.0})

In [39]:
# Score the model on the Trainer's eval_dataset (the "test" split passed at
# construction) and display the resulting metrics dict.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 107
  Batch size = 8


{'eval_loss': 0.2871874272823334,
 'eval_accuracy': 0.9439252336448598,
 'eval_f1': 0.6839506172839507,
 'eval_runtime': 1.9336,
 'eval_samples_per_second': 55.337,
 'eval_steps_per_second': 7.24,
 'epoch': 25.0}

In [65]:
# Every answer string the classifier can produce, in class-id order.
# NOTE(review): "['12 [u/acr_u]']" sits right after "['12 [du/acr_u]']" and looks
# like a misspelling of it, which would split one answer across two classes —
# confirm against the step that generated the labels.
_answer_classes = [
    # yes / no answers
    'No', 'Yes',
    # single value followed by a bracketed unit tag
    "['0 [ft_i]']", "['1 [du/acr_u]']", "['10 [ft_i]']", "['100 [ft_i]']",
    "['10000 [sft_i]']", "['12 [du/acr_u]']", "['12 [u/acr_u]']", "['125 [ft_i]']",
    "['15 [ft_i]']", "['150 [ft_i]']", "['2 [du/acr_u]']", "['20 [ft_i]']",
    "['20000 [sft_i]']", "['25 [ft_i]']", "['30 [ft_i]']", "['35 [ft_i]']",
    "['35000 [sft_i]']", "['4 [du/acr_u]']", "['40 [ft_i]']", "['5 [ft_i]']",
    "['50 [ft_i]']", "['6 [du/acr_u]']", "['60 [ft_i]']", "['6000 [sft_i]']",
    "['70 [ft_i]']", "['75 [ft_i]']", "['8 [du/acr_u]']", "['80 [ft_i]']",
    "['90 [ft_i]']",
    # lists of codes
    "['A1']", "['A2']",
    "['C1', 'C2', 'C3', 'C4', 'FI1', 'FI2', 'FI3']",
    "['C1', 'C2', 'C3', 'C4']",
    "['C2', 'C3', 'C4']",
    "['C3', 'C4']",
    "['C4']",
    "['FI1', 'FI2', 'FI3']",
    "['FI2', 'FI3']",
    "['FI3']",
    "['R1', 'R2', 'R3', 'C1', 'C2', 'C3', 'C4', 'FI1', 'FI2', 'FI3']",
    "['R1', 'R2', 'R3', 'C1', 'C2', 'C3', 'C4']",
    "['R1', 'R2', 'R3']",
    "['R2', 'R3']",
    "['R3']",
    # empty list
    '[]',
]

# Answer string -> integer class id. Ids are positional and start at 1, so id 0
# is not assigned to any answer.
conversion = {
    answer: class_id
    for class_id, answer in enumerate(_answer_classes, start=1)
}

In [76]:
def get_key(d, value):
    """Reverse lookup: collect every key of ``d`` whose value equals ``value``.

    Args:
        d: mapping to search.
        value: value to compare each entry against with ``==``.

    Returns:
        list of matching keys in the mapping's iteration order; empty when
        nothing matches.
    """
    matches = []
    for key in d:
        if d[key] == value:
            matches.append(key)
    return matches

In [78]:
# Run inference over the whole test split, then decode one example by hand.
prediction = trainer.predict(tokenized_data["test"])

example_idx = 4
example_scores = prediction[0][example_idx]  # per-class scores for this example
example_label = prediction[1][example_idx]   # class id stored with this example

# (question text, predicted answer(s), reference answer(s))
result = (
    tokenized_data["test"][example_idx]['text'],
    get_key(conversion, np.argmax(example_scores, axis=-1)),
    get_key(conversion, example_label),
)
result

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 107
  Batch size = 8


('What is the minimum lot size in the R2a zoning district?',
 ["['6000 [sft_i]']"],
 ["['10000 [sft_i]']"])

In [81]:
# Decode test example 2: (question text, predicted answer(s), reference answer(s)).
# prediction[0] holds the per-example class scores and prediction[1] the label ids.
prediction = trainer.predict(tokenized_data["test"])
results = (tokenized_data["test"][2]['text'], get_key(conversion, np.argmax(prediction[0][2], axis=-1)), 
          get_key(conversion, prediction[1][2]))
# Display the tuple built in this cell. `result` (singular) belongs to the
# index-4 cell; showing it here would print a stale value, or raise NameError on
# a fresh kernel if that cell has not been run.
results

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 107
  Batch size = 8


('Are research or testing laboratories allowed in a FI2 zoning district?',
 ['Yes'],
 ['Yes'])