In [None]:
#| hide
import pandas as pd
import torch

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

import pandas as pd
import os
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder
#import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

from transformers import AutoTokenizer
import matplotlib.pyplot as plt
from dvclive import Live
from dvclive.huggingface import DVCLiveCallback

# Training Model

In [None]:
# Root directory of the pre-split data; the loading cell below reads the
# train/, test/ and val/ CSV files from beneath it.
cleaned_data = '../data/splits'

## Preprocessing

### Define Parameters

In [None]:
# Fold number; selects which FAA-{KFOLD}.csv file is read from each split directory.
KFOLD: int = 1
# Hugging Face checkpoint name used to load the tokenizer.
TOKENIZER: str = "bert-base-cased"
# Learning rate handed to TrainingArguments.
LEARNING_RATE: float = 5e-5
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE: int = 8
# Number of training epochs.
EPOCHS: int = 2

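Read the train, test, and validation CSV files for the selected fold (`KFOLD`) into a single dataset dictionary keyed by split name.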

In [None]:
# Each split lives in its own sub-directory of `cleaned_data`, with one CSV per
# fold. Build the split -> [file] mapping once, then load all three splits together.
split_files = {
    split: [f'{cleaned_data}/{split}/FAA-{KFOLD}.csv']
    for split in ('train', 'test', 'val')
}
raw_datasets = load_dataset("csv", data_files=split_files)

In [None]:
# Name of the pretrained checkpoint (same value as TOKENIZER).
model_nm = "bert-base-cased"

Create tokenizer

In [None]:
# Load the pretrained tokenizer for the checkpoint named by TOKENIZER.
tokz = AutoTokenizer.from_pretrained(TOKENIZER)

Tokenize inputs

In [None]:
def tok_func(x):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here. Padding every row to the maximum length at tokenization time
    makes the ``DataCollatorWithPadding`` passed to the ``Trainer`` a no-op and
    forces every batch to run at full length. Leaving padding to the collator
    pads each batch only up to its own longest sequence.

    Args:
        x: A batch of examples supplied by ``Dataset.map(..., batched=True)``;
            must provide a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch, which ``map`` merges into the
        dataset as new columns.
    """
    return tokz(x["text"], truncation=True)


# batched=True hands tok_func many rows per call instead of one at a time.
tokenized_datasets = raw_datasets.map(tok_func, batched=True)

Define datasets for training

In [None]:
# Pull the three tokenized splits out by name. Note that `full_eval_dataset`
# is the "test" split and `full_val_dataset` is the "val" split.
full_train_dataset, full_eval_dataset, full_val_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "test", "val")
)

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch with the tokenizer when the
# batch is assembled; it is handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokz)

In [None]:
import numpy as np
import evaluate

# Accuracy metric object from the `evaluate` library; reused on every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is
            reduced with an argmax over axis 1, so it is presumably a
            per-class score array of shape (n_examples, n_classes) -- TODO confirm.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train and Evaluate Model

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

In [None]:
# Size of the classification head (number of target classes).
NUM_LABELS = 7

# Load the checkpoint named by `model_nm` rather than by the TOKENIZER
# constant: both hold the same checkpoint name, but `model_nm` is the
# variable dedicated to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm,
    num_labels=NUM_LABELS,
)

In [None]:
# Hyperparameters and checkpointing policy for the run.
training_args = TrainingArguments(
    output_dir="../output/",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch. Both strategies are kept
    # identical because load_best_model_at_end needs a saved checkpoint for
    # every evaluation it compares.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    # The per-epoch evaluation decides which checkpoint is reloaded at the end
    # (load_best_model_at_end=True), so it must run on the validation split.
    # Using the test split here (`full_eval_dataset`) would leak test data
    # into model selection. Score the test split explicitly after training,
    # e.g. with trainer.evaluate(full_eval_dataset).
    eval_dataset=full_val_dataset,
    tokenizer=tokz,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the object returned by train() is kept in `history`.
history = trainer.train()

Save model

In [None]:
# Write the trainer's current model to ../model/. Because training ran with
# load_best_model_at_end=True, this is the best checkpoint, not necessarily the last.
trainer.save_model("../model/")