In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import datetime
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so use whichever name this install
# actually provides (falling back to the historical one).
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)


# pick the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device in use: {} \n'.format(device))

# run configuration
batch_size = 16    # used for Dataset.map batching and the train/eval batch size
n_epochs = 3       # passed to TrainingArguments as num_train_epochs
num_labels = 5     # number of outputs requested for the classification head
chunk_size = 300   # maximum number of words per text chunk (see chunk_text)
         

# loading the poems; engine='python' is kept exactly as in the original run
# NOTE(review): the original comment justified the python engine by file size,
# but pandas documents the default C engine as the faster one -- confirm
# whether the python engine is needed here for parsing reasons.
poem_train_df = pd.read_csv(r'./poems.csv', engine='python')

# distinct values of the Category column, in order of first appearance
unique_values = poem_train_df['Category'].unique()

# map each category value to a numerical label (its index in unique_values)
class_mapping = {value: index for index, value in enumerate(unique_values)}

# fail early with a readable message when the data holds more categories than
# the classification head will have outputs; otherwise the mismatch only
# surfaces later as an out-of-range label error inside the training loop
if len(class_mapping) > num_labels:
    raise ValueError(
        'Found {} distinct categories in the data but num_labels is {}'.format(
            len(class_mapping), num_labels))

# replace the Category values with their numerical labels
poem_train_df['Category'] = poem_train_df['Category'].map(class_mapping)


def chunk_text(input_string, chunk_size):
    '''Split a string into consecutive pieces of at most chunk_size words.

    Words are whatever str.split() yields (whitespace-separated) and each
    piece is re-joined with single spaces. The limit is counted in words
    rather than tokens: by the original author's estimate 512 tokens is
    roughly 380 words, so a smaller chunk_size leaves some space to account
    for special tokens.

    Args:
        input_string: the text to split. None or a float NaN (what pandas
            yields for an empty CSV cell) is treated as containing no text.
        chunk_size: maximum number of words per piece; must be positive.

    Returns:
        A list of strings in their original order; empty when there are no
        words to chunk.

    Raises:
        ValueError: if chunk_size is zero or negative.
    '''

    if chunk_size <= 0:
        raise ValueError(
            'chunk_size must be a positive integer, got {}'.format(chunk_size))

    # NaN is the only float that is not equal to itself; catching it here
    # avoids an AttributeError from calling .split() on a missing value.
    is_nan = isinstance(input_string, float) and input_string != input_string
    if input_string is None or is_nan:
        return []

    words = input_string.split()
    return [' '.join(words[start:start + chunk_size])
            for start in range(0, len(words), chunk_size)]


# creating a new dataframe with the chunked text: one record per chunk, each
# carrying the Category of the poem it came from. Iterating the two needed
# columns directly avoids building a Series per row as iterrows() does.
chunk_records = [
    {'Poem': chunk, 'Category': category}
    for poem, category in zip(poem_train_df['Poem'], poem_train_df['Category'])
    for chunk in chunk_text(poem, chunk_size)
]

# naming the columns keeps them present even if no chunks were produced
chunked_poem_train_df = pd.DataFrame(chunk_records, columns=['Poem', 'Category'])
    
    

# shuffle the rows reproducibly (fixed random_state), rebuild the index, and
# rename the columns: 'text' is the key tokenize() reads, and 'label' is, per
# the original author, the name the HF trainer expects
hf_column_names = {'Poem': 'text', 'Category': 'label'}
chunked_poem_train_df = (
    chunked_poem_train_df
    .sample(frac=1, random_state=666)
    .reset_index(drop=True)
    .rename(columns=hf_column_names)
)


# Sizes observed on the original run: len(poem_train_df) was 25000 and
# len(chunked_poem_train_df) was 29639 after chunking.

# checkpoint shared by the tokenizer and the model so that the two match
pretrained_model_name = 'intfloat/multilingual-e5-small'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# load the pretrained weights with a num_labels-way classification head, then
# move the model onto the selected device
transformer = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
transformer = transformer.to(device)


# context size as reported by the model config; tokenize() pads/truncates to it
max_context_length = transformer.config.max_position_embeddings

def tokenize(batch):
    '''Tokenize the 'text' entries of a batch with the module-level tokenizer.

    padding='max_length' pads every sequence up to max_context_length (not
    just to the longest string in the batch), and truncation=True cuts off
    anything longer than that, just in case.
    '''
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_context_length,
    }
    return tokenizer(batch['text'], **encoding_options)


# wrap the dataframe in a HF Dataset and tokenize it batch by batch
full_set = Dataset.from_pandas(chunked_poem_train_df)
tokenized_dataset = full_set.map(tokenize, batched=True, batch_size=batch_size)

# 80/20 train/eval split by position; the rows were already shuffled above.
# Dataset.select is documented as taking a list of indices, but per the
# original author's note it apparently works with range() too.
n_examples = len(full_set)
split_index = int(0.8 * n_examples)
train_set = tokenized_dataset.select(range(split_index))
eval_set = tokenized_dataset.select(range(split_index, n_examples))

In [None]:
def compute_metrics(pred):
    '''Return accuracy and weighted F1 for one evaluation pass.

    pred must expose label_ids (the reference labels) and predictions (the
    per-class scores); the predicted class is the argmax over the last axis.
    '''
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }



# One shared interval for logging, evaluation and checkpointing: a tenth of
# len(train_set) // batch_size (about ten times per epoch on a single device).
# The max(1, ...) guard matters for small training sets (fewer than
# 10 * batch_size rows), where the integer division alone would give 0.
logging_steps = max(1, len(train_set) // batch_size // 10)

# eval_steps and save_steps are kept equal so that every evaluation has a
# matching checkpoint for load_best_model_at_end to restore.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args  = TrainingArguments(output_dir= r'./checkpoints',
                                    num_train_epochs=n_epochs,
                                    learning_rate=1e-5,
                                    warmup_steps=1000,
                                    lr_scheduler_type='linear',
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    load_best_model_at_end=True,
                                    metric_for_best_model='f1',
                                    weight_decay=0.01,
                                    evaluation_strategy='steps',
                                    disable_tqdm=False,
                                    logging_steps=logging_steps,
                                    eval_steps=logging_steps,
                                    save_steps=logging_steps,
                                    log_level='error',
                                    report_to='tensorboard')





# early stopping with a patience of 3 (presumably counted in evaluation
# rounds of the metric named in training_args -- confirm against the HF docs)
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# wire the model, arguments, data splits and metric function together
trainer = Trainer(
    model=transformer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=eval_set,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)



# print wall-clock timestamps around the run so its duration can be read off
print('start time is: {} \n'.format(datetime.datetime.now()))


trainer.train()


print('Finish time is: {} \n'.format(datetime.datetime.now()))