In [None]:
!pip install transformers
!pip install sentencepiece

In [None]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
import re
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import defaultdict
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn.utils.rnn import pad_sequence


# --- Run configuration ------------------------------------------------------
SEED = 42        # seed shared by the global NumPy and PyTorch generators
BATCH_SIZE = 5   # examples per optimisation step
MAX_LEN = 500    # token length every essay is truncated / padded to
EPOCHS = 5       # full passes over the training split

# Seed the global generators so that weight initialisation, dropout and any
# sampling that falls back on NumPy's global RNG repeat on "Restart & Run All".
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
class EssaysDataset(Dataset):
    """Map-style dataset that turns raw essays into fixed-length model inputs.

    Items are tokenised on access; nothing is cached.

    Args:
        texts: Indexable collection of essays (each item is coerced with ``str``).
        target: Indexable collection of integer class labels aligned with ``texts``.
        max_len: Token length of every returned sequence; longer essays are
            truncated and shorter ones padded by the tokenizer.
        tokenizer: Callable Hugging Face-style tokenizer. It must accept the
            keyword arguments used in ``__getitem__`` and return a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of shape
            ``(1, max_len)``.
    """

    def __init__(self, texts, target, max_len, tokenizer):
        self.texts = texts
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of essays."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenise one essay.

        Returns:
            dict with
              * ``'text'``: the essay as ``str``,
              * ``'input_ids'``: int64 tensor, shape ``(1, max_len)``
                (the leading singleton dimension is kept for existing callers),
              * ``'attention_mask'``: int64 tensor, shape ``(max_len,)``,
              * ``'target'``: scalar int64 tensor holding the label.
        """
        text = str(self.texts[item])
        target = self.target[item]

        # Let the tokenizer pad to `max_len` itself. It then uses its own pad
        # token and its configured padding side. Hugging Face documents XLNet
        # as left-padded, which keeps the final position (the one the
        # sequence-classification head summarises by default) on a real token
        # rather than on padding. Padding on the right with zeros, as an
        # external `pad_sequences(..., padding="post")` does, breaks both.
        encoding = self.tokenizer(text,
                                  add_special_tokens=True,
                                  max_length=self.max_len,
                                  truncation=True,
                                  padding='max_length',
                                  return_token_type_ids=False,
                                  return_attention_mask=True,
                                  return_tensors='pt')

        # `.long()` is a no-op for int64 tensors; it only guards against a
        # tokenizer that hands back a narrower integer dtype.
        input_ids = encoding['input_ids'].long()
        attention_mask = encoding['attention_mask'].long()

        return {'text': text,
                'input_ids': input_ids,
                'attention_mask': attention_mask.flatten(),
                'target': torch.tensor(target, dtype=torch.long)}


def create_data_loader(df, target, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap one split of the essays frame in a ``DataLoader``.

    Args:
        df: DataFrame with a ``'TEXT'`` column and a column named ``target``.
        target: Name of the label column to predict.
        tokenizer: Tokenizer forwarded to ``EssaysDataset``.
        max_len: Token length forwarded to ``EssaysDataset``.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the examples on every pass. Defaults to ``False``,
            which keeps the frame's row order (the previous behaviour); pass
            ``True`` for a training split.
        num_workers: Worker processes used for loading. Defaults to 2.

    Returns:
        A ``DataLoader`` over an ``EssaysDataset`` built from ``df``.
    """
    ds = EssaysDataset(texts=df['TEXT'].to_numpy(),
                       target=df[target].to_numpy(),
                       max_len=max_len,
                       tokenizer=tokenizer)

    return DataLoader(ds,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      num_workers=num_workers)


def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose ``.loss``
            and ``.logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'target'`` tensors).
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Unused; accepted for backward compatibility. The example
            count is taken from the batches themselves.

    Returns:
        ``(accuracy, mean_loss)`` as floats, both averaged per *example* so a
        smaller final batch is not over-weighted. ``(nan, nan)`` when the
        loader yields no batches.
    """
    model = model.train()
    n_correct, n_seen, loss_sum = 0, 0, 0.0

    for d in data_loader:
        # Flatten to (batch, seq_len) using the tensor's own last dimension, so
        # both (B, 1, L) and (B, L) inputs work and no global length is needed.
        input_ids = d["input_ids"]
        input_ids = input_ids.reshape((-1, input_ids.shape[-1])).to(device)
        attention_mask = d["attention_mask"].to(device)
        target = d["target"].to(device)

        outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=target)
        loss, logits = outputs.loss, outputs.logits

        batch_len = target.shape[0]
        prediction = logits.argmax(dim=1)
        n_correct += (prediction == target).sum().item()
        n_seen += batch_len
        # assumes `outputs.loss` is the mean over the batch — TODO confirm for
        # custom models; multiplying back gives a per-example running sum.
        loss_sum += loss.item() * batch_len

        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.5)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if n_seen == 0:
        return float("nan"), float("nan")

    return n_correct / n_seen, loss_sum / n_seen


def eval_model(model, data_loader, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose ``.loss``
            and ``.logits``.
        data_loader: Iterable of batches (dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'target'`` tensors).
        device: Device the batch tensors are moved to.
        n_examples: Unused; accepted for backward compatibility. The example
            count is taken from the batches themselves.

    Returns:
        ``(accuracy, mean_loss, f1)`` as floats. Accuracy and binary F1 are
        computed once over the predictions of the whole loader, and the loss is
        averaged per example. Averaging per-batch F1 scores does not give the
        dataset F1 and is undefined for batches without positive labels or
        predictions. ``(nan, nan, nan)`` when the loader yields no batches.
    """
    model = model.eval()
    loss_sum, n_seen = 0.0, 0
    all_targets, all_predictions = [], []

    with torch.no_grad():
        for d in data_loader:
            # Flatten to (batch, seq_len) using the tensor's own last dimension,
            # so both (B, 1, L) and (B, L) inputs work.
            input_ids = d["input_ids"]
            input_ids = input_ids.reshape((-1, input_ids.shape[-1])).to(device)
            attention_mask = d["attention_mask"].to(device)
            target = d["target"].to(device)

            outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=target)
            loss, logits = outputs.loss, outputs.logits

            batch_len = target.shape[0]
            all_targets.append(target.cpu().numpy())
            all_predictions.append(logits.argmax(dim=1).cpu().numpy())
            # assumes `outputs.loss` is the mean over the batch — TODO confirm
            # for custom models.
            loss_sum += loss.item() * batch_len
            n_seen += batch_len

    if n_seen == 0:
        return float("nan"), float("nan"), float("nan")

    y_true = np.concatenate(all_targets)
    y_pred = np.concatenate(all_predictions)

    accuracy = float((y_true == y_pred).mean())
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return accuracy, loss_sum / n_seen, float(f1)


In [None]:
# Expected columns: AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
df = pd.read_csv('datasets/essays.csv', encoding='latin-1')

traits = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
transformation = {'n': 0, 'y': 1}
# Recode the 'n'/'y' flags of every trait column to 0/1.
df = df.replace({trait: transformation for trait in traits})

# Fixed seed so the train/val/test membership is identical on every run.
SPLIT_SEED = 42

# 10% of the rows are held out for testing; 10% of the remainder for validation.
df_train = df.sample(frac=0.9, random_state=SPLIT_SEED)
df_test = df.drop(df_train.index)
df_val = df_train.sample(frac=0.1, random_state=SPLIT_SEED)
df_train = df_train.drop(df_val.index)

# The three splits must partition the frame.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

In [None]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# One loader per split, built in train / val / test order. All three predict
# the same label column, the trait at index 4 of `traits`.
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(split_df, traits[4], tokenizer, MAX_LEN, BATCH_SIZE)
    for split_df in (df_train, df_val, df_test)
)

In [None]:
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model = model.to(device)

# Split the parameters into a weight-decayed group and an exempt group
# (biases and layer-norm weights). Patterns are matched as substrings of the
# parameter name. The Hugging Face XLNet implementation names its
# normalisation modules `layer_norm`, so that spelling must be listed for the
# exemption to take effect; the BERT-style `LayerNorm.*` spellings are kept
# for compatibility.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
# PyTorch's AdamW replaces the deprecated `transformers.AdamW`; eps=1e-6 keeps
# the epsilon that the transformers implementation used by default.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-6)

# The scheduler is stepped once per batch, so the decay spans every batch of
# every epoch.
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Per-epoch metrics, keyed by name.
history = defaultdict(list)
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model,
                                        train_data_loader,
                                        optimizer,
                                        device,
                                        scheduler,
                                        len(df_train))
    print(f'Train loss {train_loss} Train accuracy {train_acc}')

    val_acc, val_loss, val_f1 = eval_model(model,
                                           val_data_loader,
                                           device,
                                           len(df_val))
    print(f'Valid loss {val_loss:.4f} Valid accuracy {val_acc:.4f} Valid F1 {val_f1:.4f}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)
    history['val_f1'].append(val_f1)

# Final check on the held-out test split, using the weights from the last epoch.
test_acc, test_loss, test_f1 = eval_model(model,
                                          test_data_loader,
                                          device,
                                          len(df_test))
print(f'Test loss {test_loss:.4f} Test accuracy {test_acc:.4f} Test F1 {test_f1:.4f}')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.bias', 'sequence_summary.summary.weight', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Epoch 1/5
----------
Train loss 0.685509972423315 Train accuracy 0.5631666666666666


  _warn_prf(average, modifier, msg_start, len(result))


Valid loss 0.6915 Valid accuracy 0.5733 Valid F1 0.2096

Epoch 2/5
----------
Train loss 0.6552249873802066 Train accuracy 0.6116666666666667
Valid loss 0.7199 Valid accuracy 0.6311 Valid F1 0.3638

Epoch 3/5
----------
Train loss 0.6046022529155016 Train accuracy 0.6771666666666667
Valid loss 0.8549 Valid accuracy 0.6044 Valid F1 0.3527

Epoch 4/5
----------
Train loss 0.5216176130063832 Train accuracy 0.7501666666666666
Valid loss 1.2500 Valid accuracy 0.5956 Valid F1 0.3423

Epoch 5/5
----------
Train loss 0.46042319727595893 Train accuracy 0.8008333333333333
Valid loss 1.1861 Valid accuracy 0.5933 Valid F1 0.4589



  _warn_prf(average, modifier, msg_start, len(result))


Test loss 1.0094 Test accuracy 0.6360 Test F1 0.4882
