# Imports

The reference is based on the following HuggingFace implementation

https://huggingface.co/docs/transformers/v4.18.0/en/model_doc/xlnet#transformers.XLNetForSequenceClassification

In [1]:
pip install -q datasets

In [2]:
pip install -q transformers

In [3]:
pip install -q Sentencepiece

In [4]:
import numpy as np
import pandas as pd

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
import torch
import sentencepiece
from transformers import XLNetTokenizer, XLNetForSequenceClassification

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased",num_labels = 2)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

'LABEL_1'

In [None]:
"""inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]"""

In [11]:
train = pd.read_csv('train.tsv',sep='\t')
valid = pd.read_csv('dev.tsv',sep='\t')
test = pd.read_csv('test.tsv',sep='\t')

In [12]:
#from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#tokenize dataset
def tokenize(t):
  return tokenizer(t, truncation=True, max_length=512, padding='max_length')

def tokenize_df(df):
  tokens = df['comment_text'].map(tokenize)
  df['input_ids'] = [x['input_ids'] for x in tokens]
  df['attention_mask'] = [x['attention_mask'] for x in tokens]
  df['token_type_ids'] = [x['token_type_ids'] for x in tokens]
  return df

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [13]:
train = tokenize_df(train)
valid = tokenize_df(valid)
test = tokenize_df(test)

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# XLNET - Trainer

https://www.kaggle.com/code/harshpraharaj98/text-classification-using-bert-and-xlnet/notebook

In [15]:
import tempfile
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq

In [16]:
import datasets
from torch.utils.data import DataLoader, Dataset
table_train = pa.table({'labels': (list(train["label"])), 'input_ids':list(train["input_ids"]), 'attention_mask':list(train["attention_mask"]),'token_type_ids':list(train["token_type_ids"]) })
table_validate = pa.table({'labels': list(valid["label"]),'input_ids':list(valid["input_ids"]), 'attention_mask':list(valid["attention_mask"]),'token_type_ids':list(valid["token_type_ids"])})
table_test = pa.table({'labels': list(test["label"]),'input_ids':list(test["input_ids"]), 'attention_mask':list(test["attention_mask"]),'token_type_ids':list(test["token_type_ids"])})

#training = datasets.DatasetDict({"labels":list(train_df["labels"]), "text": list(train_df["text"])})
training = datasets.Dataset(table_train)
valid = datasets.Dataset(table_validate)
test = datasets.Dataset(table_test)

MyDataset = datasets.DatasetDict({"train":training,"validation":valid, "test" : test})

In [17]:
from transformers import TrainingArguments
training_args = TrainingArguments(output_dir="test_trainer")

In [18]:
#Load the metric
from datasets import load_metric
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


#Bring in the trainer
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [19]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

In [20]:
x = MyDataset["train"]
train_small = x.shuffle(seed=1002).select([i for i in range(1000)])
y = MyDataset["validation"]
valid_small = y.select([i for i in list(range(100))])

In [21]:
from transformers import get_scheduler

num_epochs = 2
num_training_steps = num_epochs * len(train_small)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_small,
    eval_dataset=valid_small,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    optimizers=(optimizer, lr_scheduler)
)

In [25]:
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.725806,0.48
2,No log,0.716683,0.48
3,No log,0.695919,0.48


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=375, training_loss=0.7066698404947916, metrics={'train_runtime': 1502.4394, 'train_samples_per_second': 1.997, 'train_steps_per_second': 0.25, 'total_flos': 854640838656000.0, 'train_loss': 0.7066698404947916, 'epoch': 3.0})

In [26]:
predictions = trainer.predict(test)
print(predictions.predictions.shape, predictions.label_ids.shape)

***** Running Prediction *****
  Num examples = 872
  Batch size = 8


(872, 2) (872,)


In [27]:
import sklearn
pred_final = np.argmax(predictions.predictions, axis=-1)
gt = np.array(MyDataset["test"]["labels"])
acc = sklearn.metrics.accuracy_score(gt,pred_final)

In [28]:
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

print("Validation Accuracy is",sklearn.metrics.accuracy_score(gt,pred_final))

print(classification_report(gt, pred_final))
print("\n\n")
print(confusion_matrix(gt, pred_final))

Validation Accuracy is 0.5091743119266054
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       428
           1       0.51      1.00      0.67       444

    accuracy                           0.51       872
   macro avg       0.25      0.50      0.34       872
weighted avg       0.26      0.51      0.34       872




[[  0 428]
 [  0 444]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
