In [None]:
# !pip install transformers -q
# !pip install tensorflow-addons -q
# !pip install sentencepiece -q

[K     |████████████████████████████████| 2.6 MB 9.0 MB/s 
[K     |████████████████████████████████| 636 kB 68.7 MB/s 
[K     |████████████████████████████████| 3.3 MB 54.0 MB/s 
[K     |████████████████████████████████| 895 kB 63.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 8.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 7.5 MB/s 
[?25h

In [None]:
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
from tqdm import tqdm, trange

import tensorflow as tf

import torch
from torch.nn import BCEWithLogitsLoss
from sklearn import preprocessing
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

from transformers import AdamW, XLNetTokenizer, TFXLNetModel
import tensorflow_addons as tfa

# Require a TensorFlow-visible GPU at '/device:GPU:0'; abort the notebook early otherwise.
DEVICE_NAME = tf.test.gpu_device_name()
if DEVICE_NAME != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(DEVICE_NAME))

# PyTorch device handle (falls back to CPU when CUDA is unavailable) and GPU count.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
N_GPU = torch.cuda.device_count()
# NOTE(review): the returned name is discarded (this is not the cell's last expression),
# and the call is not covered by the CPU fallback above — confirm it is still needed.
torch.cuda.get_device_name(0)

# Mapping from stance name to integer id, and the number of stance classes.
LABEL_IDS = {'FAVOUR': 0, 'AGAINST': 1, 'NEUTRAL': 2}
N_LABELS = len(LABEL_IDS)

Found GPU at: /device:GPU:0


In [None]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128.
# A much smaller batch size (2) is used here to avoid memory issues.
BATCH_SIZE = 2
NUM_EPOCHS = 4
# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL = 'xlnet-large-cased'

## XL-Net Model

In [None]:
def XLNetForMultiLabelClassification(model, max_len=512):
    """Build and compile a Keras classifier on top of a pretrained XLNet encoder.

    The network takes a batch of token ids, runs them through the XLNet main
    block, and feeds the hidden state of the last sequence position through
    dropout and a single sigmoid unit.

    Args:
        model: Name or path of the pretrained checkpoint passed to
            ``TFXLNetModel.from_pretrained``.
        max_len: Fixed length of the token-id sequences the model accepts.
            It must equal the length the inputs were padded/truncated to
            (``get_inputs`` uses the same default of 512).

    Returns:
        A compiled ``tf.keras.Model`` whose single input is named
        ``word_inputs`` and whose single output is named ``outputs``.
    """
    # Define token ids as inputs
    word_inputs = tf.keras.Input(shape=(max_len,), name='word_inputs', dtype='int32')

    # Call XLNet model and keep element 0 of its output.
    # NOTE(review): only token ids are fed in; no attention mask is passed, so
    # padding positions are not masked — confirm this is acceptable.
    xlnet = TFXLNetModel.from_pretrained(model)
    xlnet_encodings = xlnet(word_inputs)[0]

    # CLASSIFICATION HEAD
    # Collect the last step of the encoder output.
    # Assumes the CLS token sits at the final position — TODO confirm padding side.
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    # Apply dropout for regularization
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
    # Final output
    # NOTE(review): a single sigmoid unit with binary cross-entropy only fits
    # 0/1 labels, while LABEL_IDS in this notebook lists three stances — confirm.
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)

    # Compile model (a distinct local name avoids shadowing the `model` argument)
    classifier = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.Precision(),
            tf.keras.metrics.Recall(),
            tfa.metrics.F1Score(num_classes=1, average='macro', threshold=0.5),
        ],
    )
    return classifier

def get_inputs(tweets, tokenizer, max_len=512):
    """Encode texts into fixed-length arrays using the tokenizer provided.

    Every text is encoded with special tokens, truncated to ``max_len`` and
    padded up to ``max_len``.

    Args:
        tweets: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids``, ``attention_mask`` and ``token_type_ids`` entries.
        max_len: Target sequence length for every encoded text.

    Returns:
        Tuple ``(input_ids, attn_masks, token_type_ids)`` of numpy arrays, each
        built from the corresponding entry of every encoding.
    """
    encodings = [
        tokenizer.encode_plus(
            tweet,
            max_length=max_len,
            # Explicit padding/truncation replaces the deprecated
            # `pad_to_max_length=True` and silences the "Truncation was not
            # explicitly activated" warning.
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
        )
        for tweet in tweets
    ]
    input_ids = np.array([encoding['input_ids'] for encoding in encodings])
    attn_masks = np.array([encoding['attention_mask'] for encoding in encodings])
    token_type_ids = np.array([encoding['token_type_ids'] for encoding in encodings])
    return input_ids, attn_masks, token_type_ids

def warmup(epoch, lr):
    """Learning-rate schedule function handed to ``LearningRateScheduler``.

    Adds 1e-6 to the current learning rate and never returns less than 2e-5.
    ``epoch`` is accepted but not used.
    """
    step, floor = 1e-6, 2e-5
    candidate = lr + step
    return floor if floor > candidate else candidate

# Callbacks passed to `fit`.
callbacks = [
    # Stop when validation accuracy has not improved by at least 0.02 for
    # 4 epochs, restoring the best weights seen.
    # NOTE(review): with NUM_EPOCHS = 4 a patience of 4 looks unreachable — confirm.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        min_delta=0.02,
        patience=4,
        restore_best_weights=True,
    ),
    # Recompute the learning rate with `warmup`.
    tf.keras.callbacks.LearningRateScheduler(warmup, verbose=0),
    # Scale the learning rate by `factor` (bounded below by `min_lr`) when
    # validation accuracy plateaus for 2 epochs.
    # NOTE(review): `warmup` never returns less than 2e-5, which may undo this
    # reduction — verify the intended interaction.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_accuracy',
        mode='auto',
        factor=1e-6,
        patience=2,
        min_delta=0.001,
        cooldown=0,
        min_lr=1e-6,
        verbose=0,
    ),
]

In [None]:
# Load the pretrained tokenizer for the MODEL checkpoint.
# NOTE(review): do_lower_case=True is passed although MODEL names a cased checkpoint — confirm this is intended.
XLNet_tokenizer = XLNetTokenizer.from_pretrained(MODEL, do_lower_case=True)

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/761 [00:00<?, ?B/s]

In [None]:
# Build the compiled classifier for the MODEL checkpoint and print its layer summary.
xlnet_model = XLNetForMultiLabelClassification(MODEL)
xlnet_model.summary()

Downloading:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

Some layers from the model checkpoint at xlnet-large-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFXLNetModel were initialized from the model checkpoint at xlnet-large-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_inputs (InputLayer)     [(None, 512)]             0         
_________________________________________________________________
tfxl_net_model (TFXLNetModel TFXLNetModelOutput(last_h 360268800 
________

### Load data

In [None]:
# Read the dataset (expects `tweet` and `stance` columns)
stance_df = pd.read_csv('labelled_stance_data.csv')

# Encoder that turns the stance labels into integer ids.
labelEncoder = preprocessing.LabelEncoder()

tweets = stance_df.tweet.values.tolist()
# NOTE(review): ids are assigned by LabelEncoder rather than taken from LABEL_IDS —
# verify the two mappings agree wherever LABEL_IDS is relied on.
stance_labels = labelEncoder.fit_transform(stance_df['stance'])

In [None]:
# Train - test split: hold out 20% of the tweets for testing, with a fixed seed for reproducibility
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweets, stance_labels, test_size=0.2, random_state=7)

### Train the model

In [None]:
# Tokenize the training tweets, then hold out 15% of them as a validation set.
# NOTE(review): only the token ids and labels are split; train_masks / train_token_ids
# keep their pre-split rows and are not used by the model.
train_data, train_masks, train_token_ids = get_inputs(train_tweets, XLNet_tokenizer)
train_data, valid_data, train_labels, valid_labels = train_test_split(train_data, train_labels, test_size=0.15, random_state = 7)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Fine-tune the model with the validation split supplied; the returned history feeds the plot further down.
hist = xlnet_model.fit(x=train_data, y=train_labels, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, callbacks=callbacks, validation_data=(valid_data, valid_labels))

Epoch 1/4

In [None]:
# Pull the recorded loss / accuracy series for training and validation out of the fit history.
train_loss = hist.history['loss']
val_loss   = hist.history['val_loss']
train_acc  = hist.history['accuracy']
val_acc    = hist.history['val_accuracy']

In [None]:
plt.style.use('seaborn')

# Plot training vs. validation accuracy per epoch.
# `figsize` is expressed in inches, not pixels: (800, 500) would request an
# impossibly large canvas, so use an 8 x 5 inch figure instead.
plt.figure(figsize=(8, 5))
plt.plot(train_acc, color='green', label='Training Accuracy')
plt.plot(val_acc, color='red', label='Validation Accuracy')
plt.title('Training vs Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Testing

In [None]:
# Tokenize the held-out test tweets with the same helper and tokenizer used for training.
test_data, test_masks, test_token_type_ids = get_inputs(test_tweets, XLNet_tokenizer)

In [None]:
# Model outputs for the test set; these are sigmoid scores, not hard labels.
pred_labels = xlnet_model.predict(test_data, verbose=True)

In [None]:
# Threshold the scores at 0.5 to get hard 0/1 predictions and compare them with the true labels.
# NOTE(review): 0/1 predictions can never match a label id of 2 — confirm the labels are binary.
print('Testing Accuracy:', accuracy_score(test_labels, np.array(pred_labels.flatten() >= .5, dtype='int')))

In [None]:
# Persist the trained weights to xlnet_stance.h5.
xlnet_model.save_weights("xlnet_stance.h5")