In [None]:
import transformers
from transformers import TFXLNetModel,XLNetTokenizer, XLNetModel, AdamW, get_linear_schedule_with_warmup
import torch
import re
import sentencepiece as spm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from collections import defaultdict
#from textwrap import wrap
from pylab import rcParams
import tensorflow as tf
from torch import nn, optim
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

In [None]:
# Checkpoint name used for both the tokenizer here and the encoder built by create_xlnet.
pretrained_xlnet = 'xlnet-base-cased'
# The tokenizer is loaded from the same checkpoint name that is later passed to create_xlnet.
tokenizer = XLNetTokenizer.from_pretrained(pretrained_xlnet)

def create_xlnet(mname, max_len=120):
    """Build and compile a binary classifier on top of a pretrained TF XLNet encoder.

    The model takes a single int32 tensor of token ids, runs it through
    ``TFXLNetModel``, keeps the encoding at the last sequence position, applies
    dropout and finishes with one sigmoid unit.

    Args:
        mname: Checkpoint name or path handed to ``TFXLNetModel.from_pretrained``.
        max_len: Fixed length of the token-id input. Defaults to 120, the value
            that used to be hard-coded; it has to match the length the inputs
            are padded to.

    Returns:
        A compiled ``tf.keras.Model`` using Adam (learning rate 2e-5) and
        binary cross-entropy, reporting accuracy, precision and recall.
    """
    word_inputs = tf.keras.Input(shape=(max_len,), name='word_inputs', dtype='int32')
    xlnet = TFXLNetModel.from_pretrained(mname)
    # Element 0 of the encoder output is indexed below as (batch, position, feature).
    # NOTE(review): only token ids are fed to the encoder; no attention mask is
    # passed, so padded positions are presumably attended to — confirm intended.
    xlnet_encodings = xlnet(word_inputs)[0]
    # Keep the encoding at the last position only; the length-1 slice is squeezed
    # away so the result is (batch, feature).
    # NOTE(review): presumably relies on the tokenizer placing the classification
    # token last (left padding) — confirm against the tokenizer settings.
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(doc_encoding)
    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated
        # alias that newer Keras releases warn about or reject.
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
    )

    return model

# Build the compiled classifier from the checkpoint named above; later cells
# summarise, fit and predict with this object.
xlnet = create_xlnet(pretrained_xlnet)


In [None]:
# Show the layer-by-layer summary of the compiled model before training.
xlnet.summary()

In [None]:
# Hold out 15% of the data; the fixed random_state keeps the split repeatable.
# NOTE(review): `tweets` and `labels` are not defined in the cells shown here —
# confirm they are created earlier so the notebook survives Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(tweets, labels, test_size=0.15, random_state=196)
# Cast the training labels to float32 for the sigmoid / binary_crossentropy head.
# NOTE(review): y_test is left uncast — confirm it is converted before it is used.
y_train = np.asarray(y_train).astype(np.float32)

In [None]:
def get_inputs(tweets, tokenizer, max_len=120):
    """Encode every tweet with ``tokenizer.encode_plus`` and stack the results.

    Args:
        tweets: Iterable of texts to encode.
        tokenizer: Object exposing ``encode_plus``; each call must return a
            mapping with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Value forwarded as ``max_length``; padding to that length is
            requested via ``pad_to_max_length=True``.

    Returns:
        A 3-tuple of numpy arrays, one row per tweet, in this order:
        token ids, attention masks, token type ids.
        NOTE(review): rows are only uniformly ``max_len`` long if the tokenizer
        also truncates longer texts — confirm for the installed version.
    """
    encoded = []
    for tweet in tweets:
        encoded.append(
            tokenizer.encode_plus(
                tweet,
                max_length=max_len,
                pad_to_max_length=True,
                add_special_tokens=True,
            )
        )

    def _stack(field):
        # Gather one field from every encoding into a single array.
        return np.array([enc[field] for enc in encoded])

    token_ids = _stack('input_ids')
    attention_masks = _stack('attention_mask')
    token_type_ids = _stack('token_type_ids')
    return token_ids, attention_masks, token_type_ids

# Encode the training tweets with get_inputs' default max_len of 120.
# NOTE: `ids` receives the attention masks (not token ids) and `segments` the
# token-type ids; only `inp_tok` is passed to the model in the cells below.
inp_tok, ids, segments = get_inputs(X_train, tokenizer)

In [None]:
# Callbacks handed to xlnet.fit: early stopping plus two learning-rate controls.
callbacks = [
    # Stop when val_accuracy has not improved by at least 0.02 for 4 epochs and
    # restore the best weights seen.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=4,
        min_delta=0.02,
        restore_best_weights=True,
    ),
    # Per-epoch learning-rate schedule driven by `warmup`.
    # NOTE(review): `warmup` is not defined in the cells shown here — confirm it
    # exists earlier in the notebook. A schedule that ignores the current rate
    # would also override the plateau reduction below; verify.
    tf.keras.callbacks.LearningRateScheduler(
        warmup,
        verbose=0,
    ),
    # Shrink the learning rate after 2 epochs without val_accuracy improvement.
    # NOTE(review): factor=1e-6 multiplies the rate by one millionth, so with
    # min_lr=1e-6 any reduction lands straight on the floor — confirm this
    # factor is intended (values like 0.1-0.5 are more typical).
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_accuracy',
        factor=1e-6,
        patience=2,
        verbose=0,
        mode='auto',
        min_delta=0.001,
        cooldown=0,
        min_lr=1e-6,
    ),
]

In [None]:
# Fine-tune on the encoded training tweets; a quarter of the training arrays is
# carved off by Keras as validation data, which the callbacks monitor.
hist = xlnet.fit(
    x=inp_tok,
    y=y_train,
    batch_size=16,
    epochs=15,
    validation_split=.25,
    callbacks=callbacks,
)

In [None]:
# Run inference with the fitted model.
# NOTE(review): this scores `inp_tok`, i.e. the training inputs (including the
# rows Keras held out for validation); X_test is not encoded in the cells shown
# — confirm whether predictions on the held-out split were intended.
pred = xlnet.predict(inp_tok, verbose=True)