# Text Classification Using XLNet and 20-Newsgroup Dataset

Install dependencies

In [None]:
!pip install -q tensorflow_gpu

In [None]:
!pip install -q transformers

In [None]:
!pip install -q sklearn

In [None]:
!pip install -q matplotlib

Import dependencies

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, TFXLNetLMHeadModel, TFXLNetForMultipleChoice
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

Check for an available GPU. It is always good to confirm that CUDA is set up properly.

In [None]:
# List the GPUs TensorFlow can see, using the stable (non-experimental) API.
tf.config.list_physical_devices('GPU')

In [None]:
# gpu_device_name() returns an empty string when no GPU is available, so fall
# back to a readable message in that case.
device_name = tf.test.gpu_device_name()
print(device_name or "GPU not found")

Create train, val, and test sets

In [None]:
# Strip headers, footers and quotes: this is the recommended setting for text
# classification because it removes noise from the actual text.
train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)
# Number of documents in each subset, one shape per line.
print(train.filenames.shape, test.filenames.shape, sep='\n')

Subsample the data with stratified splits: this is an easy way to reduce the number of documents while keeping the class proportions intact.

In [None]:
# Keep a stratified 10% of the training documents. The 90% "test" side of this
# split is bound to val_data/val_class only temporarily; the next split
# reassigns both names.
train_data, val_data, train_class, val_class = train_test_split(
    train.data,
    train.target,
    test_size=.90,
    stratify=train.target,
)

In [None]:
# Carve a stratified 25% validation set out of the subsampled training data.
train_data, val_data, train_class, val_class = train_test_split(
    train_data,
    train_class,
    test_size=.25,
    stratify=train_class,
)

In [None]:
# Keep a stratified 5% of the test documents; the remaining 95% is discarded.
test_data, _, test_class, _ = train_test_split(
    test.data,
    test.target,
    test_size=.95,
    stratify=test.target,
)

Look at the samples. We can see that the excess headers, subject lines, and emails have been removed.

Now we can use only the text to create embeddings

In [None]:
# Sizes of the train, validation and test subsets, one per line.
print(len(train_data), len(val_data), len(test_data), sep='\n')

Check that the proportions are indeed similar across train and val

In [None]:
# Draw one bar series per split (train first, then val) on the same axes so
# the class distributions can be compared visually.
for labels in (train_class, val_class):
    unique, counts = np.unique(labels, return_counts=True)
    plt.bar(unique, counts)

plt.title('Class Frequency')
plt.xlabel('Class')
plt.ylabel('Frequency')

plt.show()

Define function for padding.

In [None]:
#         if key == "token_type_ids":
#             print(arr[:, :-1])
#             arr = tf.concat([arr[:, :-1], tf.zeros([1,missing_pad_len], tf.int32)], 1)
#             new_class_tf.constant([[2]], dtype=tf.int32)
#             arr = tf.concat([arr, tf.constant([[2]], dtype=tf.int32)])
#         else:

In [None]:
def pad_data(arr, max_len, pad_value=0):
    """Right-pad a list so that it has ``max_len`` entries.

    Args:
        arr: List of per-token values (for example input ids or mask bits).
        max_len: Target length of the returned list.
        pad_value: Value appended to fill the gap. Defaults to 0, which
            matches the previous hard-coded behaviour.

    Returns:
        A new list made of ``arr`` followed by enough copies of
        ``pad_value`` to reach ``max_len``. If ``arr`` already holds
        ``max_len`` or more items it is returned as an unpadded copy; it is
        never truncated.
    """
    # A non-positive gap yields an empty padding list, so long inputs pass
    # through unchanged.
    missing_pad_len = max_len - len(arr)
    return arr + [pad_value] * missing_pad_len

In [None]:
def to_tensor(arr, dtype=tf.int32):
    """Convert ``arr`` to a TensorFlow tensor and squeeze it.

    Args:
        arr: Nested list (or other array-like) to convert.
        dtype: TensorFlow dtype of the result; defaults to ``tf.int32``.

    Returns:
        The converted tensor after ``tf.squeeze``, i.e. with size-1 axes
        removed.
    """
    as_tensor = tf.convert_to_tensor(arr, dtype=dtype)
    return tf.squeeze(as_tensor)

In [None]:
def map_to_dict(input_ids, attention_mask, token_type_ids):
    """Bundle the three encoder inputs into a dict keyed by argument name.

    Returns:
        A dict with the keys ``"input_ids"``, ``"attention_mask"`` and
        ``"token_type_ids"``, in that order, mapped to the given values.
    """
    return dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

Define function for encoding data into the correct format for XLNet.

In [None]:
def encode_data(data, max_len, batch_size):
    """Tokenize every document in ``data`` into fixed-length XLNet inputs.

    Each document is encoded with the ``xlnet-base-cased`` tokenizer,
    truncated to ``max_len`` tokens and then right-padded with zeros via
    ``pad_data`` so that every row has exactly ``max_len`` entries.

    Args:
        data: Iterable of raw text documents.
        max_len: Maximum (and padded) sequence length per document.
        batch_size: Accepted for interface compatibility only. No grouping
            is done here; every document contributes one row, so batching
            has to be applied by whatever consumes the returned tensors.

    Returns:
        A list ``[input_ids, attention_mask, token_type_ids]`` of tensors
        built with ``to_tensor``. Each one has a row per document and
        ``max_len`` columns (``to_tensor`` squeezes size-1 axes, so a single
        document yields a 1-D tensor).
    """
    # NOTE(review): do_lower_case=True lowercases the text even though the
    # checkpoint is the cased model -- confirm this is intended.
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

    input_ids = []
    token_type_ids = []
    attention_mask = []
    for news in data:
        encoded_data = tokenizer.encode_plus(news,
                                             add_special_tokens=True,
                                             max_length=max_len,
                                             truncation=True,
                                             return_token_type_ids=True,
                                             return_attention_mask=True)

        # Pad manually so the padding is appended to the end of each vector.
        # Every document is appended; none are skipped.
        input_ids.append(pad_data(encoded_data['input_ids'], max_len))
        token_type_ids.append(pad_data(encoded_data['token_type_ids'], max_len))
        attention_mask.append(pad_data(encoded_data['attention_mask'], max_len))

    return [to_tensor(input_ids), to_tensor(attention_mask), to_tensor(token_type_ids)]

In [None]:
XLNet_max_len = 300
batch_size = 4  # batch size and max len taken from https://github.com/zihangdai/xlnet#:~:text=For%20the%20best%20performance%2C%20XLNet,with%20GPUs%20is%20quite%20difficult.
encoded_train = encode_data(train_data, XLNet_max_len, batch_size)
encoded_val = encode_data(val_data, XLNet_max_len, batch_size)
# Encode the stratified test subsample (test_data), not the full test.data,
# so the test set matches the reduced train/val sets.
encoded_test = encode_data(test_data, XLNet_max_len, batch_size)

In [None]:
# Inspect the list of encoded training tensors returned by encode_data
# (input ids, attention mask, token type ids).
print(encoded_train)

Create the fine-tuning function

In [None]:
def fine_tune_model(train_enc, val_enc):
    """Run the pretrained XLNet language model on the encoded training data.

    Despite its name, this function performs no training yet: it loads
    ``xlnet-base-cased`` with a language-modelling head and does a single
    forward pass over the training inputs.

    Args:
        train_enc: Sequence whose items 0, 1 and 2 are the input ids,
            attention mask and token type ids, respectively.
        val_enc: Encoded validation data; currently unused.

    Returns:
        Whatever the model call returns for the training inputs (the
        forward-pass output, not the model object).
    """
    # Planned hyperparameters noted by the author but not wired in yet:
    # l_rate = 3e-5, n_epochs = 2.
    xlnet = TFXLNetLMHeadModel.from_pretrained('xlnet-base-cased')  # language model (LM) head only

    forward_kwargs = {
        "input_ids": train_enc[0],
        "attention_mask": train_enc[1],
        "token_type_ids": train_enc[2],
    }
    return xlnet(**forward_kwargs)

In [None]:
# Run fine_tune_model on the encoded train/val inputs. Note that `model` ends
# up holding that function's return value, which is the result of calling the
# XLNet model on the training inputs.
model = fine_tune_model(encoded_train, encoded_val)