In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append("../../")
import os
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.eval.classification import eval_classification
from utils_nlp.common.timer import Timer
from utils_nlp.models.xlnet.common import Language, Tokenizer
from utils_nlp.models.xlnet.sequence_classification import XLNetSequenceClassifier

In [None]:
# --- Paths -------------------------------------------------------------
# Directory handed to load_pandas_df for the dataset files.
DATA_FOLDER = "../../../temp"
# Directory handed to the classifier as its cache_dir.
CACHE_DIR = "../../../temp"

# --- Dataset columns ---------------------------------------------------
TEXT_COL = "sentence1"   # column holding the input text
LABEL_COL = "genre"      # column holding the class to predict

# --- Model / tokenization ----------------------------------------------
LANGUAGE = Language.ENGLISHCASED
MAX_SEQ_LENGTH = 384     # passed to tokenizer.preprocess_classification_tokens

# --- Training loop -----------------------------------------------------
TRAIN_SIZE = 0.6         # fraction of rows given to the train split
NUM_EPOCHS = 1
BATCH_SIZE = 8           # used for both fit and predict
NUM_GPUS = 0             # forwarded as num_gpus to fit and predict

# --- Optimizer settings ------------------------------------------------
# NOTE(review): these four values are not passed to any call visible in
# this notebook; confirm whether the classifier is expected to receive them.
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.0
ADAM_EPSILON = 1e-8
WARMUP_STEPS = 0

# When True, the data-loading cell keeps only a small random subset of rows.
DEBUG = True

In [None]:
# Load the MultiNLI "train" split into a DataFrame.
df = load_pandas_df(DATA_FOLDER, "train")
# Keep only rows whose gold label is "neutral". Per the original note this is
# meant to leave unique sentences (presumably each sentence otherwise appears
# once per gold label -- TODO confirm against the dataset).
df = df[df["gold_label"] == "neutral"]

if DEBUG:
    # Work on a small subset for a quick run.
    # - The size is capped at the number of available rows, so this cannot
    #   raise when fewer than DEBUG_SAMPLE_SIZE rows survive the filter.
    # - A fixed seed makes Restart & Run All select the same rows every time.
    DEBUG_SAMPLE_SIZE = 100
    DEBUG_SEED = 42
    df = df.sample(n=min(DEBUG_SAMPLE_SIZE, len(df)), random_state=DEBUG_SEED)

In [None]:
# Split the rows into train and test partitions. The seed is fixed so that a
# fresh kernel reproduces the same partition (and therefore the same metrics).
SPLIT_SEED = 42
df_train, df_test = train_test_split(
    df, train_size=TRAIN_SIZE, random_state=SPLIT_SEED
)

# Encode the string labels as integer ids.
# The encoder is fit on the full label column rather than on the train split
# only: otherwise a label that happens to land exclusively in the test split
# makes `transform` raise ValueError (easy to hit with the small DEBUG sample).
label_encoder = LabelEncoder()
label_encoder.fit(df[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])
label_list = label_encoder.classes_

# Number of output classes = size of the label vocabulary known to the encoder.
num_labels = len(label_list)

In [None]:
# Replace each split DataFrame with a plain Python list of its text column,
# which is what the tokenizer cell consumes.
# NOTE: the names are rebound from DataFrame to list, so this cell cannot be
# re-run on its own; re-run the split cell first.
df_train = df_train[TEXT_COL].tolist()
df_test = df_test[TEXT_COL].tolist()

In [None]:
# Build the tokenizer for the configured language/model variant.
tokenizer = Tokenizer(LANGUAGE)

# Convert the raw sentences of each split into the three arrays the
# classifier's fit/predict calls take: token ids, input mask, segment ids.
train_input_ids, train_input_mask, train_segment_ids = (
    tokenizer.preprocess_classification_tokens(df_train, MAX_SEQ_LENGTH)
)
test_input_ids, test_input_mask, test_segment_ids = (
    tokenizer.preprocess_classification_tokens(df_test, MAX_SEQ_LENGTH)
)

In [None]:
# Instantiate the XLNet sequence classifier with one output per encoded label;
# CACHE_DIR is passed through as the classifier's cache directory.
classifier = XLNetSequenceClassifier(
    language=LANGUAGE,
    num_labels=num_labels,
    cache_dir=CACHE_DIR,
)

In [None]:
# Fine-tune the classifier on the training split and time the whole call.
with Timer() as t:
    classifier.fit(
        # model inputs produced by the tokenizer cell
        token_ids=train_input_ids,
        input_mask=train_input_mask,
        token_type_ids=train_segment_ids,
        labels=labels_train,
        # run settings from the configuration cell
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        num_gpus=NUM_GPUS,
        verbose=True,
    )

# t.interval is divided by 3600 to report the elapsed time in hours.
print("[Training time: {:.3f} hrs]".format(t.interval / 3600))

In [None]:
# Run the fine-tuned classifier over the test split.
# probabilities=False is requested; the result is compared directly against
# the encoded test labels in the report cell.
preds = classifier.predict(
    token_ids=test_input_ids,
    input_mask=test_input_mask,
    token_type_ids=test_segment_ids,
    batch_size=BATCH_SIZE,
    num_gpus=NUM_GPUS,
    probabilities=False,
)

In [None]:
# Display the predictions returned by classifier.predict for the test split.
preds

In [None]:
# Display the encoded ground-truth labels of the test split for comparison.
labels_test

In [None]:
# Per-class precision / recall / F1 on the test split.
# `labels` is passed explicitly so the report rows always line up with
# `target_names`: without it, scikit-learn infers the label set from the values
# present in labels_test/preds and raises ValueError when a class is missing
# from both (likely with the small DEBUG sample and a briefly trained model).
print(
    classification_report(
        labels_test,
        preds,
        labels=np.arange(len(label_encoder.classes_)),
        target_names=label_encoder.classes_,
    )
)