In [None]:
# Install transformers library.
!pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions.
!pip install -q git+https://github.com/gmihaila/ml_things.git

In [None]:
import io
import os
import re
import torch
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (XLNetConfig, 
                          XLNetForSequenceClassification, 
                          XLNetTokenizer, AdamW, 
                          get_linear_schedule_with_warmup,
                          set_seed,
                          )

# Seed PyTorch's random number generator so that repeated runs of the
# notebook are comparable with each other.
seed = 123
torch.manual_seed(seed)

# Number of full passes over the training set.
epochs = 3
# Number of examples per batch handed to the dataloaders.
batch_size = 50
# Maximum number of tokens per example; longer texts are truncated.
max_length = 512

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Directory holding the pretrained model and tokenizer files.
model_name_or_path = '../input/my-indo-xlnet-model'

# Mapping from the label strings in the CSV files to integer class ids.
labels_ids = {'negatif': 0, 'positif': 1}
n_labels = len(labels_ids)

In [None]:
def clean_text(line):
    """Reduce a raw comment to lowercase a-z words separated by single spaces.

    Processing order:
      1. every character that is not a word character, whitespace or a
         hyphen becomes a space, and the text is lowercased;
      2. ASCII digits become spaces;
      3. everything that is still not a-z or whitespace is deleted;
      4. runs of whitespace collapse into one space.

    Leading/trailing whitespace is collapsed but not stripped, so the result
    may start or end with a single space.

    Args:
        line: Text to clean. `None` and float NaN (what pandas produces for
            an empty CSV cell) are treated as empty text; any other
            non-string value is converted with `str()`.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise make `re.sub` raise a TypeError.
    if line is None or (isinstance(line, float) and line != line):
        return ""
    if not isinstance(line, str):
        line = str(line)

    # cleaning wild char except maybe a repetition word (hyphen is kept here)
    cleanline = re.sub(r"[^\w\s\-]", " ", line).lower()

    # cleaning number
    cleanline = re.sub(r"[0-9]", " ", cleanline)

    # cleaning non indonesian character
    # The previous pattern `[^(a-z)+\s{1}]` was a malformed character class:
    # inside `[...]` the characters `(`, `)`, `+`, `{`, `1`, `}` are literals.
    # None of them can reach this point (steps above already replaced them),
    # so `[^a-z\s]` matches exactly the same characters.
    # NOTE(review): this pass also deletes the hyphen kept by the first step,
    # so "anak-anak" becomes "anakanak" -- confirm that this is intended.
    cleanline = re.sub(r"[^a-z\s]", "", cleanline)

    # cleaning whitespaces
    cleanline = re.sub(r"\s+", " ", cleanline)
    return cleanline

In [None]:
class CommentsDataset(Dataset):
    """Tokenized, labelled comments loaded from a CSV file.

    The CSV is read with pandas and must provide a `text` column and a
    `label` column. Each text goes through `clean_text`, each label is
    translated with `labels_ids`, and the whole corpus is tokenized once in
    the constructor so that indexing afterwards is a plain tensor lookup.

    Args:
        path: Location of the CSV file to read.
        use_tokenizer: Tokenizer called on the list of cleaned texts; its
            result must be a mapping of tensors that contains `input_ids`.
        labels_ids: Mapping from the label values found in the CSV to
            integer class ids.
        max_sequence_len: Truncation length handed to the tokenizer. When
            `None`, the tokenizer's own maximum length is used.

    Raises:
        ValueError: If `path` is not an existing file.
        KeyError: If a required column is missing or a label is not a key
            of `labels_ids`.
    """

    def __init__(self, path, use_tokenizer, labels_ids, max_sequence_len=None):
        # The data comes from a single CSV file, so a directory is not valid.
        if not os.path.isfile(path):
            raise ValueError('Invalid `path` variable! Needs to be an existing CSV file')

        # Fall back to the tokenizer's limit when no length is given.
        # `model_max_length` is the current attribute name; `max_len` is the
        # legacy name and is only consulted if the former is unavailable.
        if max_sequence_len is None:
            max_sequence_len = getattr(use_tokenizer, 'model_max_length', None)
            if max_sequence_len is None:
                max_sequence_len = use_tokenizer.max_len

        print('Reading partitions...')
        df = pd.read_csv(path, on_bad_lines='error')
        # Iterate over the columns directly; this avoids building a Series
        # per row as `iterrows` does.
        texts = [clean_text(text) for text in df['text']]
        labels = [labels_ids[label] for label in df['label']]

        # Number of examples.
        self.n_examples = len(labels)

        # Tokenize every text in one call. This can take a while.
        print('Using tokenizer on all texts. This can take a while...')
        self.inputs = use_tokenizer(
            texts,
            add_special_tokens=True,
            truncation=True,
            padding=True,
            return_tensors='pt',
            max_length=max_sequence_len
        )
        # Length every example ended up with after padding/truncation.
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)

        # Store the labels next to the tokenizer output so one lookup
        # returns everything for an example.
        self.inputs.update({'labels': torch.tensor(labels)})
        print('Finished!\n')

    def __len__(self):
        """Return the number of examples read from the CSV."""
        return self.n_examples

    def __getitem__(self, item):
        """Return a dict with entry `item` of every stored tensor."""
        return {key: self.inputs[key][item] for key in self.inputs.keys()}



def train(dataloader, optimizer_, scheduler_, device_):
    """Run one training pass over `dataloader` with the notebook-level `model`.

    For every batch the loss is back-propagated, gradients are clipped to a
    norm of 1.0, and the supplied optimizer and scheduler each take one step.

    Args:
        dataloader: Iterable of batches supporting `len()`. Each batch is a
            dict of tensors that includes a `labels` entry; the dict is
            passed to the model as keyword arguments.
        optimizer_: Optimizer whose `step()` is called once per batch.
        scheduler_: Learning-rate scheduler whose `step()` is called once
            per batch, after the optimizer.
        device_: Device the batch tensors are moved to.

    Returns:
        Tuple `(true_labels, predictions_labels, avg_epoch_loss)`: two flat
        lists with one entry per example, and the mean batch loss.

    Raises:
        ZeroDivisionError: If `dataloader` contains no batches.
    """
    # The model is defined at notebook level and only read here.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    # Total loss for this epoch.
    total_loss = 0

    # Put the model into training mode.
    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):

        # Keep the reference labels (still on the CPU) for later evaluation.
        true_labels += batch['labels'].numpy().flatten().tolist()

        # Cast every tensor to long and move it to the target device.
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        outputs = model(**batch)

        # The first two outputs are the loss and the logits.
        loss, logits = outputs[:2]

        total_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before updating the weights.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Step the optimizer and scheduler that were passed in. The previous
        # code reached for the globals `optimizer`/`scheduler`, silently
        # ignoring these arguments.
        optimizer_.step()
        scheduler_.step()

        logits = logits.detach().cpu().numpy()

        # Predicted class is the index of the largest logit.
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Mean loss over all batches.
    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss



def validation(dataloader, device_):
    """Evaluate the notebook-level `model` on every batch of `dataloader`.

    The model is switched to evaluation mode and run without gradient
    tracking; no weights are updated.

    Args:
        dataloader: Iterable of batches supporting `len()`. Each batch is a
            dict of tensors that includes a `labels` entry; the dict is
            passed to the model as keyword arguments.
        device_: Device the batch tensors are moved to.

    Returns:
        Tuple `(true_labels, predictions_labels, avg_epoch_loss)`: two flat
        lists with one entry per example, and the mean batch loss.

    Raises:
        ZeroDivisionError: If `dataloader` contains no batches (same
            behaviour as `train`).
    """
    # The model is defined at notebook level and only read here.
    global model

    # Tracking variables.
    predictions_labels = []
    true_labels = []
    # Total loss for this pass.
    total_loss = 0

    # Put the model in evaluation mode -- the dropout layers behave
    # differently during evaluation.
    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Keep the reference labels (still on the CPU).
        true_labels += batch['labels'].numpy().flatten().tolist()

        # Cast every tensor to long and move it to the target device.
        batch = {k: v.type(torch.long).to(device_) for k, v in batch.items()}

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

            # The first two outputs are the loss and the logits.
            loss, logits = outputs[:2]

            logits = logits.detach().cpu().numpy()

            total_loss += loss.item()

            # Predicted class is the index of the largest logit.
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    # Mean loss over all batches. This used to sit inside the loop, which
    # recomputed it for every batch and left the name unbound when the
    # dataloader was empty.
    avg_epoch_loss = total_loss / len(dataloader)

    return true_labels, predictions_labels, avg_epoch_loss

In [None]:
# Configuration comes first: it carries the number of target labels that the
# classification model is built with.
print('Loading configuraiton...')
model_config = XLNetConfig.from_pretrained(model_name_or_path, num_labels=n_labels)

# Tokenizer stored alongside the pretrained model.
print('Loading tokenizer...')
tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path)

# Sequence-classification model, built with the configuration above.
print('Loading model...')
model = XLNetForSequenceClassification.from_pretrained(
    model_name_or_path,
    config=model_config,
)

# Move the model to the device chosen in the configuration cell.
model.to(device)
print('Model loaded to `%s`' % device)

In [None]:
def _load_split(split_name, title, csv_path, shuffle):
    """Create the dataset and dataloader for one data split.

    Uses the notebook-level `tokenizer`, `labels_ids`, `max_length` and
    `batch_size`, and prints the same progress messages for every split.

    Args:
        split_name: Prefix used in the progress messages (e.g. 'train').
        title: Human-readable split name for the opening message.
        csv_path: CSV file handed to `CommentsDataset`.
        shuffle: Whether the dataloader shuffles the examples.

    Returns:
        Tuple `(dataset, dataloader)`.
    """
    print('Dealing with %s...' % title)
    # Create pytorch dataset.
    dataset = CommentsDataset(path=csv_path,
                              use_tokenizer=tokenizer,
                              labels_ids=labels_ids,
                              max_sequence_len=max_length)
    print('Created `%s_dataset` with %d examples!' % (split_name, len(dataset)))

    # Move pytorch dataset into dataloader.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    print('Created `%s_dataloader` with %d batches!' % (split_name, len(dataloader)))
    return dataset, dataloader


# Only the training split is shuffled; validation and test keep file order.
train_dataset, train_dataloader = _load_split(
    'train', 'Train', '../input/comments/train.csv', shuffle=True)

print()

valid_dataset, valid_dataloader = _load_split(
    'valid', 'Validation', '../input/comments/valid.csv', shuffle=False)

print()

test_dataset, test_dataloader = _load_split(
    'test', 'Test', '../input/comments/test.csv', shuffle=False)

In [None]:
# AdamW is Adam with decoupled weight decay. The PyTorch implementation is
# used here because the `AdamW` class shipped with transformers is deprecated
# in favour of `torch.optim.AdamW`. Both share the same default betas.
optimizer = torch.optim.AdamW(model.parameters(),
                              weight_decay = 0.01,
                              lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                              eps = 1e-6 # args.adam_epsilon  - default is 1e-8.
                              )

# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives
# us the number of batches.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler: linear schedule with no warmup steps,
# configured for `total_steps` training steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Per-epoch history of loss and accuracy, used for the plots below.
all_loss = {
    'train_loss': [],
    'val_loss': [],
}
all_acc = {
    'train_acc': [],
    'val_acc': [],
}

# One iteration per epoch: train, validate, report, record.
print('Epoch')
for epoch in tqdm(range(epochs)):
    print()

    # Full pass over the training set (updates the model).
    print('Training on batches...')
    train_labels, train_predict, train_loss = train(
        train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)

    # Full pass over the validation set (no weight updates).
    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    # Report this epoch's metrics so progress can be followed.
    print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
    print()

    # Record the metrics for the learning curves.
    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

# Learning curves: loss first, then accuracy.
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value',
          use_linestyles=['-', '--'], magnify=0.1)

plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value',
          use_linestyles=['-', '--'], magnify=0.1)

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
output_dir = 'my-model'
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Run the model over the held-out test split.
true_labels, predictions_labels, avg_epoch_loss = validation(test_dataloader, device)

# Class ids and their display names, in the order defined by `labels_ids`.
class_ids = list(labels_ids.values())
class_names = list(labels_ids.keys())

# Build and show the per-class evaluation report.
evaluation_report = classification_report(
    true_labels,
    predictions_labels,
    labels=class_ids,
    target_names=class_names,
)
print(evaluation_report)

# Normalized confusion matrix; the trailing `;` hides the returned object.
plot_confusion_matrix(
    y_true=true_labels,
    y_pred=predictions_labels,
    classes=list(labels_ids.keys()),
    normalize=True,
    magnify=0.1,
);