In [1]:
!pip install transformers -U
!pip install datasets
!pip install nvidia-ml-py3 
!pip install humanize
!pip install torch -U 
!pip install transformers[sentencepiece]
!pip install -q git+https://github.com/gmihaila/ml_things.git
!pip list | grep -E 'transformers|tokenizers'

In [2]:
import os
import re
import math
import torch
import pandas as pd
import altair as alt
from pathlib import Path
from tqdm.notebook import tqdm
from datasets import load_dataset
from altair.utils.data import to_values
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (
    XLNetConfig,
    XLNetTokenizer,
    XLNetLMHeadModel,
    XLNetForSequenceClassification,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup,
    set_seed,
    DataCollatorForPermutationLanguageModeling,
    )

# Set the Weights & Biases environment switches (offline mode, integration
# disabled) in one step, before any training code runs.
os.environ.update({"WANDB_MODE": "offline", "WANDB_DISABLED": "true"})

In [3]:
class Pretrain:
    """Pre-train an XLNet language model on a plain-text corpus.

    The corpus is tokenized, packed into fixed-size token blocks and handed to
    a Hugging Face ``Trainer`` together with
    ``DataCollatorForPermutationLanguageModeling``.

    Args:
        file_path: Path of the text file used as training corpus.
        model_path: Location the ``XLNetTokenizer`` is loaded from.
        checkpoint: Checkpoint to continue from, or ``None`` to start from a
            freshly initialised ``XLNetLMHeadModel``.
        output_dir: Directory that receives checkpoints, the final model and
            the tokenizer.
        num_steps: Total number of optimisation steps (passed as ``max_steps``).
        block_size: Number of tokens per training example. Must be a positive
            even number because the permutation-LM collator rejects odd
            sequence lengths.

    Raises:
        ValueError: If ``block_size`` is not a positive even integer.
    """

    def __init__(self, file_path, model_path, checkpoint, output_dir, num_steps, block_size=32):
        if block_size <= 0 or block_size % 2 != 0:
            raise ValueError('`block_size` must be a positive even number, got %r' % (block_size,))

        self.file_path = file_path
        self.model_path = model_path
        self.checkpoint = checkpoint
        self.output_dir = output_dir
        self.num_steps = num_steps
        self.block_size = block_size

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Loaded lazily by `get_datacollator()` / `train()`.
        self.tokenizer = None

    def tokenize_function(self, examples):
        """Tokenize one batch of raw lines, skipping empty / whitespace-only ones."""
        lines = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
        return self.tokenizer(lines, truncation=True, max_length=512)

    def group_texts(self, examples):
        """Concatenate a tokenized batch and cut it into ``block_size`` chunks.

        Every column of ``examples`` (a mapping of column name to a list of
        token lists) is flattened and split into consecutive blocks of
        ``self.block_size`` tokens. The trailing remainder that does not fill
        a whole block is dropped. A ``labels`` column is added as a copy of
        ``input_ids``.
        """
        block_size = self.block_size
        column_names = list(examples.keys())

        # Flatten in linear time; `sum(list_of_lists, [])` re-copies the
        # accumulator on every addition and is quadratic in the batch size.
        concatenated = {
            name: [token for sequence in examples[name] for token in sequence]
            for name in column_names
        }

        total_length = len(concatenated[column_names[0]])
        # Drop the small remainder so every block has exactly `block_size` tokens.
        total_length = (total_length // block_size) * block_size

        result = {
            name: [tokens[start:start + block_size] for start in range(0, total_length, block_size)]
            for name, tokens in concatenated.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    def get_datacollator(self):
        """Build ``self.lm_datasets`` and ``self.data_collator`` from the corpus file."""
        if self.tokenizer is None:
            # Allows calling this method on its own, without going through `train()`.
            self.tokenizer = XLNetTokenizer.from_pretrained(self.model_path)

        datasets = load_dataset("text", data_files={"train": self.file_path})

        tokenized_datasets = datasets.map(
            self.tokenize_function,
            batched=True,
            num_proc=4,
            remove_columns=['text'],
        )
        self.lm_datasets = tokenized_datasets.map(
            self.group_texts,
            batched=True,
        )

        self.data_collator = DataCollatorForPermutationLanguageModeling(tokenizer=self.tokenizer)

    def train(self):
        """Run pre-training and save the model and tokenizer to ``output_dir``."""
        os.makedirs(self.output_dir, exist_ok=True)

        self.tokenizer = XLNetTokenizer.from_pretrained(self.model_path)

        self.get_datacollator()

        if self.checkpoint is None:
            # No checkpoint: start from a randomly initialised model.
            config = XLNetConfig(
                n_layer=12,
                d_model=768,
                n_head=12,
                d_inner=4096,
                dropout=0.1,
                dropatt=0.1,
                bi_data=True,
                model_type='xlnet',
                vocab_size=self.tokenizer.vocab_size,
                bos_token_id=self.tokenizer.bos_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            )
            self.model = XLNetLMHeadModel(config=config)
        else:
            self.model = XLNetLMHeadModel.from_pretrained(self.checkpoint)
        self.model.to(self.device)

        self.training_args = TrainingArguments(
            output_dir=self.output_dir,
            overwrite_output_dir=True,
            do_train=True,
            # A positive `max_steps` takes precedence over `num_train_epochs`.
            num_train_epochs=1,
            per_device_train_batch_size=118,
            learning_rate=4e-4,
            weight_decay=0.01,
            max_steps=self.num_steps,
            adam_epsilon=1e-6,
            warmup_steps=40_000,
            save_steps=1011,
            save_total_limit=2,
            prediction_loss_only=True,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.lm_datasets["train"],
        )

        self.trainer.train(resume_from_checkpoint=self.checkpoint)
        self.trainer.save_model(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)

    def _logged_train_losses(self):
        """Return the training-loss values recorded in the trainer's log history."""
        return [entry['loss'] for entry in self.trainer.state.log_history if 'loss' in entry]

    def showresult(self):
        """Print the logged training losses and render them as an Altair line chart."""
        loss_history = {'train_loss': self._logged_train_losses()}

        source = pd.DataFrame.from_dict(loss_history)
        source = source.set_index(pd.RangeIndex(len(loss_history['train_loss']), name='Steps'))
        source = source.reset_index().melt('Steps', var_name='category', value_name='Value')
        print(source)

        chart = alt.Chart(source).mark_line().encode(
            x='Steps',
            y='Value',
            color='category',
        )
        # A chart built inside a method is never the cell's last expression,
        # so it has to be displayed explicitly or it is silently discarded.
        chart.display()

    def show_result(self):
        """Plot the logged training losses with ``ml_things.plot_dict``."""
        loss_history = {'train_loss': self._logged_train_losses()}

        plot_dict(loss_history, start_step=self.training_args.logging_steps,
                  step_size=self.training_args.logging_steps, use_title='Loss',
                  use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

        print()

In [4]:
class Finetune():
    """Fine-tune an XLNet sequence classifier on labelled comment CSV files.

    The constructor loads configuration, tokenizer and model from
    ``model_path``, moves the model to the available device and builds the
    training and validation data loaders from ``CommentsDataset``.

    Args:
        model_path: Location of the pre-trained XLNet model and tokenizer.
        train_file: CSV file with the training examples.
        valid_file: CSV file with the validation examples.
        batch_size: Batch size used by both data loaders.
    """

    def __init__(self, model_path, train_file, valid_file, batch_size):
        self.model_path = model_path
        self.train_file = train_file
        self.valid_file = valid_file
        self.batch_size = batch_size
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.max_length = 512

        self.labels_ids = {'negatif': 0, 'positif': 1}
        self.n_labels = len(self.labels_ids)

        print('Loading configuraiton...')
        self.model_config = XLNetConfig.from_pretrained(pretrained_model_name_or_path=self.model_path,
                                                        num_labels=self.n_labels)

        # Get model's tokenizer.
        print('Loading tokenizer...')
        self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path=self.model_path)

        # Get the actual model.
        print('Loading model...')
        self.model = XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path=self.model_path,
                                                                    config=self.model_config)

        # Load model to defined device.
        self.model.to(self.device)
        print('Model loaded to `%s`'%self.device)

        print('Dealing with Train...')
        # Create pytorch dataset.
        train_dataset = CommentsDataset(path=self.train_file,
                                        use_tokenizer=self.tokenizer,
                                        labels_ids=self.labels_ids,
                                        max_sequence_len=self.max_length)
        print('Created `train_dataset` with %d examples!'%len(train_dataset))

        # Move pytorch dataset into dataloader.
        self.train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        print('Created `train_dataloader` with %d batches!'%len(self.train_dataloader))

        print()

        print('Dealing with Validation...')
        # Create pytorch dataset.
        valid_dataset = CommentsDataset(path=self.valid_file,
                                        use_tokenizer=self.tokenizer,
                                        labels_ids=self.labels_ids,
                                        max_sequence_len=self.max_length)
        print('Created `valid_dataset` with %d examples!'%len(valid_dataset))

        # Move pytorch dataset into dataloader.
        self.valid_dataloader = DataLoader(valid_dataset, batch_size=self.batch_size, shuffle=False)
        print('Created `valid_dataloader` with %d batches!'%len(self.valid_dataloader))

    def train(self, optimizer, scheduler):
        """Run one training epoch over ``self.train_dataloader``.

        Args:
            optimizer: Optimizer stepped once per batch.
            scheduler: Learning-rate scheduler stepped once per batch.

        Returns:
            Tuple ``(true_labels, predictions_labels, avg_epoch_loss)`` where
            the first two are flat lists of label ids and the last is the
            mean batch loss of the epoch.
        """
        predictions_labels = []
        true_labels = []
        total_loss = 0

        # Put the model into training mode.
        self.model.train()

        for batch in tqdm(self.train_dataloader, total=len(self.train_dataloader)):
            # Keep the gold labels (still on CPU here) for the accuracy computation.
            true_labels += batch['labels'].numpy().flatten().tolist()

            # Move batch to device.
            batch = {k: v.type(torch.long).to(self.device) for k, v in batch.items()}

            self.model.zero_grad()

            outputs = self.model(**batch)
            # The batch contains `labels`, so the first two outputs are loss and logits.
            loss, logits = outputs[:2]

            total_loss += loss.item()

            loss.backward()

            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            logits = logits.detach().cpu().numpy()
            predictions_labels += logits.argmax(axis=-1).flatten().tolist()

        avg_epoch_loss = total_loss / len(self.train_dataloader)

        return true_labels, predictions_labels, avg_epoch_loss

    def validation(self):
        """Evaluate the model on ``self.valid_dataloader`` without gradient tracking.

        Returns:
            Tuple ``(true_labels, predictions_labels, avg_epoch_loss)`` with
            the same layout as the value returned by ``train``.
        """
        predictions_labels = []
        true_labels = []
        total_loss = 0

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        self.model.eval()

        for batch in tqdm(self.valid_dataloader, total=len(self.valid_dataloader)):
            # Keep the gold labels (still on CPU here) for the accuracy computation.
            true_labels += batch['labels'].numpy().flatten().tolist()

            # Move batch to device.
            batch = {k: v.type(torch.long).to(self.device) for k, v in batch.items()}

            with torch.no_grad():
                outputs = self.model(**batch)
                loss, logits = outputs[:2]

                logits = logits.detach().cpu().numpy()
                total_loss += loss.item()
                predictions_labels += logits.argmax(axis=-1).flatten().tolist()

        # Averaged once after the loop, mirroring `train`; computing it inside
        # the loop left the name unbound when the loader yielded no batches.
        avg_epoch_loss = total_loss / len(self.valid_dataloader)

        return true_labels, predictions_labels, avg_epoch_loss

    def showresult(self, data, length):
        """Print per-epoch metrics and render them as an Altair line chart.

        Args:
            data: Mapping of series name to a list with one value per epoch.
            length: Number of epochs, used to build the ``Epochs`` index.
        """
        source = pd.DataFrame.from_dict(data)
        source = source.set_index(pd.RangeIndex(length, name='Epochs'))
        source = source.reset_index().melt('Epochs', var_name='category', value_name='Value')
        print(source)

        chart = alt.Chart(source).mark_line().encode(
            x='Epochs',
            y='Value',
            color='category',
        )
        # A chart built inside a method is never the cell's last expression,
        # so it has to be displayed explicitly or it is silently discarded.
        chart.display()

    def show_result(self, all_loss, all_acc):
        """Plot loss and accuracy curves with ``ml_things.plot_dict``."""
        # Plot loss curves.
        plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'], magnify=0.1)

        # Plot accuracy curves.
        plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'], magnify=0.1)

In [5]:
class CommentsDataset(Dataset):
    """PyTorch dataset of cleaned, tokenized comments read from a CSV file.

    The CSV must provide a ``text`` column and a ``label`` column. Every text
    is normalised with ``clean_text`` and all texts are tokenized in one call,
    padded to the longest sequence and truncated to ``max_sequence_len``.

    Args:
        path: Path of the CSV file.
        use_tokenizer: Callable tokenizer; it is invoked with the list of
            texts and must return a mapping that contains ``input_ids``.
        labels_ids: Mapping from the label strings in the CSV to integer ids.
        max_sequence_len: Truncation length. When ``None`` the tokenizer's own
            maximum length is used.

    Raises:
        ValueError: If ``path`` is not an existing file.
        KeyError: If a label in the CSV is missing from ``labels_ids``.
    """

    def __init__(self, path, use_tokenizer, labels_ids, max_sequence_len=None):
        # The data is read with `pd.read_csv`, so a directory is not acceptable.
        if not os.path.isfile(path):
            raise ValueError('Invalid `path` variable! Needs to be an existing CSV file')

        if max_sequence_len is None:
            # `max_len` is the legacy attribute name; current tokenizers expose
            # `model_max_length` instead.
            max_sequence_len = getattr(use_tokenizer, 'model_max_length', None)
            if max_sequence_len is None:
                max_sequence_len = use_tokenizer.max_len

        print('Reading partitions...')
        df = pd.read_csv(path, on_bad_lines='error')
        texts = [self.clean_text(text) for text in df['text']]
        labels = [labels_ids[label] for label in df['label']]

        # Number of examples.
        self.n_examples = len(labels)

        print('Using tokenizer on all texts. This can take a while...')
        self.inputs = use_tokenizer(
            texts,
            add_special_tokens=True,
            truncation=True,
            padding=True,
            return_tensors='pt',
            max_length=max_sequence_len
        )
        # Length every sequence ended up with after padding / truncation.
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)
        # Store the labels next to the model inputs so `__getitem__` returns them too.
        self.inputs.update({'labels': torch.tensor(labels)})
        print('Finished!\n')

    def clean_text(self, line):
        """Lower-case ``line`` and reduce it to ``a-z`` letters and single spaces.

        Leading or trailing whitespace is collapsed to one space, not stripped.
        """
        # Replace everything that is not a word character, whitespace or hyphen.
        cleanline = re.sub(r"[^\w\s\-]", " ", line).lower()

        # Replace digits.
        cleanline = re.sub('[0-9]', ' ', cleanline)

        # Delete whatever is left that is neither a-z nor whitespace.
        # NOTE(review): this also deletes the hyphens kept by the first step,
        # so "anak-anak" becomes "anakanak" -- confirm that this is intended.
        cleanline = re.sub(r"[^a-z\s]", "", cleanline)

        # Collapse runs of whitespace into a single space.
        cleanline = re.sub(r"\s+", " ", cleanline)
        return cleanline

    def __len__(self):
        """Return the number of examples."""
        return self.n_examples

    def __getitem__(self, item):
        """Return every stored field (model inputs and ``labels``) for index ``item``."""
        return {key: self.inputs[key][item] for key in self.inputs.keys()}

In [1]:
class Main:
    """Driver that runs XLNet pre-training followed by sentiment fine-tuning."""

    def main(self, seed=42):
        """Pre-train, fine-tune, plot the learning curves and save the classifier.

        Args:
            seed: Seed applied with ``set_seed`` before fine-tuning starts so
                that model initialisation and batch shuffling are repeatable.
        """
        # pre-training
        pretrain = Pretrain(
            file_path="../dataset/training/wiki.txt",
            model_path="../pretrained-xlnet-model",
            checkpoint=None,
            output_dir="../pretrained-xlnet-model",
            num_steps=500000
        )

        pretrain.train()
        pretrain.showresult()

        # fine-tuning
        set_seed(seed)

        finetune = Finetune(
            model_path='../pretrained-xlnet-model',
            train_file='../dataset/training/train.csv',
            valid_file='../dataset/training/valid.csv',
            batch_size=16
        )

        epochs = 3
        learning_rate = 2e-5
        adam_epsilon = 1e-6
        weight_decay = 0.01

        # `torch.optim.AdamW` is the replacement for the deprecated
        # `transformers.AdamW`; the hyper-parameters are unchanged.
        optimizer = torch.optim.AdamW(
            finetune.model.parameters(),
            lr=learning_rate,
            eps=adam_epsilon,
            weight_decay=weight_decay,
        )

        # The scheduler is stepped once per batch, so it spans every batch of every epoch.
        total_steps = len(finetune.train_dataloader) * epochs

        # Create the learning rate scheduler.
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)

        # Store the average loss / accuracy after each epoch so we can plot them.
        all_loss = {'train_loss': [], 'val_loss': []}
        all_acc = {'train_acc': [], 'val_acc': []}

        # Loop through each epoch.
        print('Epoch')
        for epoch in tqdm(range(epochs)):
            print()
            print('Training on batches...')
            # Perform one full pass over the training set.
            train_labels, train_predict, train_loss = finetune.train(optimizer, scheduler)
            train_acc = accuracy_score(train_labels, train_predict)

            # Get predictions from the model on the validation data.
            print('Validation on batches...')
            valid_labels, valid_predict, val_loss = finetune.validation()
            val_acc = accuracy_score(valid_labels, valid_predict)

            # Print loss and accuracy values to see how training evolves.
            print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, val_loss, train_acc, val_acc))
            print()

            # Store the values for plotting the learning curves.
            all_loss['train_loss'].append(train_loss)
            all_loss['val_loss'].append(val_loss)
            all_acc['train_acc'].append(train_acc)
            all_acc['val_acc'].append(val_acc)

        finetune.show_result(all_loss, all_acc)

        finetune.showresult(all_loss, len(all_loss['train_loss']))
        finetune.showresult(all_acc, len(all_acc['train_acc']))

        finetune.model.save_pretrained('../xlnetmodel/model-1')
        finetune.tokenizer.save_pretrained('../xlnetmodel/model-1')

# Entry point: run the full pipeline (pre-training, then fine-tuning).
# `Main.main` instantiates `Pretrain` and `Finetune`, so the cells defining
# those classes (and `CommentsDataset`) must have been executed first.
if __name__ == '__main__':
    # Note: this rebinds the module-level name `main` to the `Main` instance.
    main = Main()
    main.main()

NameError: name 'Pretrain' is not defined