# Training a BERT sentiment analysis model

In [3]:
import nuclio

## Environment

In [4]:
%nuclio cmd -c python -m pip install transformers==3.0.1 torch

## Function

In [5]:
#nuclio: start-code

In [31]:
import os
import pandas as pd
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup, BertModel
import torch
import torch.nn as nn
from torch.utils import data
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
from collections import defaultdict
from mlrun.artifacts import PlotArtifact, ChartArtifact
from mlrun.datastore import DataItem
from mlrun import MLClientCtx

In [7]:
class BertSentimentClassifier(nn.Module):
    """Pretrained BERT encoder topped with dropout, a linear layer and a softmax.

    ``forward`` returns the softmax (taken over dim 1) of the linear head's
    output, which has ``n_classes`` columns.
    """

    def __init__(self, pretrained_model, n_classes):
        """
        :param pretrained_model: identifier handed to ``BertModel.from_pretrained``
        :param n_classes:        number of output units of the linear head
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.dropout = nn.Dropout(p=0.2)
        self.out_linear = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Run the encoder and classify its pooled (second) output."""
        encoder_outputs = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask)
        # Only the second of the encoder's two outputs is used.
        _, pooled = encoder_outputs
        scores = self.out_linear(self.dropout(pooled))
        # NOTE(review): the training code in this file passes this softmax
        # output to nn.CrossEntropyLoss, which applies log-softmax itself --
        # confirm whether returning raw scores was intended.
        return self.softmax(scores)

In [8]:
class ReviewsDataset(data.Dataset):
    """Dataset that tokenizes one review per item.

    Each item is a dict with the keys ``input_ids``, ``attention_mask`` and
    ``targets``.
    """

    def __init__(self, review, target, tokenizer, max_len):
        """
        :param review:    indexable collection of review texts
        :param target:    indexable collection of labels, aligned with ``review``
        :param tokenizer: object exposing ``encode_plus`` (e.g. a BertTokenizer)
        :param max_len:   length every review is padded / truncated to
        """
        self.review = review
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    def __getitem__(self, item):
        # str() so that non-string cells can still be tokenized.
        review = str(self.review[item])
        enc = self.tokenizer.encode_plus(
            review,
            max_length=self.max_len,
            add_special_tokens=True,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; it belongs to the same tokenizer API
            # as the `truncation` argument below.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')

        # squeeze(0) drops the leading dimension of the returned tensors
        # (assumed to be a batch dimension of size 1 -- TODO confirm).
        return {'input_ids': enc['input_ids'].squeeze(0),
                'attention_mask': enc['attention_mask'].squeeze(0),
                'targets': torch.tensor(self.target[item], dtype=torch.long)}

In [9]:
def score_to_sents(score):
    """Convert a review score into a sentiment class index.

    Scores up to and including 2 map to 0, a score of exactly 3 maps to 1,
    and every other score maps to 2.
    """
    if score <= 2:
        return 0
    return 1 if score == 3 else 2

In [10]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    """Wrap the ``content`` / ``sentiment`` columns of ``df`` in a DataLoader.

    :param df:         frame with ``content`` and ``sentiment`` columns
    :param tokenizer:  tokenizer forwarded to ``ReviewsDataset``
    :param max_len:    token length forwarded to ``ReviewsDataset``
    :param batch_size: batch size of the returned loader

    :returns: a ``DataLoader`` over the dataset, using 4 worker processes
    """
    # Converted to arrays so the dataset receives them without the frame's index.
    texts = df.content.to_numpy()
    labels = df.sentiment.to_numpy()

    reviews = ReviewsDataset(review=texts,
                             target=labels,
                             tokenizer=tokenizer,
                             max_len=max_len)
    loader = data.DataLoader(reviews, batch_size=batch_size, num_workers=4)
    return loader

In [11]:
def train_epoch(
    model,
    data_loader,
    criterion,
    optimizer,
    scheduler,
    n_examples,
    device
):
    """Run one optimisation pass over ``data_loader``.

    For every batch the model output is scored with ``criterion``, gradients
    are clipped to a norm of 1.0, and the optimizer and scheduler are stepped.

    :returns: ``(accuracy, mean_loss)`` -- correct predictions divided by
              ``n_examples`` (a double tensor) and the mean of the per-batch
              loss values
    """
    model.train()
    batch_losses = []
    n_correct = 0

    for step, batch in enumerate(data_loader):
        # Progress line every 50 batches.
        if step % 50 == 0:
            print(f'batch {step + 1}/ {len(data_loader)}')

        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['targets'].to(device)

        scores = model(input_ids=ids, attention_mask=mask)
        batch_loss = criterion(scores, labels)

        # Index of the highest score per row is the predicted class.
        predicted = torch.max(scores, dim=1)[1]
        n_correct += (predicted == labels).sum()
        batch_losses.append(batch_loss.item())

        batch_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [12]:
def eval_model(
    model,
    data_loader,
    criterion,
    n_examples,
    device
):
    """Score ``model`` on ``data_loader`` without computing gradients.

    :returns: ``(accuracy, mean_loss)`` -- correct predictions divided by
              ``n_examples`` (a double tensor) and the mean of the per-batch
              loss values
    """
    print('evaluation')
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for step, batch in enumerate(data_loader):
            # Progress line every 50 batches.
            if step % 50 == 0:
                print(f'batch {step + 1}/ {len(data_loader)}')

            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['targets'].to(device)

            scores = model(input_ids=ids, attention_mask=mask)
            batch_loss = criterion(scores, labels)

            # Index of the highest score per row is the predicted class.
            predicted = torch.max(scores, dim=1)[1]
            n_correct += (predicted == labels).sum()
            batch_losses.append(batch_loss.item())

    accuracy = n_correct.double() / n_examples
    return accuracy, np.mean(batch_losses)

In [13]:
def eval_on_test(model_path, data_loader, n_examples,
                 pretrained_model='bert-base-cased', n_classes=3, device=None):
    """Load saved classifier weights and measure accuracy on ``data_loader``.

    :param model_path:       path of a state dict written with ``torch.save``
    :param data_loader:      iterable of batches with ``input_ids``,
                             ``attention_mask`` and ``targets`` entries
    :param n_examples:       number of examples used as the accuracy denominator
    :param pretrained_model: BERT identifier used to rebuild the classifier
    :param n_classes:        number of output classes of the classifier
    :param device:           torch device to evaluate on; when None, the first
                             CUDA device is used if available, otherwise the CPU

    :returns: correct predictions divided by ``n_examples`` (a double tensor)
    """
    # `device` used to be read from an undefined global name.
    if device is None:
        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

    # The classifier constructor takes the pretrained model name as well as
    # the class count; the previous single-argument call could not succeed.
    model = BertSentimentClassifier(pretrained_model, n_classes).to(device)
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Tensor accumulator so `.double()` below also works for an empty loader.
    correct_preds = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for i, d in enumerate(data_loader):
            if i % 50 == 0:
                print(f'batch {i + 1}/ {len(data_loader)}')

            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, pred = torch.max(outputs, dim=1)
            correct_preds += torch.sum(pred == targets)
    return correct_preds.double() / n_examples

In [14]:
def train_sentiment_analysis_model(context: MLClientCtx,
                                   reviews_dataset: DataItem,
                                   pretrained_model: str = 'bert-base-cased',
                                   models_dir: str = 'models',
                                   model_filename: str = 'bert_sentiment_analysis_model.pt',
                                   MAX_LEN: int = 128,
                                   BATCH_SIZE: int = 16,
                                   EPOCHS: int = 50,
                                   random_state: int = 42):
    """Fine-tune a BERT sentiment classifier on a reviews dataset and log it.

    The reviews are split 80/10/10 into train, dev and test sets. After every
    epoch the model is evaluated on the dev set and the weights with the best
    dev accuracy are saved. Once training ends, a per-epoch summary chart is
    logged, the best weights are evaluated on the test set and the model is
    logged with its metrics.

    :param context:          MLRun context used for logging and artifacts
    :param reviews_dataset:  data item holding 'content' and 'score' columns
    :param pretrained_model: name of the pretrained BERT model / tokenizer
    :param models_dir:       sub-directory of the artifact path for the weights
    :param model_filename:   file name of the saved state dict
    :param MAX_LEN:          token length every review is padded / truncated to
    :param BATCH_SIZE:       batch size of all data loaders
    :param EPOCHS:           number of training epochs
    :param random_state:     seed for the train / dev / test split
    """
    # Check for CPU or GPU
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    base_path = os.path.abspath('..')
    plots_path = os.path.join(base_path, 'plots')
    context.logger.info(f'Using {device}')

    models_basepath = os.path.join(context.artifact_path, models_dir)
    os.makedirs(models_basepath, exist_ok=True)
    model_filepath = os.path.join(models_basepath, model_filename)

    # Get dataset
    df = reviews_dataset.as_df()

    # Save score plot. The explicit copy makes the later column assignment
    # write to an independent frame rather than to a slice of the input.
    df = df[['content', 'score']].copy()
    sns.distplot(df.score)
    context.log_artifact(PlotArtifact("reviews-scores", body=plt.gcf()),
                         target_path=f"{plots_path}/reviews-scores.html")

    # Turn scores to sentiment label
    df['sentiment'] = df['score'].apply(score_to_sents)

    # Load bert tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)

    # Token count per review. Iterating the column directly avoids a chained
    # .loc lookup per row; str() matches how ReviewsDataset reads each review.
    lens = [len(tokenizer.encode(str(review))) for review in df['content']]
    max_length = max(lens)
    context.logger.info(f'longest review: {max_length}')
    plt.clf()
    sns.distplot(lens)
    context.log_artifact(PlotArtifact("reviews-lengths", body=plt.gcf()),
                         target_path=f"{plots_path}/reviews-lengths.html")

    # Create training, validation and test datasets (80 / 10 / 10)
    df_train, df_test = train_test_split(df, test_size=0.2, random_state=random_state)
    df_dev, df_test = train_test_split(df_test, test_size=0.5, random_state=random_state)

    # Create dataloaders for all datasets
    train_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
    dev_loader = create_data_loader(df_dev, tokenizer, MAX_LEN, BATCH_SIZE)
    test_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

    # Load the bert sentiment classifier base
    model = BertSentimentClassifier(pretrained_model, n_classes=3).to(device)

    # training
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    criterion = nn.CrossEntropyLoss().to(device)

    history = defaultdict(list)
    best_acc = 0
    # Tracks whether a checkpoint exists, so one is written even when the dev
    # accuracy never rises above the initial best_acc of 0.
    model_saved = False

    context.logger.info('Started training the model')
    for epoch in range(EPOCHS):
        train_acc, train_loss = train_epoch(
            model,
            train_loader,
            criterion,
            optimizer,
            scheduler,
            len(df_train),
            device
        )

        dev_acc, dev_loss = eval_model(
            model,
            dev_loader,
            criterion,
            len(df_dev),
            device
        )

        # Plain floats keep tensors out of the history, chart and metrics.
        train_acc, train_loss = float(train_acc), float(train_loss)
        dev_acc, dev_loss = float(dev_acc), float(dev_loss)

        # Append results to history
        history['train_acc'].append(train_acc)
        history['train_loss'].append(train_loss)
        history['dev_acc'].append(dev_acc)
        history['dev_loss'].append(dev_loss)
        context.logger.info(f'Epoch: {epoch + 1}/{EPOCHS}: Train loss: {train_loss}, accuracy: {train_acc} Val loss: {dev_loss}, accuracy: {dev_acc}')

        if not model_saved or dev_acc > best_acc:
            torch.save(model.state_dict(), model_filepath)
            context.logger.info(f'Updating model, Current models is better then the previous one ({best_acc} vs. {dev_acc}).')
            best_acc = dev_acc
            model_saved = True

    # Everything below runs once, after the last epoch. It used to sit inside
    # the epoch loop, where indexing the history over range(EPOCHS) ran past
    # the epochs completed so far.
    if not model_saved:
        context.logger.info('No training epoch was run, nothing to log')
        return

    summary_path = f"{plots_path}/training-summary.html"
    chart = ChartArtifact('summary.html')
    chart.header = ['epoch', 'accuracy', 'val_accuracy', 'loss', 'val_loss']
    epoch_rows = zip(history['train_acc'], history['dev_acc'],
                     history['train_loss'], history['dev_loss'])
    for epoch_number, (acc, val_acc, loss, val_loss) in enumerate(epoch_rows, start=1):
        # Cell order follows chart.header.
        chart.add_row([epoch_number, acc, val_acc, loss, val_loss])
    context.log_artifact(chart, target_path=summary_path)

    # Evaluate the best checkpoint (not necessarily the last epoch's weights)
    # on the held-out test set, reusing the classifier that is already built.
    model.load_state_dict(torch.load(model_filepath, map_location=device))
    test_acc, _ = eval_model(model, test_loader, criterion, len(df_test), device)
    test_acc = float(test_acc)
    context.logger.info(f'Received {test_acc} on test dataset')

    context.log_model(key='bert_sentiment_analysis_model',
                      model_file=model_filepath,
                      metrics={'train_accuracy': train_acc,
                               'train_loss': train_loss,
                               'best_acccuracy': best_acc,
                               'validation_accuracy': dev_acc,
                               'validation_loss': dev_loss,
                               'test_accuracy': test_acc},
                      parameters={'pretrained_model': pretrained_model,
                                  'MAX_LEN': MAX_LEN,
                                  'BATCH_SIZE': BATCH_SIZE,
                                  'EPOCHS': EPOCHS,
                                  'random_state': random_state},
                      # Linked by path, like the lengths plot, instead of
                      # passing the raw history dict.
                      extra_data={'reviews_length': f"{plots_path}/reviews-lengths.html",
                                  'training_history': summary_path})
    

In [15]:
#nuclio: end-code

## Test locally

In [16]:
from mlrun import code_to_function, mount_v3io, run_local, NewTask

# Reviews CSV, looked up in the data/ folder of the parent directory.
reviews_datafile = os.path.join(os.path.abspath('..'), 'data', 'reviews.csv')
pretrained_model = 'bert-base-cased'

# Task description: a single epoch, with the CSV passed as the
# 'reviews_dataset' input of the handler.
task = NewTask(params={'pretrained_model': pretrained_model,
                       'EPOCHS': 1},
               inputs={'reviews_dataset': reviews_datafile})
# Local run is disabled; presumably uncommenting this executes the handler
# in the current kernel -- confirm before relying on it.
# lrun = run_local(task, handler=train_sentiment_analysis_model)

## Deploy to cluster

In [29]:
# Build an MLRun 'job' function whose handler is the training function above
# (presumably taken from the section between the nuclio start-code /
# end-code markers -- confirm).
fn = code_to_function(name='train_sentiment_analysis',
                      project='stocks',
                      kind='job',
                      image='mlrun/ml-models-gpu',
                      handler='train_sentiment_analysis_model')
# Request one GPU for the job.
fn.gpus(1)
# Apply the v3io mount to the function.
fn.apply(mount_v3io())

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f02f1fb3b00>

In [None]:
# Run the task with the function, using the parent directory as artifact path.
run = fn.with_code().run(task, artifact_path=os.path.abspath('../'))

> 2020-09-13 14:00:44,872 [info] starting run train-sentiment-analysis-train_sentiment_analysis_model uid=5ee470ffb9a442a0bf25478d0309f4cd  -> http://mlrun-api:8080
> 2020-09-13 14:00:45,026 [info] Job is running in the background, pod: train-sentiment-analysis-train-sentiment-analysis-model-qnrwr
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 508kB/s]  
Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
Downloading: 100%|██████████| 433/433 [00:00<00:00, 226kB/s]
Downloading: 100%|██████████| 436M/436M [00:12<00:00, 33.5MB/s] 


In [70]:
# Show the outputs of the cluster run (the stray leading '.' was a syntax error).
run.outputs

{}