In [1]:
from tqdm import tqdm
import pandas as pd
import os
from functools import partial
import numpy as np
import time

import torch
import torch.nn as nn

from torch.utils.data import DataLoader, random_split
from torch.utils.data.dataset import Dataset
from transformers import BertPreTrainedModel, BertTokenizer, BertConfig, BertModel, AutoConfig
from functools import partial
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.environ["CUDA_VISIBLE_DEVICES"] = '3'
from google.colab import drive
drive.mount('/content/drive')
os.chdir("drive/MyDrive/scripted-character-emotion-recognition")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
with open('data/train_dataset_v2.tsv', 'r', encoding='utf-8') as handler:
    lines = handler.read().split('\n')[1: -1]

    data = list()
    for line in tqdm(lines):
        sp = line.split('\t')
        if len(sp) != 4:
            print("Error: ", sp)
            continue
        data.append(sp)

100%|██████████| 42790/42790 [00:00<00:00, 889957.99it/s]


In [3]:
train = pd.DataFrame(data)
train.columns = ['id', 'content', 'character', 'emotions']

test = pd.read_csv('data/test_dataset.tsv', sep='\t')
submit = pd.read_csv('data/submit_example.tsv', sep='\t')

train = train[train['emotions'] != '']

train['text'] = train['content'].astype(str)  + ' 角色: ' + train['character'].astype(str)
test['text'] = test['content'].astype(str) + ' 角色: ' + test['character'].astype(str)

train['emotions'] = train['emotions'].apply(lambda x: [int(_i) for _i in x.split(',')])

target_cols = ['love', 'joy', 'fright', 'anger', 'fear', 'sorrow']
train[target_cols] = train['emotions'].values.tolist()
test[target_cols] = [0, 0, 0, 0, 0, 0]

train.to_csv('data/tarin.csv', columns=['id', 'content', 'character', 'text', 'love', 'joy', 'fright', 'anger', 'fear', 'sorrow'],
             sep='\t', index=False)

test.to_csv('data/test.csv', columns=['id', 'content', 'character', 'text', 'love', 'joy', 'fright', 'anger', 'fear', 'sorrow'],
             sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['text'] = train['content'].astype(str)  + ' 角色: ' + train['character'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['emotions'] = train['emotions'].apply(lambda x: [int(_i) for _i in x.split(',')])


In [4]:
class RoleDataset(Dataset):
    def __init__(self, tokenizer, max_len, mode='train'):
        super(RoleDataset, self).__init__()
        if mode == 'train':
            self.data = pd.read_csv('data/train.csv', sep='\t')
        else:
            self.data = pd.read_csv('data/test.csv', sep='\t')
        self.texts = self.data['text'].tolist()  # [..., ]
        self.labels = self.data[target_cols].to_dict('records')  # [{'love': 0, 'joy': 0, ...}, {'love': 0, ...}, ...]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]  # {'love': 0, 'joy': 0, ...}

        encoding = self.tokenizer.encode_plus(text,
                                              add_special_tokens=True,
                                              return_token_type_ids=True, return_attention_mask=True,
                                              max_length=self.max_len, padding='max_length',
                                              truncation=True, return_tensors='pt')
        # return_tensors='pt' makes the returning data [batch_size, dim]-like torch tensors, instead of a simple py-list

        sample = {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten()
        }
        sample['emotions'] = torch.tensor([label[label_col] / 3.0 for label_col in target_cols], dtype=torch.float)
        # possible emotion value: 0, 1, 2, 3, scale

        return sample

    def __len__(self):
        return len(self.texts)


In [5]:
def create_dataloader(dataset, batch_size, mode='train'):
    shuffle = True if mode == 'train' else False
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [6]:
def init_params(module_list):
    for module in module_list:
        for param in module.parameters():
            if param.dim() > 1:
                torch.nn.init.xavier_uniform_(param)
    return

In [7]:
class IQIYIModelLite(nn.Module):
    def __init__(self, n_classes, model_name):
        super(IQIYIModelLite, self).__init__()
        config = AutoConfig.from_pretrained(model_name)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.base = BertModel.from_pretrained(model_name, config=config)

        dim = 1024 if 'large' in model_name else 768

        self.attention = nn.Sequential(
            nn.Linear(dim, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)  # softmax across words in one seqence
        )

        # self.out = nn.ModuleList([nn.Sequential(nn.Linear(dim, n_classes)) for _ in range(len(target_cols))])
        self.out = nn.Sequential(nn.Linear(dim, n_classes))
        init_params([self.out, self.attention])

    def forward(self, input_ids, attention_mask):
        roberta_output = self.base(input_ids=input_ids, attention_mask=attention_mask)

        last_layer_hidden_states = roberta_output.hidden_states[-1]  # (batch_size, seq_len, dim)
        weights = self.attention(last_layer_hidden_states)

        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)  # representation of the sequence, (batch_size, dim)

        return self.out(context_vector)

In [8]:
PRE_TRAINED_MODEL_NAME = 'hfl/chinese-roberta-wwm-ext'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

In [9]:
EPOCHS = 2
weight_decay = 0.0
data_path = 'data'
warmup_proportion = 0.0
batch_size = 16
lr = 1e-5
max_len = 128
train_pct = 0.8

warmup_ratio = 0

labelledset = RoleDataset(tokenizer, max_len, mode='train')
trainsize = int(train_pct * len(labelledset))
trainset, valset = random_split(labelledset, [trainsize, len(labelledset) - trainsize])

train_loader = create_dataloader(trainset, batch_size, mode='train')
val_loader = create_dataloader(valset, batch_size, mode='val')

testset = RoleDataset(tokenizer, max_len, mode='test')
test_loader = create_dataloader(testset, batch_size, mode='test')

model = IQIYIModelLite(n_classes=len(target_cols), model_name=PRE_TRAINED_MODEL_NAME).to(device)
if torch.cuda.device_count() > 1:
		model = nn.DataParallel(model)

optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
	optimizer,
	num_warmup_steps=warmup_ratio * total_steps,
	num_training_steps=total_steps
)

criterion = nn.BCEWithLogitsLoss(reduction='none').to(device)

pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [10]:
def train(model, data_loader, criterion, optimizer, scheduler, metric=None):
    print("Training start...")
    model.train()
    global_step = 0
    tic_train = time.time()
    log_steps = 100
    for epoch in range(EPOCHS):
        losses = []
        for step, sample in enumerate(data_loader):
            input_ids = sample['input_ids'].to(device)
            attention_mask = sample['attention_mask'].to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)  # (batch_size, class)

            loss = criterion(output, sample['emotions'].to(device))
            loss = loss.mean(axis=0).sum()
            losses.append(loss.item())
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            global_step += 1
            if global_step % log_steps == 0:
                print("global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s, lr: %.10f"
                      % (global_step, epoch + 1, step + 1, np.mean(losses), global_step / (time.time() - tic_train),
                         float(scheduler.get_last_lr()[0])))

In [11]:
train(model, train_loader, criterion, optimizer, scheduler)

Training start...
global step 100, epoch: 1, batch: 100, loss: 1.18705, speed: 3.02 step/s, lr: 0.0000097283
global step 200, epoch: 1, batch: 200, loss: 1.10871, speed: 3.01 step/s, lr: 0.0000094565
global step 300, epoch: 1, batch: 300, loss: 1.08313, speed: 2.94 step/s, lr: 0.0000091848
global step 400, epoch: 1, batch: 400, loss: 1.05432, speed: 2.85 step/s, lr: 0.0000089130
global step 500, epoch: 1, batch: 500, loss: 1.03206, speed: 2.83 step/s, lr: 0.0000086413
global step 600, epoch: 1, batch: 600, loss: 1.00708, speed: 2.80 step/s, lr: 0.0000083696
global step 700, epoch: 1, batch: 700, loss: 0.98747, speed: 2.78 step/s, lr: 0.0000080978
global step 800, epoch: 1, batch: 800, loss: 0.97371, speed: 2.77 step/s, lr: 0.0000078261
global step 900, epoch: 1, batch: 900, loss: 0.96574, speed: 2.76 step/s, lr: 0.0000075543
global step 1000, epoch: 1, batch: 1000, loss: 0.96071, speed: 2.75 step/s, lr: 0.0000072826
global step 1100, epoch: 1, batch: 1100, loss: 0.95496, speed: 2.75 st

In [20]:
def eval(model, data_loader):
    model.eval()
    pred = []
    labels = []
    for step, batch in tqdm(enumerate(data_loader)):
        b_input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['emotions']
        with torch.no_grad():
            logistics = model(input_ids=b_input_ids, attention_mask=attention_mask)
            out2 = logistics.sigmoid() * 3.0
            pred.append(out2.cpu().numpy())
            labels.append(label.cpu().numpy())
    return np.concatenate(pred, axis=0), np.concatenate(labels, axis=0)

def score(preds, labels):
    rmse = np.sqrt((((preds - labels) ** 2).sum() / (preds.shape[0] * preds.shape[1])))
    return 1 / (1 + rmse)

val_pred, val_labels = eval(model, val_loader)
print(score(val_pred, val_labels))

test_pred, _ = eval(model, test_loader)

460it [00:51,  8.94it/s]


0.7851538474853487


1336it [02:35,  8.57it/s]


In [21]:
submit = pd.read_csv('data/submit_example.tsv', sep='\t')
print(len(test_pred))

21376


In [22]:
sub = submit.copy()

sub['emotion'] = test_pred.tolist()
sub['emotion'] = sub['emotion'].apply(lambda x: ','.join([str(i) for i in x]))
sub.to_csv('data/baseline_{}.tsv'.format(PRE_TRAINED_MODEL_NAME.split('/')[-1]), sep='\t', index=False)
sub.head()

Unnamed: 0,id,emotion
0,34170_0002_A_12,"0.012768270447850227,0.015051975846290588,0.10..."
1,34170_0002_A_14,"0.0030106264166533947,0.002574578858911991,0.0..."
2,34170_0003_A_16,"0.012103961780667305,0.019572891294956207,0.01..."
3,34170_0003_A_17,"0.08836838603019714,0.008623330853879452,0.011..."
4,34170_0003_A_18,"0.03621571511030197,0.005680045112967491,0.041..."
