# **necessary packages**

In [None]:
! pip install transformers==3.0.2

Collecting transformers==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 13.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 30.3MB/s 
[?25hCollecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 47.0MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64

In [None]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
from torch import cuda
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is a plain string that later cells pass to `.to(...)`.
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
device

'cuda'

# **load dataframe**

In [None]:
# Read the cleaned training CSV.
# NOTE(review): hard-coded absolute path (presumably a mounted Google Drive in Colab) —
# adjust it when running elsewhere.
data = pd.read_csv('/content/drive/My Drive/goemotions_aug_dairai_train_cleaned.csv')

In [None]:
# Drop bookkeeping columns that are not used for training.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns=['Unnamed: 0', 'datasource', 'augmented'], errors='ignore')

In [None]:
data.head()

Unnamed: 0,anger,cleaned_processed,fear,joy,sadness
0,0.0,i feel that some korea guy are handsome and so...,0.0,1.0,0.0
1,0.0,i put my pen to paper and made a list of thing...,0.0,0.0,0.0
2,1.0,i wish i only had to feel the pain of the pett...,0.0,0.0,0.0
3,0.0,i feel passionate about this journey and stand...,0.0,1.0,0.0
4,0.0,i feel like i have convinced myself of these f...,0.0,1.0,0.0


In [None]:
# Two-column training frame: the raw text and a per-row multi-hot label list
# ordered as anger, fear, joy, sadness.
new_df = pd.DataFrame({
    'text': data['cleaned_processed'],
    'labels': data[['anger', 'fear', 'joy', 'sadness']].values.astype('int').tolist(),
})

In [None]:
new_df.head()

Unnamed: 0,text,labels
0,i feel that some korea guy are handsome and so...,"[0, 0, 1, 0]"
1,i put my pen to paper and made a list of thing...,"[0, 0, 0, 0]"
2,i wish i only had to feel the pain of the pett...,"[1, 0, 0, 0]"
3,i feel passionate about this journey and stand...,"[0, 0, 1, 0]"
4,i feel like i have convinced myself of these f...,"[0, 0, 1, 0]"


# **make Dataset**

In [None]:
# Hyper-parameters shared by the cells below.
MAX_LEN = 100            # max_length handed to the tokenizer for every example
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 32
EPOCHS = 1               # passes per run of a training cell (overridden later for the full-data run)
LEARNING_RATE = 3e-05
# NOTE(review): `truncation` is normally an encode-time argument; confirm that passing it to
# from_pretrained has any effect.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
class MultiLabelDataset(Dataset):
    """Map-style dataset that tokenizes one text row and pairs it with its label vector.

    Args:
        dataframe: DataFrame exposing a ``text`` column and a ``labels`` column;
            each ``labels`` entry must be convertible to a float tensor
            (the notebook stores a list of 0/1 ints per row).
        tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
        max_len: ``max_length`` passed to the tokenizer for every example.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return a dict of tensors (``ids``, ``mask``, ``token_type_ids``, ``targets``) for row ``index``."""
        # Positional lookup: samplers hand out 0..len-1, which only coincides with
        # label-based lookup when the frame carries a fresh RangeIndex.
        text = str(self.text.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            # Ask for truncation explicitly instead of relying on the tokenizer's
            # implicit fallback when only max_length is given.
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [None]:
train_size = 0.7
# Draw the training rows reproducibly (fixed random_state); every row that was not
# drawn becomes the test split. Both splits get a fresh 0..n-1 index.
sampled_rows = new_df.sample(frac=train_size, random_state=200)
test_data = new_df.drop(sampled_rows.index).reset_index(drop=True)
train_data = sampled_rows.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

# Wrap each split so it can be fed to a DataLoader.
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (460551, 2)
TRAIN Dataset: (322386, 2)
TEST Dataset: (138165, 2)


# **make dataloader**

In [None]:
# Training batches are reshuffled on every pass over the loader.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not benefit from shuffling; a fixed order keeps the collected
# outputs aligned with the rows of the test frame and makes runs comparable.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# **define DistilBertClass**

In [None]:
class DistilBERTClass(torch.nn.Module):
    """Pretrained DistilBERT encoder followed by a small head that emits 4 logits per example."""

    def __init__(self):
        super().__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return raw logits (no sigmoid applied).

        ``token_type_ids`` is accepted so callers can pass it, but it is not
        forwarded to the encoder.
        """
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # Take index 0 along the second axis of the encoder's first output
        # (presumably the [CLS] position — confirm).
        first_position = encoder_out[0][:, 0]
        features = torch.tanh(self.pre_classifier(first_position))
        return self.classifier(self.dropout(features))

# Build the model with pretrained encoder weights and a freshly initialised head.
model = DistilBERTClass()
# Left disabled: the training cells below call model.to(device) after loading a checkpoint.
# model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




# **define loss function and optimizer**

In [None]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits with the default (mean) reduction.

    Args:
        outputs: logits tensor.
        targets: float tensor of the same shape as ``outputs``.
    """
    bce = torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)
    return bce

In [None]:

# Adam over the parameters of the object currently bound to `model`.
# NOTE(review): if `model` is later rebound to a different object (e.g. via torch.load),
# this optimizer still refers to the old parameters and must be rebuilt.
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

# **define training process**

In [None]:
def train(epoch, loader=None):
    """Run one optimisation pass over a loader, printing the loss every 500 batches.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.

    Args:
        epoch: epoch number; only used in the printed log line.
        loader: iterable of batches with 'ids', 'mask', 'token_type_ids' and
            'targets' entries. Defaults to the notebook-level ``training_loader``.
    """
    if loader is None:
        loader = training_loader

    model.train()
    # Wrapping the loader itself (not enumerate) lets tqdm see its length and show a total.
    for step, batch in enumerate(tqdm(loader)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, immediately before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Resume from the saved epoch-4 model. torch.load unpickles a whole model object,
# so only load files you trust. map_location keeps this working on a CPU-only runtime.
model = torch.load('/content/drive/My Drive/distilbert_demo_emotions_11_1_epoch4.bin', map_location=device)
model.to(device)
# `model` now names a different object from the one the optimizer was built on;
# rebuild the optimizer so optimizer.step() updates the loaded weights.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.07957722246646881


500it [04:35,  1.81it/s]

Epoch: 0, Loss:  0.07897035777568817


1000it [09:09,  1.83it/s]

Epoch: 0, Loss:  0.08990435302257538


1500it [13:43,  1.83it/s]

Epoch: 0, Loss:  0.10907003283500671


2000it [18:18,  1.83it/s]

Epoch: 0, Loss:  0.06448429077863693


2500it [22:52,  1.82it/s]

Epoch: 0, Loss:  0.09330804646015167


3000it [27:26,  1.82it/s]

Epoch: 0, Loss:  0.08278612792491913


3500it [32:00,  1.81it/s]

Epoch: 0, Loss:  0.05188858509063721


4000it [36:34,  1.83it/s]

Epoch: 0, Loss:  0.09143951535224915


4500it [41:08,  1.82it/s]

Epoch: 0, Loss:  0.06304386258125305


5000it [45:43,  1.82it/s]

Epoch: 0, Loss:  0.0636063665151596


5038it [46:04,  1.82it/s]


In [None]:
# Continue training the current `model` for another EPOCHS pass(es).
# The printed epoch index restarts at 0 each time this cell is run.
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.029533173888921738


500it [04:35,  1.82it/s]

Epoch: 0, Loss:  0.04037821292877197


1000it [09:09,  1.83it/s]

Epoch: 0, Loss:  0.05222412943840027


1500it [13:43,  1.82it/s]

Epoch: 0, Loss:  0.05400863662362099


2000it [18:16,  1.83it/s]

Epoch: 0, Loss:  0.056569915264844894


2500it [22:50,  1.82it/s]

Epoch: 0, Loss:  0.0391896590590477


3000it [27:24,  1.83it/s]

Epoch: 0, Loss:  0.04864323511719704


3500it [31:58,  1.85it/s]

Epoch: 0, Loss:  0.02874768152832985


4000it [36:31,  1.83it/s]

Epoch: 0, Loss:  0.0316607728600502


4500it [41:04,  1.83it/s]

Epoch: 0, Loss:  0.12156212329864502


5000it [45:38,  1.82it/s]

Epoch: 0, Loss:  0.02846132405102253


5038it [45:58,  1.83it/s]


In [None]:
# One more round of EPOCHS pass(es) on the current `model`;
# the epoch counter in the log restarts at 0 here as well.
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.03433572128415108


500it [04:29,  1.83it/s]

Epoch: 0, Loss:  0.02914523147046566


1000it [09:03,  1.83it/s]

Epoch: 0, Loss:  0.03389030694961548


1500it [13:37,  1.83it/s]

Epoch: 0, Loss:  0.04576798528432846


2000it [18:11,  1.83it/s]

Epoch: 0, Loss:  0.02045069821178913


2500it [22:45,  1.83it/s]

Epoch: 0, Loss:  0.0376528725028038


3000it [27:18,  1.83it/s]

Epoch: 0, Loss:  0.03330596536397934


3500it [31:52,  1.83it/s]

Epoch: 0, Loss:  0.04545937106013298


4000it [36:26,  1.83it/s]

Epoch: 0, Loss:  0.031124170869588852


4500it [41:00,  1.83it/s]

Epoch: 0, Loss:  0.03004983626306057


5000it [45:33,  1.83it/s]

Epoch: 0, Loss:  0.014025482349097729


5038it [45:54,  1.83it/s]


In [None]:
# Reload the epoch-4 model, discarding the in-memory weights from the cells above.
# torch.load unpickles arbitrary objects — only use trusted files.
model = torch.load('/content/drive/My Drive/distilbert_demo_emotions_11_1_epoch4.bin', map_location=device)
model.to(device)
# Rebind the optimizer to the parameters of the freshly loaded object; an optimizer
# built on a previous `model` would leave these weights untouched.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.06805876642465591


500it [07:33,  1.10it/s]

Epoch: 0, Loss:  0.014274067245423794


1000it [15:08,  1.10it/s]

Epoch: 0, Loss:  0.029247058555483818


1500it [22:43,  1.10it/s]

Epoch: 0, Loss:  0.010428293608129025


2000it [30:18,  1.10it/s]

Epoch: 0, Loss:  0.0324266217648983


2500it [37:51,  1.10it/s]

Epoch: 0, Loss:  0.03569381311535835


3000it [45:25,  1.10it/s]

Epoch: 0, Loss:  0.02159108780324459


3500it [52:58,  1.11it/s]

Epoch: 0, Loss:  0.027874628081917763


4000it [1:00:31,  1.10it/s]

Epoch: 0, Loss:  0.019178619608283043


4500it [1:08:04,  1.10it/s]

Epoch: 0, Loss:  0.04247869551181793


5000it [1:15:37,  1.11it/s]

Epoch: 0, Loss:  0.01370925921946764


5038it [1:16:11,  1.10it/s]


In [None]:
# Persist the whole model object (not just its state_dict); loading this file later
# requires the DistilBERTClass definition to be available under the same name.
output_model_file = '/content/drive/My Drive/distilbert_demo_emotions_11_1_epoch5.bin'
torch.save(model, output_model_file)

# **TRAIN ON WHOLE DATASET**

In [None]:
# Dataset over every row of new_df, i.e. the train and test splits combined.
# NOTE(review): a model trained on this set has seen the rows in `testing_set`.
whole_set = MultiLabelDataset(new_df, tokenizer, MAX_LEN)

In [None]:
# Same settings as the earlier train_params definition, repeated so this cell is self-contained.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Shuffled loader over the full dataset.
training_loader_whole_dataset = DataLoader(whole_set, **train_params)

In [None]:
# Copy saved weights into the existing `model` object (the object identity is unchanged).
# map_location lets a checkpoint saved from a GPU load when `device` is 'cpu'.
# NOTE(review): this path uses 'MyDrive' while other cells use 'My Drive' — confirm both resolve.
model.load_state_dict(torch.load('/content/drive/MyDrive/distilbert_demo_emotions_state_dict_11_16_epoch2', map_location=device))
model.to(device)


In [None]:
def train_full(epoch):
    """Run one optimisation pass over ``training_loader_whole_dataset``.

    Relies on the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.
    Prints the loss every 500 batches.

    Args:
        epoch: epoch number; only used in the printed log line.
    """
    model.train()
    # Wrapping the loader itself (not enumerate) lets tqdm see its length and show a total.
    for step, batch in enumerate(tqdm(training_loader_whole_dataset)):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, immediately before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Overrides the EPOCHS value set in the configuration cell for the full-dataset run.
EPOCHS = 6
for epoch in range(EPOCHS):
    train_full(epoch)
    # Checkpoint the weights after every epoch; the file name carries the epoch index.
    torch.save(model.state_dict(), f'/content/drive/My Drive/distilbert_demo_emotions_state_dict_11_17_epoch{epoch}')

# **define validation process**


*   Note: `outputs` and `targets` are tensors that live on the device you selected.
*   To convert them to NumPy arrays, first move them to the CPU and detach them, then call `numpy()`.



In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample ratio |true ∩ pred| / |true ∪ pred| over the active label indices.

    A sample with no active labels in either array scores 1.
    ``normalize`` and ``sample_weight`` are accepted but not used.

    Args:
        y_true: array whose rows mark the true labels (non-zero = active).
        y_pred: array whose rows mark the predicted labels (non-zero = active).

    Returns:
        The mean of the per-sample scores.
    """
    def _sample_score(row_true, row_pred):
        active_true = set(np.nonzero(row_true)[0])
        active_pred = set(np.nonzero(row_pred)[0])
        combined = active_true | active_pred
        if not combined:
            # Nothing expected and nothing predicted counts as a perfect match.
            return 1
        return len(active_true & active_pred) / float(len(combined))

    per_sample = [_sample_score(y_true[row], y_pred[row]) for row in range(y_true.shape[0])]
    return np.mean(per_sample)

In [None]:
def validation(testing_loader):
    """Run the notebook-level ``model`` over a loader without gradient tracking.

    Args:
        testing_loader: iterable of batches with 'ids', 'mask', 'token_type_ids'
            and 'targets' entries.

    Returns:
        ``(outputs, targets)``: two lists of per-example lists; ``outputs`` holds
        the sigmoid of the model's logits and ``targets`` the matching labels.
    """
    model.eval()
    collected_targets = []
    collected_probs = []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(testing_loader, 0)):
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            logits = model(ids, mask, token_type_ids)
            # Move to the CPU before converting to plain Python lists.
            batch_targets = targets.cpu().detach().numpy().tolist()
            batch_probs = torch.sigmoid(logits).cpu().detach().numpy().tolist()
            collected_targets.extend(batch_targets)
            collected_probs.extend(batch_probs)
    return collected_probs, collected_targets

In [None]:
# Collect sigmoid probabilities and labels for the held-out split.
# NOTE(review): if the cells are run top to bottom, `model` has already been trained on
# the full dataset (which contains these test rows), so the scores below would be
# optimistic — confirm which checkpoint is being evaluated.
outputs, targets = validation(testing_loader)

# Binarise: a label counts as predicted when its probability is at least 0.5.
final_outputs = np.array(outputs) >=0.5

4318it [07:33,  9.52it/s]


In [None]:

# Score the binarised predictions with the custom hamming_score and with
# sklearn's hamming_loss.
val_hamming_score = hamming_score(np.asarray(targets), np.asarray(final_outputs))
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.899219532201836
Hamming Loss = 0.027143270727029278


# **Load Model and Inference**

In [None]:
# Load a fully pickled model for inference. torch.load unpickles arbitrary objects,
# so only use trusted files. map_location lets a GPU-saved model load on a CPU runtime.
predictor = torch.load('/content/drive/My Drive/distilbert_demo_emotions.bin', map_location=device)
predictor.to(device)
# Switch to evaluation mode (disables dropout).
predictor.eval()

In [None]:
text = 'F1 is the greatest sport in the world'
# Named `encoded` rather than `input` so the builtin input() is not shadowed.
encoded = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            # Request truncation explicitly instead of relying on the implicit fallback.
            truncation=True,
            return_token_type_ids=True
        )
# Wrap each sequence in a list so the tensors form a batch of one.
ids = torch.tensor([encoded['input_ids']], dtype=torch.long)
mask = torch.tensor([encoded['attention_mask']], dtype=torch.long)
token_type_ids = torch.tensor([encoded["token_type_ids"]], dtype=torch.long)

In [None]:
import time

In [None]:
ids = ids.to(device, dtype = torch.long)
mask = mask.to(device, dtype = torch.long)
token_type_ids = token_type_ids.to(device, dtype = torch.long)

# Wall-clock timestamps only resolve to one second, which cannot measure a single
# forward pass; use a high-resolution counter instead. On CUDA, synchronize before
# and after so the asynchronous kernel launches are included in the measurement.
with torch.no_grad():
    if device == 'cuda':
        torch.cuda.synchronize()
    start = time.perf_counter()
    outputs = predictor(ids, mask, token_type_ids)
    if device == 'cuda':
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
print(f"Inference time: {elapsed * 1000:.2f} ms")

20:16:56
20:16:56


In [None]:
# Convert logits to per-label probabilities and show them as a nested Python list.
# Presumably ordered like the training labels (anger, fear, joy, sadness) — confirm for this checkpoint.
torch.sigmoid(outputs).cpu().detach().numpy().tolist()

[[0.006591061130166054,
  0.0009418923873454332,
  0.8798263072967529,
  0.002585391979664564]]