# **Install and import nescessory package**

In [1]:
!pip install transformers==3.5.0

Collecting transformers==3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 5.9MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 36.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.5MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl 

In [2]:
# Importing stock ml libraries
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import MobileBertTokenizer, MobileBertModel, MobileBertForPreTraining
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
device

'cuda'

# **try to figure out the structure**

In [None]:
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', truncation=True, do_lower_case=True )
inputs = tokenizer.encode_plus(
            'I love lavender I love lavender I love lavender I love lavender',
            None,
            add_special_tokens=True,
            max_length=10,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
ids

[101, 1045, 2293, 20920, 1045, 2293, 20920, 1045, 2293, 102]

In [None]:
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
model = MobileBertModel.from_pretrained('google/mobilebert-uncased')
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

In [None]:
inputs

{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
outputs[0]

tensor([[[-2.5655e+07,  9.8468e+04,  1.6557e+05,  ..., -1.6260e+06,
           1.2349e+06,  2.6710e+04],
         [ 1.9210e-01,  6.7220e-01, -8.1362e-01,  ...,  5.5030e-02,
           1.4415e+00,  4.5810e+00],
         [ 9.1282e-01,  1.9443e+00,  1.5657e+00,  ..., -1.2405e-01,
          -2.7288e+00,  2.7489e+00],
         ...,
         [ 1.5894e+00,  5.9103e-01,  1.9070e+00,  ...,  2.7961e+00,
          -2.6210e+00,  3.7704e+00],
         [ 1.5775e+00,  3.8555e+00, -6.2034e-01,  ...,  2.9500e+00,
          -2.2804e+00,  2.9576e+00],
         [ 9.0074e-01,  8.5517e-01,  1.1304e+00,  ...,  1.1470e+00,
          -1.2494e+00,  6.7297e-01]]], grad_fn=<AddBackward0>)

we can see the output tensors shape is (512, )

# **Load training dataset**

In [4]:
data = pd.read_csv('/content/drive/My Drive/goemotions_aug_dairai_train_cleaned.csv')
data.drop(['Unnamed: 0', 'datasource', 'augmented'], inplace=True, axis=1)
new_df = pd.DataFrame()
new_df['text'] = data['cleaned_processed']
new_df['labels'] = data[['anger', 'fear', 'joy', 'sadness']].values.astype('int').tolist()

In [5]:
new_df.head()

Unnamed: 0,text,labels
0,i feel that some korea guy are handsome and so...,"[0, 0, 1, 0]"
1,i put my pen to paper and made a list of thing...,"[0, 0, 0, 0]"
2,i wish i only had to feel the pain of the pett...,"[1, 0, 0, 0]"
3,i feel passionate about this journey and stand...,"[0, 0, 1, 0]"
4,i feel like i have convinced myself of these f...,"[0, 0, 1, 0]"


In [6]:
new_df['len'] = new_df['text'].apply(lambda x: len(x.split(' ')))

In [7]:
new_df['len'].sort_values()

308960      1
198631      1
304858      1
164110      1
219996      1
         ... 
110501     80
224855     82
225137    101
78205     102
78768     180
Name: len, Length: 460551, dtype: int64

In [8]:
demo_df = new_df.sample(n=100000, random_state=42)

In [9]:
demo_df.head()

Unnamed: 0,text,labels,len
340232,but thank you anyway,"[0, 0, 1, 0]",4
136369,i could still feel the effect of the maple nut...,"[0, 0, 1, 0]",18
41752,i feel energetic and calm,"[0, 0, 1, 0]",5
162511,i feel more trusting of people,"[0, 0, 0, 0]",6
281481,i would not think about him sigh kinda make me...,"[0, 0, 0, 1]",26


In [10]:
MAX_LEN = 100
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
EPOCHS = 1
LEARNING_RATE = 3e-05

In [11]:

tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased', truncation=True, do_lower_case=True )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [12]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# **make samll dataloader to test the model**

In [None]:
train_size = 0.7
train_data=demo_df.sample(frac=train_size,random_state=42)
test_data=demo_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(demo_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (100000, 3)
TRAIN Dataset: (70000, 3)
TEST Dataset: (30000, 3)


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# **define MobileBertClass to finetune**

In [13]:
class MobileBERTClass(torch.nn.Module):
    def __init__(self):
        super(MobileBERTClass, self).__init__()
        self.l1 = MobileBertModel.from_pretrained("google/mobilebert-uncased")
        # self.pre_classifier = torch.nn.Linear(512, 512)
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(512, 4)
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        # pooler = self.pre_classifier(pooler)
        # pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [None]:
model = MobileBERTClass()
model.to(device)

MobileBERTClass(
  (l1): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0): MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=T

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
# optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
optimizer = torch.optim.Adam(params = model.parameters(), lr=1e-04)

# **define training precess**

In [None]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%50==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)
    

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.17794927954673767


50it [00:18,  2.70it/s]

Epoch: 0, Loss:  0.11586861312389374


100it [00:37,  2.70it/s]

Epoch: 0, Loss:  0.15969204902648926


150it [00:55,  2.71it/s]

Epoch: 0, Loss:  0.19672667980194092


200it [01:14,  2.71it/s]

Epoch: 0, Loss:  0.0886101946234703


250it [01:32,  2.72it/s]

Epoch: 0, Loss:  0.09773728251457214


300it [01:50,  2.71it/s]

Epoch: 0, Loss:  0.22308534383773804


350it [02:09,  2.66it/s]

Epoch: 0, Loss:  0.13100486993789673


400it [02:27,  2.71it/s]

Epoch: 0, Loss:  0.09766919910907745


450it [02:46,  2.70it/s]

Epoch: 0, Loss:  0.15183742344379425


500it [03:04,  2.70it/s]

Epoch: 0, Loss:  0.12046046555042267


550it [03:23,  2.72it/s]

Epoch: 0, Loss:  0.12204181402921677


600it [03:41,  2.72it/s]

Epoch: 0, Loss:  0.11064457893371582


650it [04:00,  2.74it/s]

Epoch: 0, Loss:  0.08502401411533356


700it [04:18,  2.72it/s]

Epoch: 0, Loss:  0.1768840253353119


750it [04:36,  2.73it/s]

Epoch: 0, Loss:  0.15921851992607117


800it [04:55,  2.71it/s]

Epoch: 0, Loss:  0.11402826011180878


850it [05:13,  2.70it/s]

Epoch: 0, Loss:  0.09613583236932755


900it [05:32,  2.72it/s]

Epoch: 0, Loss:  0.12570920586585999


950it [05:50,  2.75it/s]

Epoch: 0, Loss:  0.09753263741731644


1000it [06:09,  2.71it/s]

Epoch: 0, Loss:  0.10480339825153351


1050it [06:27,  2.68it/s]

Epoch: 0, Loss:  0.11800608783960342


1100it [06:45,  2.68it/s]

Epoch: 0, Loss:  0.18500256538391113


1150it [07:04,  2.73it/s]

Epoch: 0, Loss:  0.11705957353115082


1200it [07:22,  2.69it/s]

Epoch: 0, Loss:  0.1059078797698021


1250it [07:41,  2.71it/s]

Epoch: 0, Loss:  0.04662709683179855


1300it [07:59,  2.75it/s]

Epoch: 0, Loss:  0.07814469188451767


1350it [08:17,  2.74it/s]

Epoch: 0, Loss:  0.08963841199874878


1400it [08:36,  2.76it/s]

Epoch: 0, Loss:  0.16308969259262085


1450it [08:54,  2.73it/s]

Epoch: 0, Loss:  0.14520618319511414


1500it [09:12,  2.72it/s]

Epoch: 0, Loss:  0.06346084922552109


1550it [09:31,  2.75it/s]

Epoch: 0, Loss:  0.12371666729450226


1600it [09:49,  2.72it/s]

Epoch: 0, Loss:  0.07953563332557678


1650it [10:07,  2.74it/s]

Epoch: 0, Loss:  0.0842801183462143


1700it [10:25,  2.72it/s]

Epoch: 0, Loss:  0.07086322456598282


1750it [10:44,  2.73it/s]

Epoch: 0, Loss:  0.16929861903190613


1800it [11:02,  2.76it/s]

Epoch: 0, Loss:  0.10164827108383179


1850it [11:20,  2.74it/s]

Epoch: 0, Loss:  0.07883759588003159


1900it [11:39,  2.72it/s]

Epoch: 0, Loss:  0.13419486582279205


1950it [11:57,  2.72it/s]

Epoch: 0, Loss:  0.09115771949291229


2000it [12:15,  2.74it/s]

Epoch: 0, Loss:  0.19094720482826233


2050it [12:34,  2.69it/s]

Epoch: 0, Loss:  0.07075570523738861


2100it [12:52,  2.73it/s]

Epoch: 0, Loss:  0.209250807762146


2150it [13:10,  2.73it/s]

Epoch: 0, Loss:  0.17611758410930634


2188it [13:24,  2.72it/s]


In [None]:
output_model_file = '/content/drive/My Drive/Mobilebert_demo_emotions_11_11_epoch6.bin'
torch.save(model, output_model_file)

In [None]:
!nvidia-smi

Wed Nov 11 21:32:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    33W /  70W |   4571MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Validation**

In [None]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [None]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

8636it [07:42, 18.69it/s]


In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.8419570803025368
Hamming Loss = 0.041993268917598525


**evalution I think it fails, an don't know why**


1.   /
2.   epoch 2: 
          Hamming Score = 0.34155
          Hamming Loss = 0.178075
3.   epoch 3:
          Hamming Score = 0.3859166666666667
          Hamming Loss = 0.17255833333333334
4.   epoch 4:
          Hamming Score = 0.4126166666666667
          Hamming Loss = 0.16695
5.   epoch 5 (using larger learning rate 1e-4):
          Hamming Score = 0.8067277777777778
          Hamming Loss = 0.05218333333333333
6.   epoch 5 (using larger learning rate 1e-4):
          Hamming Score = 0.8290722222222221
          Hamming Loss = 0.045875

I think we are ready to train on the whole dataset

# **train on the whole data set**

In [14]:
train_size = 0.7
train_data=new_df.sample(frac=train_size,random_state=42)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (460551, 3)
TRAIN Dataset: (322386, 3)
TEST Dataset: (138165, 3)


In [15]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%200==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)


0it [00:00, ?it/s]

Epoch: 0, Loss:  0.1490464210510254


200it [01:13,  2.68it/s]

Epoch: 0, Loss:  0.06480734795331955


400it [02:26,  2.75it/s]

Epoch: 0, Loss:  0.03098204918205738


600it [03:37,  2.84it/s]

Epoch: 0, Loss:  0.0641103982925415


800it [04:48,  2.83it/s]

Epoch: 0, Loss:  0.05647989735007286


1000it [05:57,  2.89it/s]

Epoch: 0, Loss:  0.15241219103336334


1200it [07:06,  2.89it/s]

Epoch: 0, Loss:  0.10372176766395569


1400it [08:15,  2.92it/s]

Epoch: 0, Loss:  0.08184564113616943


1600it [09:24,  2.92it/s]

Epoch: 0, Loss:  0.115203857421875


1800it [10:32,  2.88it/s]

Epoch: 0, Loss:  0.10861615836620331


2000it [11:41,  2.94it/s]

Epoch: 0, Loss:  0.07021944224834442


2200it [12:50,  2.88it/s]

Epoch: 0, Loss:  0.06903264671564102


2400it [13:59,  2.95it/s]

Epoch: 0, Loss:  0.10981154441833496


2600it [15:08,  2.91it/s]

Epoch: 0, Loss:  0.09306075423955917


2800it [16:18,  2.77it/s]

Epoch: 0, Loss:  0.1314084231853485


3000it [17:29,  2.84it/s]

Epoch: 0, Loss:  0.06153397262096405


3200it [18:41,  2.78it/s]

Epoch: 0, Loss:  0.06734946370124817


3400it [19:53,  2.72it/s]

Epoch: 0, Loss:  0.18104414641857147


3600it [21:06,  2.74it/s]

Epoch: 0, Loss:  0.06750842928886414


3800it [22:20,  2.71it/s]

Epoch: 0, Loss:  0.1145903468132019


4000it [23:35,  2.70it/s]

Epoch: 0, Loss:  0.10192157328128815


4200it [24:49,  2.73it/s]

Epoch: 0, Loss:  0.04978117346763611


4400it [26:03,  2.71it/s]

Epoch: 0, Loss:  0.05455060675740242


4600it [27:17,  2.72it/s]

Epoch: 0, Loss:  0.07955871522426605


4800it [28:31,  2.71it/s]

Epoch: 0, Loss:  0.08035440742969513


5000it [29:45,  2.70it/s]

Epoch: 0, Loss:  0.12455722689628601


5200it [30:59,  2.71it/s]

Epoch: 0, Loss:  0.10044044256210327


5400it [32:13,  2.69it/s]

Epoch: 0, Loss:  0.06587889790534973


5600it [33:26,  2.61it/s]

Epoch: 0, Loss:  0.2338360846042633


5800it [34:39,  2.74it/s]

Epoch: 0, Loss:  0.08941957354545593


6000it [35:51,  2.79it/s]

Epoch: 0, Loss:  0.09405875951051712


6200it [37:03,  2.83it/s]

Epoch: 0, Loss:  0.087922602891922


6400it [38:14,  2.79it/s]

Epoch: 0, Loss:  0.08452308177947998


6600it [39:24,  2.84it/s]

Epoch: 0, Loss:  0.1286415308713913


6800it [40:35,  2.83it/s]

Epoch: 0, Loss:  0.12175385653972626


7000it [41:45,  2.84it/s]

Epoch: 0, Loss:  0.08216293156147003


7200it [42:56,  2.84it/s]

Epoch: 0, Loss:  0.09137876331806183


7400it [44:07,  2.80it/s]

Epoch: 0, Loss:  0.08030552417039871


7600it [45:17,  2.84it/s]

Epoch: 0, Loss:  0.12752553820610046


7800it [46:28,  2.78it/s]

Epoch: 0, Loss:  0.13048028945922852


8000it [47:39,  2.81it/s]

Epoch: 0, Loss:  0.12542998790740967


8200it [48:52,  2.78it/s]

Epoch: 0, Loss:  0.0580349862575531


8400it [50:05,  2.75it/s]

Epoch: 0, Loss:  0.04822692275047302


8600it [51:17,  2.73it/s]

Epoch: 0, Loss:  0.11534909904003143


8800it [52:30,  2.79it/s]

Epoch: 0, Loss:  0.0868840217590332


9000it [53:42,  2.77it/s]

Epoch: 0, Loss:  0.1687932312488556


9200it [54:55,  2.76it/s]

Epoch: 0, Loss:  0.08542308211326599


9400it [56:07,  2.75it/s]

Epoch: 0, Loss:  0.06474511325359344


9600it [57:19,  2.79it/s]

Epoch: 0, Loss:  0.08753658831119537


9800it [58:32,  2.78it/s]

Epoch: 0, Loss:  0.04704207926988602


10000it [59:44,  2.78it/s]

Epoch: 0, Loss:  0.08049255609512329


10075it [1:00:11,  2.79it/s]


In [None]:
output_model_file = '/content/drive/My Drive/Mobilebert_demo_emotions_11_11_whole_dataset_epoch1.bin'
torch.save(model, output_model_file)

*   First epoch
        Hamming Score = 0.8419570803025368
        Hamming Loss = 0.041993268917598525

# **Evaluation on isear dataset**

In [None]:
df = pd.read_csv('/content/drive/My Drive/isear_processed_emotions.csv')
df = pd.get_dummies(df, columns=['emotions'])
df['labels'] = df[['emotions_anger', 'emotions_fear', 'emotions_joy', 'emotions_sadness']].values.astype('int').tolist()
df = df[(df['emotions_anger']==1) | (df['emotions_fear']==1) | (df['emotions_joy']==1) | (df['emotions_sadness']==1)]


In [None]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 15.7MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 28.1MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
 

In [None]:
import re
import contractions
import unicodedata
import string

In [None]:
def clean_text(text):
    """
    Function to clean text with basic steps - lower casing, dealing with contractions, remove html codes,
    strip whitespaces, social media cleaning (remove hashtags and URLS), remove punctuationns, using regular expressions.
 
    Parameters
    ----------
    text : str
            Text to be cleaned
    
    Returns
    -------
    text : str
            Cleaned text
    """
    # Lower casing
    text = text.lower()
    
    
    # Remove html codes
    text = re.sub(r"&amp;", " ", text)
    text = re.sub(r"&quot;", " ", text)
    text = re.sub(r"&#39;", " ", text)
    text = re.sub(r"&gt;", " ", text)
    text = re.sub(r"&lt;", " ", text)
    
    # Strips (removes) whitespaces
    text = text.strip(' ')
    
    ################ Social media cleaning ############
    
    # Remove hashtags (Regex @[A-Za-z0-9]+ represents mentions and #[A-Za-z0-9]+ represents hashtags. )
    text = re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text)
    
    # Remove URLS (Regex \w+:\/\/\S+ matches all the URLs starting with http:// or https:// and replacing it with space.)
    text = re.sub("(\w+:\/\/\S+)", " ", text)
    text = re.sub(r'http\S+', ' ', text)
    
     # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # remove @users
    text = re.sub(r'@[\w]*', '', text)
    # remove Reddit channel reference /r
    text = re.sub(r'r/', '', text)
    
    # remove reddit username
    text = re.sub(r'u/[\w]*', '', text)
    # remove '&gt;' like notations
    text = re.sub('&\W*\w*\W*;', ' ', text)
    # remove hashtags
    text = re.sub(r'#[\w]*', '', text)
    ###################################################
    
    # Dealing with contractions
    text = contractions.fix(text)
    
    text = re.sub(r"what\'s", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can\'t", "can not ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"\'t", " not", text )
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"\'em'", " them ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    
    
    # Removes punctuations
    text = re.sub('['+string.punctuation+']', " ", text)
    
	# Removes non alphanumeric characters
    #text = re.sub('\W', ' ', text)
    
    # Removes non alphabetical characters
    text = re.sub('[^a-zA-Z]+', ' ', text)
    
    # Replaces all whitespaces by 1 whitespace
    text = re.sub('\s+', ' ', text)
    
    return text

In [None]:
df['text'] = df['text'].apply(clean_text)

In [None]:
model.eval()
pred = []
from tqdm import tqdm
for text in tqdm(df.text.values):
  input = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
  ids = torch.tensor([input['input_ids']], dtype=torch.long)
  mask = torch.tensor([input['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([input["token_type_ids"]], dtype=torch.long)
  # to device
  ids = ids.to(device, dtype = torch.long)
  mask = mask.to(device, dtype = torch.long)
  token_type_ids = token_type_ids.to(device, dtype = torch.long)

  output = model(ids, mask, token_type_ids)
  pred.append(torch.sigmoid(output).cpu().detach().numpy().tolist())


  0%|          | 0/4381 [00:00<?, ?it/s][A
  0%|          | 2/4381 [00:00<05:01, 14.54it/s][A
  0%|          | 4/4381 [00:00<04:49, 15.10it/s][A
  0%|          | 6/4381 [00:00<04:40, 15.59it/s][A
  0%|          | 8/4381 [00:00<04:35, 15.89it/s][A
  0%|          | 10/4381 [00:00<04:36, 15.80it/s][A
  0%|          | 12/4381 [00:00<04:31, 16.11it/s][A
  0%|          | 14/4381 [00:00<04:30, 16.17it/s][A
  0%|          | 16/4381 [00:00<04:31, 16.07it/s][A
  0%|          | 18/4381 [00:01<04:31, 16.06it/s][A
  0%|          | 20/4381 [00:01<04:30, 16.11it/s][A
  1%|          | 22/4381 [00:01<04:29, 16.20it/s][A
  1%|          | 24/4381 [00:01<04:26, 16.36it/s][A
  1%|          | 26/4381 [00:01<04:24, 16.47it/s][A
  1%|          | 28/4381 [00:01<04:29, 16.15it/s][A
  1%|          | 30/4381 [00:01<04:27, 16.30it/s][A
  1%|          | 32/4381 [00:01<04:21, 16.60it/s][A
  1%|          | 34/4381 [00:02<04:25, 16.36it/s][A
  1%|          | 36/4381 [00:02<04:24, 16.43it/s][A
  1%|

In [None]:
pred_y = np.array(pred)
pred_y = pred_y.squeeze(axis=1)
for i in range(len(pred_y)):
  for j in range(4):
    pred_y[i, j] =1 if pred_y[i, j] >= 0.5 else 0
pred_y = pred_y.astype('int')

real_y = df.labels.values.tolist()
real_y = np.array(real_y)
real_y = real_y.astype('int')

**overall metrics**

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
precision_score(real_y, pred_y, average='macro')

0.7949025458427011

In [None]:
recall_score(real_y, pred_y, average='macro')

0.5714911446088786

In [None]:
f1_score(real_y, pred_y, average='macro')

0.601545378492069

**precision score**

In [None]:
precision = [precision_score(real_y[:, 0], pred_y[:, 0]),
             precision_score(real_y[:, 1], pred_y[:, 1]),
             precision_score(real_y[:, 2], pred_y[:, 2]),
             precision_score(real_y[:, 3], pred_y[:, 3]),
             precision_score(real_y, pred_y, average='macro'),
             precision_score(real_y, pred_y, average='micro'),
             precision_score(real_y, pred_y, average='weighted')]

**recall score**

In [None]:
recall = [recall_score(real_y[:, 0], pred_y[:, 0]),
          recall_score(real_y[:, 1], pred_y[:, 1]),
          recall_score(real_y[:, 2], pred_y[:, 2]),
          recall_score(real_y[:, 3], pred_y[:, 3]),
          recall_score(real_y, pred_y, average='macro'),
          recall_score(real_y, pred_y, average='micro'),
          recall_score(real_y, pred_y, average='weighted')]

**f1 score**

In [None]:
f1 = [f1_score(real_y[:, 0], pred_y[:, 0]),
      f1_score(real_y[:, 1], pred_y[:, 1]),
      f1_score(real_y[:, 2], pred_y[:, 2]),
      f1_score(real_y[:, 3], pred_y[:, 3]),
      f1_score(real_y, pred_y, average='macro'),
      f1_score(real_y, pred_y, average='micro'),
      f1_score(real_y, pred_y, average='weighted')]

**overall**

In [None]:
metric = pd.DataFrame([precision, recall, f1], columns=['anger', 'fear', 'joy', 'sadness', 'macro', 'micro', 'weighted'], index=['precision', 'recall','f1'])

In [None]:
metric

Unnamed: 0,anger,fear,joy,sadness,macro,micro,weighted
precision,0.443649,0.884309,0.922917,0.928736,0.794903,0.641558,0.794824
recall,0.905109,0.607306,0.404936,0.368613,0.571491,0.571559,0.571559
f1,0.595438,0.720087,0.562897,0.52776,0.601545,0.604539,0.601536


# **load the model and train on the whole data set fro some more epochs**

In [17]:
train_data=new_df
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }
training_loader = DataLoader(training_set, **train_params)

the load time is longer than before...

In [18]:
class MobileBERTClass(torch.nn.Module):
    def __init__(self):
        super(MobileBERTClass, self).__init__()
        self.l1 = MobileBertModel.from_pretrained("google/mobilebert-uncased")
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(512, 4)
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = MobileBERTClass()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=560.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=146671951.0, style=ProgressStyle(descri…




In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/mobileBert_demo_emotions_state_dict_11_14_0'), strict=False)

In [None]:
model.to(device)

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%200==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

after the pretraining, we should use smaller learning rate to train on the training set.

in the same training stage, the performance is much worse than distilBert, which is not expectable

and also is oscillating.... it seems it doesn't converging

In [None]:
EPOCHS = 3
for epoch in range(EPOCHS):
    train(epoch)
    torch.save(model.state_dict(), f'/content/drive/My Drive/mobileBert_demo_emotions_state_dict_11_14_{epoch}')

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.07767188549041748


200it [01:07,  3.02it/s]

Epoch: 0, Loss:  0.0635480061173439


400it [02:13,  3.07it/s]

Epoch: 0, Loss:  0.05889982357621193


600it [03:20,  3.05it/s]

Epoch: 0, Loss:  0.056054823100566864


800it [04:27,  3.09it/s]

Epoch: 0, Loss:  0.043265461921691895


1000it [05:33,  3.01it/s]

Epoch: 0, Loss:  0.07708185911178589


1200it [06:40,  3.06it/s]

Epoch: 0, Loss:  0.03707611933350563


1400it [07:47,  2.99it/s]

Epoch: 0, Loss:  0.06189068406820297


1600it [08:55,  2.86it/s]

Epoch: 0, Loss:  0.08397328108549118


1800it [10:01,  3.09it/s]

Epoch: 0, Loss:  0.05474775657057762


2000it [11:09,  2.54it/s]

Epoch: 0, Loss:  0.06120752915740013


2200it [12:18,  2.91it/s]

Epoch: 0, Loss:  0.10535497218370438


2400it [13:26,  2.88it/s]

Epoch: 0, Loss:  0.025645602494478226


2600it [14:34,  3.10it/s]

Epoch: 0, Loss:  0.06604814529418945


2800it [15:41,  2.80it/s]

Epoch: 0, Loss:  0.055980484932661057


3000it [16:47,  2.87it/s]

Epoch: 0, Loss:  0.09529384970664978


3200it [17:56,  3.04it/s]

Epoch: 0, Loss:  0.0436236634850502


3400it [19:04,  3.00it/s]

Epoch: 0, Loss:  0.03342234343290329


3600it [20:13,  3.09it/s]

Epoch: 0, Loss:  0.1487358659505844


3800it [21:21,  2.83it/s]

Epoch: 0, Loss:  0.05567188560962677


4000it [22:29,  3.07it/s]

Epoch: 0, Loss:  0.05427462235093117


4200it [23:36,  3.00it/s]

Epoch: 0, Loss:  0.037816282361745834


4400it [24:45,  2.97it/s]

Epoch: 0, Loss:  0.05792933702468872


4600it [25:53,  3.13it/s]

Epoch: 0, Loss:  0.19238333404064178


4800it [26:59,  3.03it/s]

Epoch: 0, Loss:  0.09573774039745331


5000it [28:07,  2.98it/s]

Epoch: 0, Loss:  0.08343780040740967


5200it [29:15,  3.04it/s]

Epoch: 0, Loss:  0.07067771255970001


5400it [30:21,  3.10it/s]

Epoch: 0, Loss:  0.05585784465074539


5600it [31:28,  3.04it/s]

Epoch: 0, Loss:  0.1183716431260109


5800it [32:34,  3.11it/s]

Epoch: 0, Loss:  0.07357296347618103


6000it [33:42,  3.05it/s]

Epoch: 0, Loss:  0.12083947658538818


6200it [34:50,  3.03it/s]

Epoch: 0, Loss:  0.040204767137765884


6400it [35:58,  3.00it/s]

Epoch: 0, Loss:  0.03860725462436676


6600it [37:06,  2.85it/s]

Epoch: 0, Loss:  0.09296616911888123


6800it [38:12,  3.08it/s]

Epoch: 0, Loss:  0.1480240523815155


7000it [39:20,  3.09it/s]

Epoch: 0, Loss:  0.026508726179599762


7200it [40:27,  3.09it/s]

Epoch: 0, Loss:  0.07271841168403625


7400it [41:34,  3.14it/s]

Epoch: 0, Loss:  0.08297023177146912


7600it [42:40,  3.05it/s]

Epoch: 0, Loss:  0.09086933732032776


7800it [43:47,  3.12it/s]

Epoch: 0, Loss:  0.0609767884016037


8000it [44:53,  3.17it/s]

Epoch: 0, Loss:  0.040690939873456955


8200it [45:59,  2.87it/s]

Epoch: 0, Loss:  0.08915390819311142


8400it [47:05,  2.70it/s]

Epoch: 0, Loss:  0.07123054563999176


8600it [48:11,  3.00it/s]

Epoch: 0, Loss:  0.055954426527023315


8800it [49:17,  2.84it/s]

Epoch: 0, Loss:  0.052992574870586395


9000it [50:23,  3.07it/s]

Epoch: 0, Loss:  0.13166064023971558


9200it [51:30,  2.89it/s]

Epoch: 0, Loss:  0.04658003896474838


9400it [52:38,  3.10it/s]

Epoch: 0, Loss:  0.05435784533619881


9600it [53:44,  3.00it/s]

Epoch: 0, Loss:  0.0667671486735344


9800it [54:51,  3.10it/s]

Epoch: 0, Loss:  0.09951731562614441


10000it [55:57,  3.02it/s]

Epoch: 0, Loss:  0.04779081791639328


10200it [57:03,  3.06it/s]

Epoch: 0, Loss:  0.06297709792852402


10400it [58:10,  2.89it/s]

Epoch: 0, Loss:  0.04230578988790512


10600it [59:16,  3.16it/s]

Epoch: 0, Loss:  0.07063059508800507


10800it [1:00:23,  3.12it/s]

Epoch: 0, Loss:  0.04801899939775467


11000it [1:01:29,  3.18it/s]

Epoch: 0, Loss:  0.039434123784303665


11200it [1:02:35,  3.14it/s]

Epoch: 0, Loss:  0.059074558317661285


11400it [1:03:40,  3.19it/s]

Epoch: 0, Loss:  0.05979689955711365


11600it [1:04:46,  3.00it/s]

Epoch: 0, Loss:  0.09359181672334671


11800it [1:05:50,  2.94it/s]

Epoch: 0, Loss:  0.0760791078209877


12000it [1:06:54,  2.97it/s]

Epoch: 0, Loss:  0.05551666021347046


12200it [1:08:00,  3.10it/s]

Epoch: 0, Loss:  0.07187332957983017


12400it [1:09:06,  2.93it/s]

Epoch: 0, Loss:  0.051614295691251755


12600it [1:10:12,  3.04it/s]

Epoch: 0, Loss:  0.04371259734034538


12800it [1:11:18,  2.76it/s]

Epoch: 0, Loss:  0.030086476355791092


13000it [1:12:24,  2.82it/s]

Epoch: 0, Loss:  0.07054010778665543


13200it [1:13:30,  3.15it/s]

Epoch: 0, Loss:  0.052194900810718536


13400it [1:14:35,  3.16it/s]

Epoch: 0, Loss:  0.06830467283725739


13600it [1:15:40,  3.14it/s]

Epoch: 0, Loss:  0.05527697503566742


13800it [1:16:46,  3.14it/s]

Epoch: 0, Loss:  0.04817444086074829


14000it [1:17:52,  3.16it/s]

Epoch: 0, Loss:  0.10022753477096558


14200it [1:18:57,  2.93it/s]

Epoch: 0, Loss:  0.065208300948143


14393it [1:20:02,  3.00it/s]
0it [00:00, ?it/s]

Epoch: 1, Loss:  0.05708610638976097


200it [01:06,  3.10it/s]

Epoch: 1, Loss:  0.05123711749911308


400it [02:12,  2.98it/s]

Epoch: 1, Loss:  0.019720707088708878


600it [03:16,  3.08it/s]

Epoch: 1, Loss:  0.05841827392578125


800it [04:21,  3.20it/s]

Epoch: 1, Loss:  0.04863228276371956


1000it [05:25,  3.00it/s]

Epoch: 1, Loss:  0.10434792935848236


1200it [06:30,  3.07it/s]

Epoch: 1, Loss:  0.0466642752289772


1400it [07:34,  3.18it/s]

Epoch: 1, Loss:  0.07503263652324677


1600it [08:38,  3.20it/s]

Epoch: 1, Loss:  0.04445643723011017


1800it [09:43,  3.18it/s]

Epoch: 1, Loss:  0.05892457440495491


2000it [10:47,  3.20it/s]

Epoch: 1, Loss:  0.08406779170036316


2200it [11:54,  3.00it/s]

Epoch: 1, Loss:  0.026806792244315147


2400it [12:59,  2.89it/s]

Epoch: 1, Loss:  0.03803079202771187


2600it [14:04,  2.91it/s]

Epoch: 1, Loss:  0.04756658524274826


2800it [15:09,  3.00it/s]

Epoch: 1, Loss:  0.05638827756047249


3000it [16:14,  3.10it/s]

Epoch: 1, Loss:  0.15413600206375122


3200it [17:19,  2.83it/s]

Epoch: 1, Loss:  0.05773472413420677


3400it [18:24,  3.20it/s]

Epoch: 1, Loss:  0.03903431445360184


3422it [18:32,  3.01it/s]