In [1]:
! pip install transformers==3.5.0

Collecting transformers==3.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/34/fb092588df61bf33f113ade030d1cbe74fb73a0353648f8dd938a223dce7/transformers-3.5.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 6.0MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.1MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.9MB

In [2]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import MobileBertTokenizer, MobileBertModel
import logging
logging.basicConfig(level=logging.ERROR)

In [3]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'


In [4]:
# test using cpu
device = 'cpu'

In [5]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [6]:
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              2
On-line CPU(s) list: 0,1
Thread(s) per core:  2
Core(s) per socket:  1
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               63
Model name:          Intel(R) Xeon(R) CPU @ 2.30GHz
Stepping:            0
CPU MHz:             2300.000
BogoMIPS:            4600.00
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            46080K
NUMA node0 CPU(s):   0,1
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs 

# **load and preprocess the dataframe**

In [7]:
df = pd.read_csv('/content/drive/My Drive/isear_processed_emotions.csv')

In [8]:
df = pd.get_dummies(df, columns=['emotions'])

In [9]:
df['labels'] = df[['emotions_anger', 'emotions_fear', 'emotions_joy', 'emotions_sadness']].values.astype('int').tolist()

In [10]:
df = df[(df['emotions_anger']==1) | (df['emotions_fear']==1) | (df['emotions_joy']==1) | (df['emotions_sadness']==1)]

In [11]:
len(df)

4381

In [12]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 6.0MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 25.7MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

In [13]:
import re
import contractions
import unicodedata
import string

In [14]:
def clean_text(text):
    """
    Function to clean text with basic steps - lower casing, dealing with contractions, remove html codes,
    strip whitespaces, social media cleaning (remove hashtags and URLS), remove punctuationns, using regular expressions.
 
    Parameters
    ----------
    text : str
            Text to be cleaned
    
    Returns
    -------
    text : str
            Cleaned text
    """
    # Lower casing
    text = text.lower()
    
    
    # Remove html codes
    text = re.sub(r"&amp;", " ", text)
    text = re.sub(r"&quot;", " ", text)
    text = re.sub(r"&#39;", " ", text)
    text = re.sub(r"&gt;", " ", text)
    text = re.sub(r"&lt;", " ", text)
    
    # Strips (removes) whitespaces
    text = text.strip(' ')
    
    ################ Social media cleaning ############
    
    # Remove hashtags (Regex @[A-Za-z0-9]+ represents mentions and #[A-Za-z0-9]+ represents hashtags. )
    text = re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text)
    
    # Remove URLS (Regex \w+:\/\/\S+ matches all the URLs starting with http:// or https:// and replacing it with space.)
    text = re.sub("(\w+:\/\/\S+)", " ", text)
    text = re.sub(r'http\S+', ' ', text)
    
     # remove old style retweet text "RT"
    text = re.sub(r'^RT[\s]+', '', text)
    # remove accents
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # remove @users
    text = re.sub(r'@[\w]*', '', text)
    # remove Reddit channel reference /r
    text = re.sub(r'r/', '', text)
    
    # remove reddit username
    text = re.sub(r'u/[\w]*', '', text)
    # remove '&gt;' like notations
    text = re.sub('&\W*\w*\W*;', ' ', text)
    # remove hashtags
    text = re.sub(r'#[\w]*', '', text)
    ###################################################
    
    # Dealing with contractions
    text = contractions.fix(text)
    
    text = re.sub(r"what\'s", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can\'t", "can not ", text)
    text = re.sub(r"n\'t", " not ", text)
    text = re.sub(r"\'t", " not", text )
    text = re.sub(r"i\'m", "i am ", text)
    text = re.sub(r"\'em'", " them ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    
    
    # Removes punctuations
    text = re.sub('['+string.punctuation+']', " ", text)
    
	# Removes non alphanumeric characters
    #text = re.sub('\W', ' ', text)
    
    # Removes non alphabetical characters
    text = re.sub('[^a-zA-Z]+', ' ', text)
    
    # Replaces all whitespaces by 1 whitespace
    text = re.sub('\s+', ' ', text)
    
    return text

In [15]:
df['text'] = df['text'].apply(clean_text)

In [16]:
df.head()

Unnamed: 0,text,emotions_anger,emotions_disgust,emotions_fear,emotions_guilt,emotions_joy,emotions_sadness,emotions_shame,labels
0,during the period of falling in love each time...,0,0,0,0,1,0,0,"[0, 0, 1, 0]"
1,when i was involved in a traffic accident,0,0,1,0,0,0,0,"[0, 1, 0, 0]"
2,when i was driving home after several days of ...,1,0,0,0,0,0,0,"[1, 0, 0, 0]"
3,when i lost the person who meant the most to me,0,0,0,0,0,1,0,"[0, 0, 0, 1]"
7,when i got a letter offering me the summer job...,0,0,0,0,1,0,0,"[0, 0, 1, 0]"


# **load model**

In [17]:
class MobileBERTClass(torch.nn.Module):
    def __init__(self):
        super(MobileBERTClass, self).__init__()
        self.l1 = MobileBertModel.from_pretrained("google/mobilebert-uncased")
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(512, 4)
    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [20]:
predictor = MobileBERTClass()

In [21]:
predictor.load_state_dict(torch.load('/content/drive/My Drive/mobileBert_demo_emotions_state_dict_11_14_0'), strict=False)
predictor.to(device)
predictor.eval()

MobileBERTClass(
  (l1): MobileBertModel(
    (embeddings): MobileBertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (embedding_transformation): Linear(in_features=384, out_features=512, bias=True)
      (LayerNorm): NoNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): MobileBertEncoder(
      (layer): ModuleList(
        (0): MobileBertLayer(
          (attention): MobileBertAttention(
            (self): MobileBertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): MobileBertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=T

# **save state dict**

In [None]:
torch.save(predictor.state_dict(), '/content/drive/My Drive/mobileBert_demo_emotions_state_dict_11_11')


In [None]:
demo_model  = MobileBERTClass()

In [None]:
demo_model.load_state_dict(torch.load('/content/drive/My Drive/mobileBert_demo_emotions_state_dict_11_11'), strict=False)

<All keys matched successfully>

# **param's size which is not the entire model size**

In [None]:
para = sum([np.prod(list(p.size())) for p in predictor.parameters()])

In [None]:
type_size = 4 # para type is float which is 4Byte
print('Model {} : params: {:4f}M'.format(predictor._get_name(), para * type_size / 1000 / 1000))

Model DistilBERTClass : params: 267.826192M


# **inference**

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [None]:
MAX_LEN = 100

In [None]:
pred = []
from tqdm import tqdm
sample = df.text.values[:10].tolist()


In [None]:
np.squeeze(df.text.values[:10])

array(['during the period of falling in love each time that we met and a especially when we had not met for a long time ',
       'when i was involved in a traffic accident ',
       'when i was driving home after several days of hard work there a was a motorist ahead of me who was driving at km hour and a refused despite his low speeed to let me overtake ',
       'when i lost the person who meant the most to me ',
       'when i got a letter offering me the summer job that i had applied a for ',
       'when i was going home alone one night in paris and a man came up a behind me and asked me if i was not afraid to be out alone so a late at night ',
       'when i was talking to him at a party for the first time in a long while and a friend came and interrupted us and he left ',
       'when my friends did not ask me to go to a new year party a with them ',
       ' on days when i feel close to my partner and other friends a when i feel at peace with myself and also experience a close

In [None]:
for text in sample:
  input = tokenizer.encode_plus(
              text,
              None,
              add_special_tokens=True,
              max_length=MAX_LEN,
              pad_to_max_length=True,
              return_token_type_ids=True
          )
  ids = torch.tensor([input['input_ids']], dtype=torch.long)
  mask = torch.tensor([input['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([input["token_type_ids"]], dtype=torch.long)
  # to device
  ids = ids.to(device, dtype = torch.long)
  mask = mask.to(device, dtype = torch.long)
  token_type_ids = token_type_ids.to(device, dtype = torch.long)

  output = demo_model(ids, mask, token_type_ids)
  print(torch.sigmoid(output).detach().numpy()[0].tolist())

[0.3786868155002594, 0.07880402356386185, 0.07205064594745636, 0.6136319041252136]
[0.02012174017727375, 0.9800737500190735, 0.0045004128478467464, 0.01189911738038063]
[0.6096121072769165, 0.46095141768455505, 0.004434571601450443, 0.0005510497721843421]
[0.17321108281612396, 0.003290484193712473, 0.018030032515525818, 0.694861114025116]
[0.23175214231014252, 0.1766984909772873, 0.47835275530815125, 0.14472025632858276]
[0.003492063842713833, 0.999444305896759, 0.0005993021768517792, 0.00016744121967349201]
[0.9701141119003296, 0.0012691541342064738, 0.0031542342621833086, 0.00466201500967145]
[0.9570013880729675, 0.0012602816568687558, 0.0008540928247384727, 0.07292445003986359]
[0.07093855738639832, 0.004388495348393917, 0.11629686504602432, 0.042725689709186554]
[0.012103703804314137, 0.9679816961288452, 0.04161408916115761, 0.32444265484809875]


In [None]:
sample

['during the period of falling in love each time that we met and a especially when we had not met for a long time ',
 'when i was involved in a traffic accident ',
 'when i was driving home after several days of hard work there a was a motorist ahead of me who was driving at km hour and a refused despite his low speeed to let me overtake ',
 'when i lost the person who meant the most to me ',
 'when i got a letter offering me the summer job that i had applied a for ',
 'when i was going home alone one night in paris and a man came up a behind me and asked me if i was not afraid to be out alone so a late at night ',
 'when i was talking to him at a party for the first time in a long while and a friend came and interrupted us and he left ',
 'when my friends did not ask me to go to a new year party a with them ',
 ' on days when i feel close to my partner and other friends a when i feel at peace with myself and also experience a close a contact with people whom i regard greatly ',
 'ever

# **make tensors and predict**

In [22]:
tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased", truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [23]:
MAX_LEN = 100

In [24]:
predictor = torch.quantization.quantize_dynamic(predictor, {torch.nn.Linear}, dtype=torch.qint8)

In [25]:
pred = []
from tqdm import tqdm
for text in tqdm(df.text.values):
  input = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
  ids = torch.tensor([input['input_ids']], dtype=torch.long)
  mask = torch.tensor([input['attention_mask']], dtype=torch.long)
  token_type_ids = torch.tensor([input["token_type_ids"]], dtype=torch.long)
  # to device
  ids = ids.to(device, dtype = torch.long)
  mask = mask.to(device, dtype = torch.long)
  token_type_ids = token_type_ids.to(device, dtype = torch.long)

  output = predictor(ids, mask, token_type_ids)
  pred.append(torch.sigmoid(output).cpu().detach().numpy().tolist())

  0%|          | 0/4381 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4381/4381 [10:14<00:00,  7.13it/s]


In [26]:
np.array(pred).shape

(4381, 1, 4)

In [27]:
pred_y = np.array(pred)

In [28]:
pred_y = pred_y.squeeze(axis=1)

In [29]:
pred_y.shape

(4381, 4)

In [30]:
pred_y

array([[0.17917693, 0.03613916, 0.20342577, 0.50775707],
       [0.09226484, 0.80423224, 0.00808373, 0.07934088],
       [0.24559473, 0.75554001, 0.00633112, 0.01405854],
       ...,
       [0.08061559, 0.93782622, 0.00199501, 0.00320755],
       [0.18536499, 0.00266788, 0.00181693, 0.95006055],
       [0.37422866, 0.42213547, 0.01259218, 0.08196951]])

In [31]:
real_y = df.labels.values.tolist()

In [32]:
real_y = np.array(real_y)

In [33]:
real_y = real_y.astype('int')

In [34]:
for i in range(len(pred_y)):
  for j in range(4):
    pred_y[i, j] =1 if pred_y[i, j] >= 0.5 else 0


In [35]:
pred_y = pred_y.astype('int')

# **overall metrics**

In [36]:
precision_score(real_y, pred_y, average='macro')

0.8757398003332307

In [37]:
recall_score(real_y, pred_y, average='macro')

0.7036648601288561

In [38]:
f1_score(real_y, pred_y, average='macro')

0.7676157660392859

# **precision score**

In [39]:
precision = [precision_score(real_y[:, 0], pred_y[:, 0]),
             precision_score(real_y[:, 1], pred_y[:, 1]),
             precision_score(real_y[:, 2], pred_y[:, 2]),
             precision_score(real_y[:, 3], pred_y[:, 3]),
             precision_score(real_y, pred_y, average='macro'),
             precision_score(real_y, pred_y, average='micro'),
             precision_score(real_y, pred_y, average='weighted')]

# **recall score**

In [40]:
recall = [recall_score(real_y[:, 0], pred_y[:, 0]),
          recall_score(real_y[:, 1], pred_y[:, 1]),
          recall_score(real_y[:, 2], pred_y[:, 2]),
          recall_score(real_y[:, 3], pred_y[:, 3]),
          recall_score(real_y, pred_y, average='macro'),
          recall_score(real_y, pred_y, average='micro'),
          recall_score(real_y, pred_y, average='weighted')]

# **f1 score**

In [41]:
f1 = [f1_score(real_y[:, 0], pred_y[:, 0]),
      f1_score(real_y[:, 1], pred_y[:, 1]),
      f1_score(real_y[:, 2], pred_y[:, 2]),
      f1_score(real_y[:, 3], pred_y[:, 3]),
      f1_score(real_y, pred_y, average='macro'),
      f1_score(real_y, pred_y, average='micro'),
      f1_score(real_y, pred_y, average='weighted')]

# **overall**

In [42]:
metric = pd.DataFrame([precision, recall, f1], columns=['anger', 'fear', 'joy', 'sadness', 'macro', 'micro', 'weighted'], index=['precision', 'recall','f1'])

In [43]:
metric

Unnamed: 0,anger,fear,joy,sadness,macro,micro,weighted
precision,0.715632,0.920173,0.972222,0.894932,0.87574,0.851422,0.875686
recall,0.831204,0.778995,0.543876,0.660584,0.703665,0.703721,0.703721
f1,0.769101,0.843719,0.697538,0.760105,0.767616,0.770557,0.76763
