In [None]:
# Install a pinned transformers release (3.0.2) into the runtime.
! pip install transformers==3.0.2

Collecting transformers==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 12.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 29.9MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 47.3MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64

In [None]:
# Pin `contractions` to the release the recorded run installed, so a fresh
# runtime reproduces the same text-cleaning behaviour.
!pip install contractions==0.0.25

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/00/92/a05b76a692ac08d470ae5c23873cf1c9a041532f1ee065e74b374f218306/contractions-0.0.25-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 13.9MB/s 
[?25hCollecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 24.6MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
 

In [None]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
from tqdm import tqdm
logging.basicConfig(level=logging.ERROR)
# clean text
import re
import contractions
import unicodedata
import string

In [None]:
# Display the installed PyTorch version string.
torch.__version__

'1.7.0+cu101'

In [None]:
from torch import cuda

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Print the runtime's CPU details via the shell's `lscpu`.
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              2
On-line CPU(s) list: 0,1
Thread(s) per core:  2
Core(s) per socket:  1
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.140
BogoMIPS:            4000.28
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0,1
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_si

# **load and preprocess the dataframe from isear**

In [None]:
# Read the processed ISEAR emotions CSV from a hard-coded Google Drive path.
# NOTE(review): presumably requires Drive to be mounted at /content/drive first;
# no mount call appears in the visible cells.
df = pd.read_csv('/content/drive/My Drive/isear_processed_emotions.csv')

In [None]:
# One-hot encode the 'emotions' column into 'emotions_<value>' columns.
# The source column is consumed, so re-running this cell on the result fails.
df = pd.get_dummies(df, columns=['emotions'])

In [None]:
# Pack the four one-hot columns into one int list per row, ordered [anger, fear, joy, sadness].
df['labels'] = df[['emotions_anger', 'emotions_fear', 'emotions_joy', 'emotions_sadness']].values.astype('int').tolist()

In [None]:
# Keep only the rows flagged with at least one of the four target emotions.
target_columns = ['emotions_anger', 'emotions_fear', 'emotions_joy', 'emotions_sadness']
df = df[df[target_columns].eq(1).any(axis=1)]

In [None]:
def clean_text(text):
    """
    Normalise raw text into lower-case, space-separated ASCII letters.

    Steps, in order: lower casing, HTML entity removal, social media cleaning
    (mentions, hashtags, URLs, retweet marker, Reddit references), accent
    stripping, contraction expansion, punctuation removal, removal of every
    non-alphabetical character, whitespace collapsing.

    Parameters
    ----------
    text : str
            Text to be cleaned

    Returns
    -------
    text : str
            Cleaned text. Runs of whitespace are collapsed to one space, but a
            single leading or trailing space is not stripped.
    """
    # Lower casing (every later pattern therefore only needs lower-case forms)
    text = text.lower()

    # Remove html codes
    for entity in ("&amp;", "&quot;", "&#39;", "&gt;", "&lt;"):
        text = text.replace(entity, " ")

    # Strips (removes) surrounding spaces
    text = text.strip(' ')

    ################ Social media cleaning ############

    # Remove mentions (@name) and hashtags (#tag)
    text = re.sub(r"(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", text)

    # Remove URLs: anything of the form scheme://..., then leftovers starting with "http"
    text = re.sub(r"(\w+://\S+)", " ", text)
    text = re.sub(r'http\S+', ' ', text)

    # Remove old style retweet marker. The text is already lower-cased, so the
    # marker has to be matched as "rt" (an upper-case "RT" pattern never matches).
    text = re.sub(r'^rt\s+', '', text)
    # Remove accents: decompose, then drop everything that is not ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # Remove any remaining @users
    text = re.sub(r'@\w*', '', text)
    # Remove Reddit channel prefix "r/". The word boundary keeps ordinary words
    # such as "her/his" intact.
    text = re.sub(r'\br/', '', text)
    # Remove Reddit usernames "u/name". The word boundary keeps "you/me" intact.
    text = re.sub(r'\bu/\w*', '', text)
    # Remove '&gt;' like notations
    text = re.sub(r'&\W*\w*\W*;', ' ', text)
    # Remove any remaining hashtags
    text = re.sub(r'#\w*', '', text)
    ###################################################

    # Dealing with contractions
    text = contractions.fix(text)

    # Manual fall-backs, applied in order. "'scuse" must run before the generic
    # "'s" rule, otherwise "'s" consumes its prefix and the rule can never match.
    manual_contractions = (
        (r"\'scuse", " excuse "),
        (r"what\'s", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can\'t", "can not "),
        (r"n\'t", " not "),
        (r"\'t", " not"),
        (r"i\'m", "i am "),
        (r"\'em\b", " them "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in manual_contractions:
        text = re.sub(pattern, replacement, text)

    # Removes punctuations (escaped so that characters like "[" or "\" are
    # taken literally inside the character class)
    text = re.sub('[' + re.escape(string.punctuation) + ']', " ", text)

    # Removes non alphabetical characters
    text = re.sub(r'[^a-zA-Z]+', ' ', text)

    # Replaces all whitespaces by 1 whitespace
    text = re.sub(r'\s+', ' ', text)

    return text

In [None]:
# Run clean_text over every entry of the 'text' column, overwriting the column.
df['text'] = df['text'].apply(clean_text)

In [None]:
# Keep only the cleaned text and its label vector.
df = df[["text", "labels"]]

In [None]:
# Renumber the rows 0..n-1 after the filtering above, discarding the old index.
df = df.reset_index(drop=True)

In [None]:
# Preview the first rows of the cleaned ISEAR frame.
df.head()

Unnamed: 0,text,labels
0,during the period of falling in love each time...,"[0, 0, 1, 0]"
1,when i was involved in a traffic accident,"[0, 1, 0, 0]"
2,when i was driving home after several days of ...,"[1, 0, 0, 0]"
3,when i lost the person who meant the most to me,"[0, 0, 0, 1]"
4,when i got a letter offering me the summer job...,"[0, 0, 1, 0]"


# **load and preprocess the test dataframe from goemotions**

In [None]:
# Read the cleaned GoEmotions test CSV from a hard-coded Google Drive path.
# NOTE(review): presumably requires Drive to be mounted at /content/drive first.
data = pd.read_csv('/content/drive/My Drive/goemotions_aug_dairai_test_cleaned.csv')


In [None]:
# Drop bookkeeping columns. errors='ignore' keeps this cell re-runnable once
# the columns are already gone (an in-place drop raises KeyError on re-run).
data = data.drop(columns=['Unnamed: 0', 'datasource'], errors='ignore')

# NOTE: from here on `df` is the GoEmotions test set; it replaces the ISEAR
# frame that earlier cells stored under the same name.
df = pd.DataFrame({
    'text': data['cleaned_processed'],
    # Same label order as the ISEAR frame: [anger, fear, joy, sadness].
    'labels': data[['anger', 'fear', 'joy', 'sadness']].values.astype('int').tolist(),
})

In [None]:
# Preview the first rows of the GoEmotions test frame.
df.head()

Unnamed: 0,text,labels
0,i love to talk about myself i am sure there is...,"[0, 0, 1, 0]"
1,i mentioned above about how shia are feeling i...,"[0, 1, 0, 0]"
2,i can not tell if you got the joke or not hahaha,"[0, 0, 1, 0]"
3,i have come out on the other side of this thin...,"[1, 0, 0, 0]"
4,i got some fabulous answer and i could also fe...,"[0, 0, 1, 0]"


In [None]:
# Number of test texts to score.
len(df)

42979

# **load model**

In [None]:
class DistilBERTClass(torch.nn.Module):
    """DistilBERT encoder followed by a two-layer head that emits 4 logits."""

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept: they define the
        # state_dict keys a saved checkpoint is matched against.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.5)
        self.classifier = torch.nn.Linear(768, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Return the raw (pre-sigmoid) logits for a batch.

        `token_type_ids` is accepted for call-site compatibility but is not
        used: the encoder is called with `input_ids` and `attention_mask` only.
        """
        encoder_outputs = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at index 0 of axis 1
        # (presumably the [CLS] position of the last hidden state; confirm).
        first_token = encoder_outputs[0][:, 0]
        hidden = torch.tanh(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [None]:
predictor = DistilBERTClass()
# map_location='cpu' lets a checkpoint that was saved from a GPU session load on
# a CPU-only runtime; the model is moved to `device` in a later cell.
state_dict = torch.load('/content/drive/MyDrive/distilbert_demo_emotions_state_dict_11_17_epoch2', map_location='cpu')
# strict=False tolerates key mismatches, so check the report shown as this
# cell's output: it must list no missing or unexpected keys.
predictor.load_state_dict(state_dict, strict=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




<All keys matched successfully>

In [None]:
# Move the model to the selected device; the cell output is the module's repr.
predictor.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_featu

***quantize the model***

In [None]:
# Dynamically quantized Linear layers only run on the CPU, so the model and
# every input tensor created in the cells below must live on the CPU,
# even when a GPU is available.
device = 'cpu'
predictor = torch.quantization.quantize_dynamic(predictor.to(device), {torch.nn.Linear}, dtype=torch.qint8)

# **get tokenizer and set MAX_LEN**

In [None]:
# Load the tokenizer for the same 'distilbert-base-uncased' checkpoint the model class uses.
# NOTE(review): `truncation=True` is normally an encode-time argument; confirm it
# has any effect when passed to from_pretrained.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Passed as max_length to every encode_plus call below.
MAX_LEN = 100

# **make tensors and predict**

In [None]:
predictor.eval()
pred_loop = []
# Inference only: no_grad avoids building an autograd graph for every forward pass.
with torch.no_grad():
    for text in tqdm(df.text.values):
        # Named `encoded` rather than `input` so the builtin is not shadowed.
        # padding/truncation spell out what the deprecated `pad_to_max_length`
        # plus `max_length` combination did implicitly.
        encoded = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        # Wrap each sequence in a list to add a batch dimension of size 1.
        ids = torch.tensor([encoded['input_ids']], dtype=torch.long).to(device)
        mask = torch.tensor([encoded['attention_mask']], dtype=torch.long).to(device)
        token_type_ids = torch.tensor([encoded['token_type_ids']], dtype=torch.long).to(device)

        output = predictor(ids, mask, token_type_ids)
        # Each entry is a nested 1 x 4 list of sigmoid scores; a later cell
        # squeezes out the singleton axis.
        pred_loop.append(torch.sigmoid(output).cpu().numpy().tolist())

100%|██████████| 42979/42979 [06:43<00:00, 106.53it/s]


# **using dataloader instead of loop**

In [None]:
class InferenceDataset(Dataset):
    """
    Map-style dataset that tokenizes one text per item for inference.

    Parameters
    ----------
    texts : iterable of str
            Texts to encode (list, array or pandas Series).
    tokenizer : object
            Tokenizer exposing `encode_plus`.
    max_len : int
            Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_len):
        self.tokenizer = tokenizer
        # Materialise as a list so __getitem__ is strictly positional. Indexing
        # a pandas Series with an int is label-based and breaks as soon as the
        # index is not 0..n-1 (e.g. after filtering rows).
        self.text = list(texts)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the 'ids', 'mask' and 'token_type_ids' long tensors for item `index`."""
        text = self.text[index]

        # padding/truncation spell out what the deprecated `pad_to_max_length`
        # plus `max_length` combination did implicitly; fixed-length items are
        # what allows them to be stacked into batches.
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long)
        }

In [None]:
# Wrap the test texts in a dataset and batch them in their original order.
data_set = InferenceDataset(texts=df['text'], tokenizer=tokenizer, max_len=MAX_LEN)
data_params = dict(batch_size=64, shuffle=False, num_workers=0)
data_loader = DataLoader(dataset=data_set, **data_params)

In [None]:

predictor.eval()
pred_dataloader = []
with torch.no_grad():
    # The loop variable is `batch`, not `data`: `data` already names the
    # GoEmotions DataFrame and must not be overwritten. Passing the loader
    # itself to tqdm lets the bar show the total number of batches.
    for batch in tqdm(data_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        outputs = predictor(ids, mask, token_type_ids)
        # extend() flattens the batch: one list of 4 sigmoid scores per text.
        pred_dataloader.extend(torch.sigmoid(outputs).cpu().numpy().tolist())


69it [07:58,  6.93s/it]


In [None]:
def inference_multiprocessing(df, model):
    """
    Score every text of `df` with `model`, one text per forward pass.

    Relies on the module-level `tokenizer`, `MAX_LEN` and `device`.

    Parameters
    ----------
    df : object with a `text` column
            Frame whose `df.text.values` holds the texts to score.
    model : torch.nn.Module
            Called as `model(ids, mask, token_type_ids)`; expected to return logits.

    Returns
    -------
    list
            One entry per text, each the nested list form of the sigmoid of
            the model output for a batch of size 1.
    """
    model.eval()
    pred_loop_multiprocessing = []
    # Inference only: no_grad avoids building an autograd graph for every forward pass.
    with torch.no_grad():
        for text in tqdm(df.text.values):
            # Named `encoded` rather than `input` so the builtin is not shadowed.
            # padding/truncation spell out what the deprecated
            # `pad_to_max_length` plus `max_length` combination did implicitly.
            encoded = tokenizer.encode_plus(
                text,
                None,
                add_special_tokens=True,
                max_length=MAX_LEN,
                padding='max_length',
                truncation=True,
                return_token_type_ids=True
            )
            # Wrap each sequence in a list to add a batch dimension of size 1.
            ids = torch.tensor([encoded['input_ids']], dtype=torch.long).to(device)
            mask = torch.tensor([encoded['attention_mask']], dtype=torch.long).to(device)
            token_type_ids = torch.tensor([encoded['token_type_ids']], dtype=torch.long).to(device)

            output = model(ids, mask, token_type_ids)
            pred_loop_multiprocessing.append(torch.sigmoid(output).cpu().numpy().tolist())
    return pred_loop_multiprocessing

In [None]:
from torch.multiprocessing import Pool, set_start_method
from fastai.vision import *
from fastai.text import *

In [None]:
n_processes = 2
set_start_method('spawn', force=True)

# One contiguous slice of the frame per worker, so the concatenated results
# keep the original row order.
bounds = np.linspace(0, len(df), n_processes + 1, dtype=int)
tasks = [(df.iloc[start:stop], predictor) for start, stop in zip(bounds[:-1], bounds[1:])]

# NOTE(review): with the 'spawn' start method the workers start from a fresh
# interpreter; names defined only in this notebook (inference_multiprocessing,
# tokenizer, MAX_LEN, device) may not be available there. Verify on the target runtime.
pred_multiprocessing = []
pool = None  # bound up front so `finally` is safe even if Pool() itself raises
try:
    pool = Pool(n_processes)
    # starmap unpacks each (frame, model) tuple into the two positional
    # arguments; plain map would pass a single argument per call.
    for chunk_predictions in pool.starmap(inference_multiprocessing, tasks):
        pred_multiprocessing.extend(chunk_predictions)
except KeyboardInterrupt:
    # Stop quietly; `finally` tears the pool down. Calling exit() here would
    # shut down the notebook kernel and lose every result computed so far.
    print('Main Pool Error: ', 'interrupted')
except Exception as e:
    print('Main Pool Error: ', e)
finally:
    if pool is not None:
        pool.terminate()
        pool.join()

# **post_processing**

In [None]:
# Stack the per-text predictions. Each entry of pred_loop is a nested 1 x 4 list,
# so the array has a singleton axis 1 that the following cell squeezes out.
pred_y = np.array(pred_loop)

In [None]:
# Rebuild from pred_loop instead of squeezing pred_y in place: squeeze(1) raises
# once axis 1 is no longer a singleton, so the in-place form fails on re-run.
pred_y = np.array(pred_loop).squeeze(1)
pred_y.shape

(42979, 4)

In [None]:
# Inspect the per-class sigmoid scores (one row per text).
pred_y

array([[1.53913898e-05, 2.61696969e-05, 9.99723971e-01, 2.02845913e-05],
       [1.25038248e-04, 9.99926209e-01, 1.50661625e-04, 2.48189375e-04],
       [1.65531164e-05, 1.20948635e-05, 9.99314427e-01, 3.45288572e-05],
       ...,
       [3.67076318e-05, 4.17121737e-05, 8.08818579e-01, 6.72210299e-05],
       [2.21735136e-05, 7.60762850e-05, 9.24473934e-05, 9.99900579e-01],
       [1.17551856e-04, 9.99932408e-01, 1.71797452e-04, 1.62186625e-04]])

In [None]:
# Inspect the ground-truth label vectors.
df['labels']

0        [0, 0, 1, 0]
1        [0, 1, 0, 0]
2        [0, 0, 1, 0]
3        [1, 0, 0, 0]
4        [0, 0, 1, 0]
             ...     
42974    [0, 0, 0, 1]
42975    [0, 0, 0, 0]
42976    [0, 0, 0, 0]
42977    [0, 0, 0, 1]
42978    [0, 1, 0, 0]
Name: labels, Length: 42979, dtype: object

In [None]:
# Collect the label vectors as a plain list of lists.
real_y = df['labels'].tolist()

In [None]:
# Convert the list of label lists into a NumPy array (one row per text).
real_y = np.array(real_y)

In [None]:
# Make sure the labels are integers, as the thresholded predictions will be.
real_y = real_y.astype(int)

In [None]:
# Binarise the scores at 0.5 (score >= 0.5 -> 1, otherwise 0). The comparison is
# vectorised, works for any number of label columns, and keeps the array's
# dtype; the next cell performs the cast to int.
pred_y = (pred_y >= 0.5).astype(pred_y.dtype)


In [None]:
# Cast the thresholded 0/1 values to integers so they compare cleanly with real_y.
pred_y = pred_y.astype(int)

# **overall metrics**

In [None]:
# Macro-averaged precision across the label columns.
precision_score(real_y, pred_y, average='macro')

0.9467784654088982

In [None]:
# Macro-averaged recall across the label columns.
recall_score(real_y, pred_y, average='macro')

0.9382485129657372

In [None]:
# Macro-averaged F1 across the label columns.
f1_score(real_y, pred_y, average='macro')

0.9421958243100784

# **precision score**

In [None]:
# Per-class precision for label columns 0-3, then the macro, micro and weighted averages.
precision = [precision_score(real_y[:, col], pred_y[:, col]) for col in range(4)]
precision += [precision_score(real_y, pred_y, average=avg) for avg in ('macro', 'micro', 'weighted')]

# **recall score**

In [None]:
# Per-class recall for label columns 0-3, then the macro, micro and weighted averages.
recall = [recall_score(real_y[:, col], pred_y[:, col]) for col in range(4)]
recall += [recall_score(real_y, pred_y, average=avg) for avg in ('macro', 'micro', 'weighted')]

# **f1 score**

In [None]:
# Per-class F1 for label columns 0-3, then the macro, micro and weighted averages.
f1 = [f1_score(real_y[:, col], pred_y[:, col]) for col in range(4)]
f1 += [f1_score(real_y, pred_y, average=avg) for avg in ('macro', 'micro', 'weighted')]

# **overall**

In [None]:
# One row per metric; the first four columns follow the label order
# [anger, fear, joy, sadness], the last three are the averaged scores.
metric_rows = [precision, recall, f1]
metric = pd.DataFrame(
    data=metric_rows,
    index=['precision', 'recall', 'f1'],
    columns=['anger', 'fear', 'joy', 'sadness', 'macro', 'micro', 'weighted'],
)

In [None]:
# Display the metrics table.
metric

Unnamed: 0,anger,fear,joy,sadness,macro,micro,weighted
precision,0.972237,0.989133,0.838252,0.987491,0.946778,0.930042,0.932113
recall,0.917689,0.974437,0.876476,0.984392,0.938249,0.933295,0.933295
f1,0.944176,0.98173,0.856938,0.985939,0.942196,0.931666,0.932409
