In [3]:
#import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [4]:
#get the model
model = TFBertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels = 5)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model.summary()

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  177853440 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 177,857,285
Trainable params: 177,857,285
Non-trainable params: 0
_________________________________________________________________


In [5]:
#read the data
t = pd.read_csv('EmotionTrain.csv', encoding = 'utf-8')
test = pd.read_csv('EmotionTest.csv', encoding = 'utf-8')

In [6]:
t.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,Tweet,Emotions
0,0,1,2017-En-31535,"Ինչ էլ որ որոշեք անել, համոզվեք, որ դա ձեզ #եր...",joy
1,1,3,2017-En-31436,"Ընդունեք մարտահրավերները, որպեսզի բառացիորեն ն...",joy
2,2,4,2017-En-22195,"Իմ սենյակակից. լավ է, որ մենք չենք կարող ուղղա...",anger
3,3,5,2017-En-22190,"Ոչ, բայց դա այնքան գեղեցիկ է: Աթսուն, հավանաբա...",joy
4,4,7,2017-En-22180,"Ռունիսն անձեռնմխելի է, չէ՞: Դարձյալ սարսափելի ...",anger


In [7]:
#create a polarity column and change the targets accordingly
def change_label(data):
    data['Polarity'] = np.nan
    for i in range(len(data)):
        if data['Emotions'][i] == 'anger':
            data['Polarity'][i] = 0
        elif data['Emotions'][i] == 'fear':
            data['Polarity'][i] = 1
        elif data['Emotions'][i] == 'joy':
            data['Polarity'][i] = 2
        elif data['Emotions'][i] == 'sadness':
            data['Polarity'][i] = 3
        else:
            data['Polarity'][i] = 4
    data = data.drop('Emotions', axis = 1)
    return data

In [8]:
t = change_label(t)
test = change_label(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

In [9]:
#split data into train, validate
train, dev = train_test_split(t, test_size = 0.1)

In [10]:
#define the functions
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, #globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, #globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
    return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] #will hold InputFeatures to be converted later

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens = True,
            max_length = max_length, #truncates if len(s) > max_length
            return_token_type_ids = True,
            return_attention_mask = True,
            pad_to_max_length = True, #pads to the right by default
            truncation = True
        )

        input_ids, token_type_ids, attention_mask = (input_dict['input_ids'],
            input_dict['token_type_ids'], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, label = e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    'input_ids': f.input_ids,
                    'attention_mask': f.attention_mask,
                    'token_type_ids': f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32}, tf.int64),
        (
            {
                'input_ids': tf.TensorShape([None]),
                'attention_mask': tf.TensorShape([None]),
                'token_type_ids': tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'Tweet'
LABEL_COLUMN = 'Polarity'

In [11]:
#train the model on the dataset
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, dev, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 3e-5), 
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), 
              metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs = 1, validation_data = validation_data, verbose = 2)



232/232 - 388s - loss: 1.0086 - accuracy: 0.6140 - val_loss: 0.8814 - val_accuracy: 0.6805 - 388s/epoch - 2s/step


<keras.callbacks.History at 0x7f85e8c8d310>

In [12]:
#predict
pred_sentences = test['Tweet']
tf_batch = tokenizer(list(pred_sentences), max_length = 128, padding = True, truncation = True, return_tensors = 'tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis = -1)
labels = [0, 1, 2, 3, 4]
label = tf.argmax(tf_predictions, axis = 1)
label = label.numpy()
predictions = pd.Series(label, index = test.index)
print('Accuracy:', accuracy_score(test['Polarity'], predictions))
print('F-measure:', f1_score(test['Polarity'], predictions, average = 'weighted'))
print('Recall:', recall_score(test['Polarity'], predictions, average = 'weighted'))
print('Precision:', precision_score(test['Polarity'], predictions, average = 'weighted'))

Accuracy: 0.6877256317689531
F-measure: 0.6937330044574023
Recall: 0.6877256317689531
Precision: 0.7178214325558011


  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from sklearn.metrics.cluster import contingency_matrix
contingency_matrix(test['Polarity'], predictions)

array([[ 98,  14,  16,   5],
       [  8,  29,   3,   3],
       [ 37,  12, 208,  37],
       [ 25,   0,   7,  46],
       [  4,   0,   1,   1]])

In [16]:
print('F-measure:', f1_score(test['Polarity'], predictions, average = None))
print('Recall:', recall_score(test['Polarity'], predictions, average = None))
print('Precision:', precision_score(test['Polarity'], predictions, average = None))

F-measure: [0.64262295 0.59183673 0.78638941 0.54117647 0.        ]
Recall: [0.73684211 0.6744186  0.70748299 0.58974359 0.        ]
Precision: [0.56976744 0.52727273 0.88510638 0.5        0.        ]


  _warn_prf(average, modifier, msg_start, len(result))
