In [1]:
# Make Google Drive visible inside the Colab runtime; the dataset files read by
# later cells live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
import json
import pandas as pd
from tqdm import tqdm


import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, AdamW, InputExample, InputFeatures

In [24]:
# Load the training conversations and flatten them to one row per utterance.
TRAIN_JSON_PATH = '/content/drive/MyDrive/Colab Notebooks/IRTMtask0_bert/train_dataset.json'

# Decode explicitly as UTF-8 so the result does not depend on the runtime's
# locale, and release the file handle as soon as the JSON has been parsed.
with open(TRAIN_JSON_PATH, encoding='utf-8') as f:
    data = json.load(f)

conv_ids = []
utterances = []
emotions = []
speakers = []
# data['conversation'] maps each conversation id to its list of utterance
# dicts; every utterance becomes one row tagged with its conversation id.
for conv_id, conversation in data['conversation'].items():
    for utterance in conversation:
        conv_ids.append(conv_id)
        utterances.append(utterance['text'])
        speakers.append(utterance['speaker'])
        emotions.append(utterance['emotion'])

train_df = pd.DataFrame({'conv_id': conv_ids, 'utterance': utterances, 'speaker': speakers, 'emotion': emotions})
train_df.head()

Unnamed: 0,conv_id,utterance,speaker,emotion
0,535,Oh hey Joey ! What up ?,Phoebe,joy
1,535,I can not decide which route to take to Vegas ...,Joey,neutral
2,535,"Yeah , I have been around .",Phoebe,neutral
3,535,"Okay , so ... so which route should I take the...",Joey,neutral
4,535,"Ooh , if you take the northern route there is ...",Phoebe,neutral


In [25]:
# Load the held-out conversations and flatten them to one row per utterance.
TEST_JSON_PATH = '/content/drive/MyDrive/Colab Notebooks/IRTMtask0_bert/test_dataset.json'

# Decode explicitly as UTF-8 so the result does not depend on the runtime's
# locale, and release the file handle as soon as the JSON has been parsed.
with open(TEST_JSON_PATH, encoding='utf-8') as f:
    data = json.load(f)

conv_ids = []
utterances = []
emotions = []
speakers = []
# data['conversation'] maps each conversation id to its list of utterance
# dicts; every utterance becomes one row tagged with its conversation id.
for conv_id, conversation in data['conversation'].items():
    for utterance in conversation:
        conv_ids.append(conv_id)
        utterances.append(utterance['text'])
        speakers.append(utterance['speaker'])
        emotions.append(utterance['emotion'])

test_df = pd.DataFrame({'conv_id': conv_ids, 'utterance': utterances, 'speaker': speakers, 'emotion': emotions})
test_df.head()

Unnamed: 0,conv_id,utterance,speaker,emotion
0,430,Hey !,Joey,joy
1,430,"So , what are you guys in the market for ? We ...",The Vendor,neutral
2,430,Check this out ? Huh ? Yeah . That is the stuf...,Joey,joy
3,430,"Well , I do not have to buy that , "" I am with...",Chandler,neutral
4,430,"Well , I like it . Here you go .",Joey,neutral


In [26]:
# Integer class id for each emotion string; the ids become the training labels.
emotion_map = {'anger': 0, 'disgust': 1, 'fear': 2, 'joy': 3, 'sadness': 4, 'surprise': 5, 'neutral': 6}
train_df['emotion_label'] = train_df['emotion'].map(emotion_map)
test_df['emotion_label'] = test_df['emotion'].map(emotion_map)

# Series.map yields NaN for any value missing from the mapping. Fail here with
# a clear message instead of letting NaN labels reach the TensorFlow pipeline.
assert train_df['emotion_label'].notna().all(), 'train_df contains emotions missing from emotion_map'
assert test_df['emotion_label'].notna().all(), 'test_df contains emotions missing from emotion_map'

In [27]:
# The tokenizer and the classifier are loaded from the same pretrained
# checkpoint name.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
# num_labels=7 matches the seven emotion classes defined in emotion_map.
model = TFBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=7,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
def convert_data_to_examples(train, test, utt, emotion):
    """Wrap each row of two DataFrames in an ``InputExample``.

    Args:
        train: DataFrame whose rows become the training examples.
        test: DataFrame whose rows become the validation examples.
        utt: Name of the column holding the text of each row.
        emotion: Name of the column holding the label of each row.

    Returns:
        A ``(train_examples, validation_examples)`` pair, each the result of a
        row-wise ``DataFrame.apply`` over the corresponding frame.
    """
    def _row_to_example(row):
        # guid is a bookkeeping identifier that is not used here, so it stays None.
        return InputExample(guid=None, text_a=row[utt], label=row[emotion])

    train_examples = train.apply(_row_to_example, axis=1)
    validation_examples = test.apply(_row_to_example, axis=1)
    return train_examples, validation_examples

# Wrap every utterance/label row of both frames in an InputExample.
train_InputExamples, test_InputExamples = convert_data_to_examples(
    train=train_df,
    test=test_df,
    utt='utterance',
    emotion='emotion_label',
)

In [32]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects exposing ``text_a`` (the text to encode)
            and ``label``.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every encoded sequence is padded or truncated to.

    Returns:
        A dataset yielding ``(inputs, label)`` pairs, where ``inputs`` is a dict
        with int32 ``input_ids``, ``attention_mask`` and ``token_type_ids``
        vectors and ``label`` is a scalar int64.
    """
    features = []  # InputFeatures collected eagerly, replayed by the generator below

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # add the special tokens such as 'CLS' and 'SEP'
            max_length=max_length,      # upper bound used for both padding and truncation
            return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',       # replaces the deprecated pad_to_max_length=True
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict['attention_mask'],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        """Yield one ``(inputs, label)`` pair per collected feature."""
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    # output_signature replaces the deprecated output_types/output_shapes pair;
    # the dtypes and shapes declared here are the same as before.
    token_vector_spec = tf.TensorSpec(shape=(None,), dtype=tf.int32)
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": token_vector_spec,
                "attention_mask": token_vector_spec,
                "token_type_ids": token_vector_spec,
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )
BATCH_SIZE = 32

train_examples = list(train_InputExamples)
train_data = convert_examples_to_tf_dataset(train_examples, tokenizer)
# The rows are stored conversation by conversation, so a small shuffle buffer
# leaves neighbouring utterances in the same batch. A buffer covering the whole
# training set gives a proper shuffle.
# NOTE(review): repeat(2) is kept from the original pipeline; combined with
# epochs=2 in model.fit it means four passes over the training data - confirm
# that this is intended.
train_data = train_data.shuffle(max(len(train_examples), 1)).batch(BATCH_SIZE).repeat(2)

test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer)
# Validation metrics do not depend on example order, and repeating the set only
# evaluates the same batches twice, so the held-out data is batched and nothing else.
test_data = test_data.batch(BATCH_SIZE)

100%|██████████| 10246/10246 [00:03<00:00, 3218.71it/s]
100%|██████████| 3373/3373 [00:01<00:00, 1788.23it/s]


In [33]:
# Fine-tuning configuration: Adam with gradient-norm clipping, a sparse
# cross-entropy loss told to treat the model outputs as logits, and accuracy
# as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

model.fit(train_data, epochs=2, validation_data=test_data)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ea04ab34d30>