In [1]:
from datasets import load_dataset
import random
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
import tensorflow as tf

In [2]:
# Load the dataset
dataset = load_dataset("silicone", "maptask")

Found cached dataset silicone (C:/Users/ilyes/.cache/huggingface/datasets/silicone/maptask/1.0.0/af617406c94e3f78da85f7ea74ebfbd3f297a9665cb54adbae305b03bc4442a5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Separate the dataset into train val and test
train_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

# Extract the utterances and labels
train_X = train_dataset["Utterance"]
train_y = train_dataset["Label"]

val_X = val_dataset["Utterance"]
val_y = val_dataset["Label"]

test_X = test_dataset["Utterance"]
test_y = test_dataset["Label"]

In [4]:
len(train_X),len(train_y)

(20905, 20905)

In [5]:
# Tokenize the text data
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(train_X)

from transformers import AutoTokenizer, TFBertModel 

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

# Tokenize the data
train_encodings = tokenizer(train_X, truncation=True, padding=True)
val_encodings = tokenizer(val_X, truncation=True, padding=True)
test_encodings = tokenizer(test_X, truncation=True, padding=True)

# Pad val_encodings and test_encodings to 124
val_encodings['input_ids'] = tf.keras.preprocessing.sequence.pad_sequences(val_encodings['input_ids'], maxlen=124, dtype="long", value=0, truncating="post", padding="post")
# val_encodings['attention_mask'] = tf.keras.preprocessing.sequence.pad_sequences(val_encodings['attention_mask'], maxlen=124, dtype="long", value=0, truncating="post", padding="post")

test_encodings['input_ids'] = tf.keras.preprocessing.sequence.pad_sequences(test_encodings['input_ids'], maxlen=124, dtype="long", value=0, truncating="post", padding="post")
# test_encodings['attention_mask'] = tf.keras.preprocessing.sequence.pad_sequences(test_encodings['attention_mask'], maxlen=124, dtype="long", value=0, truncating="post", padding="post")

# Convert the data to tf dataset
# train_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(train_encodings),
#     train_y
# ))

# val_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(val_encodings),
#     val_y
# ))

# test_dataset = tf.data.Dataset.from_tensor_slices((
#     dict(test_encodings),
#     test_y
# ))

train_data = train_encodings['input_ids']
val_data = val_encodings['input_ids']
test_data = test_encodings['input_ids']

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [6]:
# Convert text data to sequences of integers
# train_sequences = tokenizer.texts_to_sequences(train_X)
# test_sequences = tokenizer.texts_to_sequences(test_X)

In [7]:
# Pad sequences to have the same length
# max_sequence_length = max(len(seq) for seq in train_sequences + test_sequences)
# train_data_ = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
# test_data_ = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

max_sequence_length = 124

In [8]:
# Convert array to list
# train_data = train_data_.tolist()
# test_data = test_data_.tolist()
train_data = np.asarray(train_data)
val_data = np.asarray(val_data)
test_data = np.asarray(test_data)

train_y = np.asarray(train_y)
val_y = np.asarray(val_y)
test_y = np.asarray(test_y)

In [9]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [10]:
# Load pre-trained GloVe embeddings
glove_path = "glove.6B.100d.txt"
# Update with the path to your GloVe file
embedding_dim = 100  # The dimensionality of the GloVe embeddings
embeddings_index = {}
with open(glove_path) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [11]:
# Create an embedding matrix for the words in the dataset
# word_index = tokenizer.word_index
word_index = tokenizer.get_vocab()
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [12]:
# Build the model
model = Sequential()
model.add(Embedding(num_words, embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(12, activation='softmax'))

In [13]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(train_data, train_y, validation_data=(test_data, test_y), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x229d0726490>

In [15]:
# Evaluate the model
y_pred = model.predict(test_data)
y_pred_labels = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(test_y, y_pred_labels)
report = classification_report(test_y, y_pred_labels)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.62
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.70      0.69       659
           1       0.61      0.31      0.41       151
           2       0.38      0.43      0.41       254
           3       0.15      0.04      0.07       162
           4       0.48      0.74      0.58       223
           5       0.67      0.83      0.74       463
           6       0.85      0.73      0.79        86
           7       0.72      0.68      0.70       173
           8       0.74      0.49      0.59       191
           9       0.68      0.85      0.76        96
          10       0.33      0.07      0.11       102
          11       0.68      0.77      0.72       334

    accuracy                           0.62      2894
   macro avg       0.58      0.55      0.55      2894
weighted avg       0.60      0.62      0.60      2894

