In [1]:
#import keras_nlp
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers

import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import nltk



In [2]:
print(tf.config.list_physical_devices())
# tf.config.set_visible_devices([tf.config.list_physical_devices('GPU')[0]])
# tf.debugging.set_log_device_placement(True)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# data = keras.utils.text_dataset_from_directory('learning-agency-lab-automated-essay-scoring-2/')
data = pd.read_csv("learning-agency-lab-automated-essay-scoring-2/train.csv")
essays = data.full_text.to_list()
labels = data.score.to_list() # maybe one hot and cross entropy

In [4]:
num_words=10000 # words that occur at least twice
training_size = 14000 # 80% of essays
padding_type='post'
trunc_type='post'

In [5]:
essays = [es[:2500] for es in essays] # cut the long essays

In [6]:
training_sentences = essays[0:training_size]
testing_sentences = essays[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [7]:
tokenizer = Tokenizer(num_words=num_words, oov_token='<oov>')

In [8]:
tokenizer.fit_on_texts(training_sentences)
print('tokenizer.word_index', len(tokenizer.word_index))
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
tokenizer.word_index = {word: cnt for word, cnt in tokenizer.word_index.items() if word not in stop_words}
print('tokenizer.word_index', len(tokenizer.word_index))

tokenizer.word_index 57848
tokenizer.word_index 57676


[nltk_data] Downloading package stopwords to /Users/mojc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# sequences = tokenizer.texts_to_sequences(essays)
max_length = max([len(es) for es in essays])

In [10]:
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = np.array(training_padded)
training_labels = np.array(training_labels) - 1
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels) - 1

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_words, 6),
    # layers.GlobalAveragePooling1D(),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    layers.Dense(6, activation='softmax')]
)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

2024-09-23 21:44:29.633656: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3 Pro
2024-09-23 21:44:29.633673: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 36.00 GB
2024-09-23 21:44:29.633676: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 13.50 GB
2024-09-23 21:44:29.633691: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-23 21:44:29.633701: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
# with tf.device('/device:GPU:0'): # looking at the activity monitor it does not seem to make a difference
history = model.fit(training_padded, training_labels, epochs=50, validation_data=(testing_padded, testing_labels), verbose=2, callbacks=callback)
model.evaluate(testing_padded, testing_labels)
# accuracy: 0.0762 - loss: 0.4253

Epoch 1/50


2024-09-23 21:44:30.365049: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


438/438 - 78s - 177ms/step - accuracy: 0.3592 - loss: 1.4667 - val_accuracy: 0.3719 - val_loss: 1.4345
Epoch 2/50
438/438 - 76s - 173ms/step - accuracy: 0.3788 - loss: 1.3890 - val_accuracy: 0.4016 - val_loss: 1.3481
Epoch 3/50
438/438 - 75s - 171ms/step - accuracy: 0.4051 - loss: 1.3212 - val_accuracy: 0.3922 - val_loss: 1.3361
Epoch 4/50
438/438 - 75s - 172ms/step - accuracy: 0.4339 - loss: 1.2552 - val_accuracy: 0.4233 - val_loss: 1.2769
Epoch 5/50
438/438 - 75s - 172ms/step - accuracy: 0.4510 - loss: 1.2124 - val_accuracy: 0.4082 - val_loss: 1.3036
Epoch 6/50
438/438 - 76s - 173ms/step - accuracy: 0.4721 - loss: 1.1746 - val_accuracy: 0.4264 - val_loss: 1.2833
Epoch 7/50
438/438 - 75s - 171ms/step - accuracy: 0.4854 - loss: 1.1530 - val_accuracy: 0.4348 - val_loss: 1.2583
Epoch 8/50
438/438 - 76s - 172ms/step - accuracy: 0.4935 - loss: 1.1367 - val_accuracy: 0.4382 - val_loss: 1.2487
Epoch 9/50
438/438 - 75s - 172ms/step - accuracy: 0.5066 - loss: 1.1066 - val_accuracy: 0.4097 - va

[1.029724359512329, 0.5482310056686401]

In [13]:
# preds = np.round(model.predict(testing_padded))
preds = np.argmax(model.predict(testing_padded), axis=1)
print(np.mean(abs(preds - testing_labels)))
print(np.mean((np.round(preds, 0) - testing_labels)**2))

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 68ms/step
0.48684608406410645
0.5582098578772301


In [14]:
# embedding with averagepooling
# 1.0258
# 1.7741

# embedding with bidirectional LSTM and removed stop words
# able to predict bigger range
# 1.0007
# 1.7187
# classification setup (predicting between 0 and 3 only though :/)
# accuracy: 0.5640 - loss: 1.0697
# 0.4780
# 0.5530
# second try
# 0.7314
# 1.0175

# bert_small_en_uncased -  predicting one value though
# 0.8415
# 1.1086