In [10]:
import os


train_neg_path = "data/aclImdb/train/neg"
train_pos_path = "data/aclImdb/train/pos"


def read_texts(path):
    """Return the contents of every ``.txt`` file directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to scan. Entries whose name does not end in ``.txt`` are
        ignored.

    Returns
    -------
    list of str
        One string per file, ordered by file name so repeated runs yield the
        same list.
    """
    text_lst = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(path)):
        if filename.endswith('.txt'):
            # Open relative to the directory that was listed, and decode
            # explicitly so the result does not depend on the platform locale.
            with open(os.path.join(path, filename), 'rt', encoding='utf-8') as file:
                text_lst.append(file.read())
    return text_lst


# Load both classes, then build one label per review:
# 0 for the texts read from train_neg_path, 1 for those from train_pos_path.
train_neg_texts = read_texts(train_neg_path)
train_pos_texts = read_texts(train_pos_path)
train_neg_labels = [0 for _ in train_neg_texts]
train_pos_labels = [1 for _ in train_pos_texts]

# Negative samples come first in both lists so texts and labels stay aligned.
train_texts = [*train_neg_texts, *train_pos_texts]
train_labels = [*train_neg_labels, *train_pos_labels]

In [42]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


N_words = 10000        # vocabulary cap passed to the Tokenizer (num_words)
N_tokens = 100         # every sequence is padded/truncated to this length
N_training = 200       # number of samples used for training
N_validation = 10000   # number of samples used for validation
SEED = 42              # fixes the shuffle so the train/validation split is repeatable

# Fit the vocabulary on the raw texts and turn each text into a list of token ids.
tokenizer = Tokenizer(num_words=N_words)
tokenizer.fit_on_texts(train_texts)
sequences = tokenizer.texts_to_sequences(train_texts)
word_index_dct = tokenizer.word_index
X = pad_sequences(sequences, maxlen=N_tokens)
y = np.array(train_labels)
assert X.shape[0] == y.shape[0], "texts and labels must have the same length"

# train_texts is ordered by class, so shuffle samples and labels with the same
# permutation before slicing. A seeded generator keeps the split reproducible.
rng = np.random.default_rng(SEED)
indices = rng.permutation(X.shape[0])
X = X[indices]
y = y[indices]

X_train = X[:N_training]
y_train = y[:N_training]
X_val = X[N_training: N_training + N_validation]
y_val = y[N_training: N_training + N_validation]

In [34]:
import numpy as np


glove_dir = 'data/glove'

# Maps each token in the GloVe file to its coefficient vector (float32).
embeddings_index_dct = {}
# Each line is "<token> <c1> <c2> ...". The GloVe vocabulary contains non-ASCII
# tokens, so decode explicitly as UTF-8 instead of relying on the platform
# default encoding.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), 'rt', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        if not values:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        word = values[0]
        coeffs = np.asarray(values[1:], dtype=np.float32)
        embeddings_index_dct[word] = coeffs

In [43]:
N_embedding_dim = 100
# Row i must hold the vector for the token whose Tokenizer id is i, because the
# Embedding layer uses the token id directly as the row index. Tokenizer ids
# start at 1 (0 is the padding value produced by pad_sequences), so row 0 stays
# all zeros. Tokens with no GloVe entry also keep a zero row.
embedding_matrix = np.zeros((N_words, N_embedding_dim))
for word, word_index in word_index_dct.items():
    if word_index >= N_words:
        # Ids at or above N_words would not fit in the matrix; with
        # num_words=N_words the Tokenizer does not emit them either.
        continue
    vector = embeddings_index_dct.get(word)
    if vector is not None:
        embedding_matrix[word_index] = vector

In [45]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense


# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid) binary classifier.
model = Sequential([
    Embedding(N_words, N_embedding_dim, input_length=N_tokens),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

2023-12-06 20:07:53.380875: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-12-06 20:07:53.381102: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-12-06 20:07:53.381116: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-12-06 20:07:53.381406: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-06 20:07:53.381827: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 flatten (Flatten)           (None, 10000)             0         
                                                                 
 dense (Dense)               (None, 32)                320032    
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1320065 (5.04 MB)
Trainable params: 1320065 (5.04 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [46]:
# Load the prepared matrix into the model's first layer and exclude that layer
# from training so its weights are not updated by fit().
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [47]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Train for 10 epochs in batches of 32, evaluating on the validation split
# after each epoch; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10


2023-12-06 20:12:43.856120: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [48]:
# Persist the model's weights to an HDF5 file in the working directory.
weights_path = 'pre_trained_glove_model.h5'
model.save_weights(weights_path)