In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dropout, Conv1D, MaxPooling1D, GlobalAveragePooling1D, Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.callbacks import LambdaCallback
from gensim.models import KeyedVectors
from IPython.display import FileLink
import pickle
import matplotlib.pyplot as plt

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
ds_train, ds_val, ds_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/amazon-sentiment-analysis/Train.csv",
        "/kaggle/input/amazon-sentiment-analysis/Valid.csv",
        "/kaggle/input/amazon-sentiment-analysis/Test.csv",
    )
)

In [3]:
# Shift every label down by one so class ids start at 0 (the value counts
# printed further down show classes 0-4 afterwards).
# NOTE: not idempotent - running this cell twice shifts the labels twice.
ds_train["Score"] = ds_train["Score"] - 1
ds_val["Score"] = ds_val["Score"] - 1
ds_test["Score"] = ds_test["Score"] - 1

In [4]:
def take_subset(ds_train, ds_val, ds_test, percentage):
    """Return the leading fraction of each of the three splits.

    Args:
        ds_train, ds_val, ds_test: Sliceable tables exposing ``.shape`` and
            ``.copy()`` (pandas DataFrames in this notebook).
        percentage: Fraction of rows to keep from the start of each split,
            e.g. ``0.1`` keeps the first 10%. The row count is truncated
            by ``int``.

    Returns:
        Tuple ``(train_subset, val_subset, test_subset)``. Each subset is an
        independent copy, so modifying it later (for example an in-place
        ``dropna``) does not touch the table it was cut from.
    """
    def _head_fraction(ds):
        keep = int(ds.shape[0] * percentage)
        # Copy so the subset does not alias its parent; a plain slice can
        # share memory with the original frame.
        return ds[:keep].copy()

    return _head_fraction(ds_train), _head_fraction(ds_val), _head_fraction(ds_test)

In [5]:
# Keep only the first 10% of every split to speed up experimentation.
# NOTE: the results are bound to the same names as the inputs, so re-running
# this cell takes 10% of the already reduced data.
ds_train, ds_val, ds_test = take_subset(ds_train, ds_val, ds_test, 0.1)

In [6]:
ds_train_val = pd.concat([ds_train, ds_val])

In [7]:
ds_train["Score"].value_counts() / len(ds_train["Score"])

4    0.640536
3    0.141041
0    0.093676
2    0.073115
1    0.051632
Name: Score, dtype: float64

In [8]:
ds_train["Score"].value_counts()

4    29129
3     6414
0     4260
2     3325
1     2348
Name: Score, dtype: int64

In [9]:
len(ds_train["Score"])

45476

In [10]:
ds_val["Score"].value_counts() / len(ds_val["Score"])

4    0.643737
3    0.140746
0    0.088142
2    0.078290
1    0.049085
Name: Score, dtype: float64

In [11]:
ds_val["Score"].value_counts()

4    3659
3     800
0     501
2     445
1     279
Name: Score, dtype: int64

In [12]:
len(ds_val["Score"])

5684

In [13]:
ds_test["Score"].value_counts() / len(ds_test["Score"])

4    0.634412
3    0.148487
0    0.092189
2    0.075827
1    0.049085
Name: Score, dtype: float64

In [14]:
# Show the training texts that are missing (empty result means none).
ds_train.loc[ds_train["Text"].isna(), "Text"]

Series([], Name: Text, dtype: object)

In [15]:
# Drop every row that has a missing value in any column, rebinding the names
# to the cleaned frames instead of mutating them in place.
ds_train = ds_train.dropna()
ds_train_val = ds_train_val.dropna()

In [16]:
# Show the validation texts that are missing. The mask must come from ds_val
# itself; a mask built from ds_train would inspect the wrong frame.
ds_val.loc[ds_val["Text"].isna(), "Text"]

Series([], Name: Text, dtype: object)

In [17]:
# Show the test texts that are missing. The mask must come from ds_test
# itself; a mask built from ds_train would inspect the wrong frame.
ds_test.loc[ds_test["Text"].isna(), "Text"]

Series([], Name: Text, dtype: object)

In [18]:
def get_median_words_per_text(texts):
    """Return the median number of whitespace-separated words per text.

    Args:
        texts: Iterable of strings.

    Returns:
        The median word count, as computed by ``np.median``.
    """
    word_counts = []
    for document in texts:
        # str.split() with no argument splits on any run of whitespace.
        word_counts.append(len(document.split()))
    return np.median(word_counts)

In [19]:
median_words_per_text = get_median_words_per_text(ds_train["Text"])

In [20]:
median_words_per_text

28.0

In [21]:
num_of_texts = ds_train.shape[0]

In [22]:
num_of_texts / median_words_per_text

1624.142857142857

In [23]:
MAX_TOKENS = 20000

In [24]:
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS)

In [25]:
vectorizer.adapt(ds_train_val['Text'])

In [26]:
vectorized_texts = vectorizer(ds_train_val['Text'])

In [27]:
vectorized_texts

<tf.Tensor: shape=(51160, 765), dtype=int64, numpy=
array([[15935,    31,   149, ...,     0,     0,     0],
       [ 1778,    12,     1, ...,     0,     0,     0],
       [    2,    80,  2121, ...,     0,     0,     0],
       ...,
       [   30,  5793,     3, ...,     0,     0,     0],
       [   12,   138,   122, ...,     0,     0,     0],
       [   14,     4,    90, ...,     0,     0,     0]])>

In [28]:
vectorized_texts.shape

TensorShape([51160, 765])

In [29]:
len(vectorizer.get_vocabulary())

20000

In [30]:
keyed_vectors = KeyedVectors.load_word2vec_format('/kaggle/input/googlenews-vectors-negative300/GoogleNews-vectors-negative300.bin', binary = True)

In [31]:
print(f"Found {len(keyed_vectors.index_to_key)} word vectors")

Found 3000000 word vectors


In [32]:
# Map every vocabulary token to its integer id (its position in the list).
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [33]:
def get_embedding_matrix(word_index, keyed_vectors, embedding_dim):
    """Build an embedding matrix aligned with ``word_index``.

    Args:
        word_index: Mapping from token to its integer row index.
        keyed_vectors: Object supporting ``keyed_vectors[word]`` that returns
            a vector and raises ``KeyError`` for unknown words.
        embedding_dim: Number of leading components of each vector to keep.

    Returns:
        A ``(len(word_index), embedding_dim)`` array. Rows of tokens without
        a pretrained vector stay all-zero.

    Prints the number of tokens found ("hits") and not found ("misses").
    """
    num_tokens = len(word_index)
    hits = 0
    misses = 0
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        try:
            # Only the first `embedding_dim` components are kept.
            embedding_vector = keyed_vectors[word][:embedding_dim]
        except KeyError:
            # Token has no pretrained vector: leave its row at zero.
            # Only KeyError is treated as a miss so that genuine failures
            # (shape errors, interrupts, ...) are not silently hidden.
            misses += 1
            continue
        embedding_matrix[i] = embedding_vector
        hits += 1
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

In [34]:
def cnn_model(blocks, filters, kernel_size, embedding_dim, dropout_rate, pool_size, input_shape, num_classes, num_features, use_pretrained_embedding=False, is_embedding_trainable=False, embedding_matrix=None):
    """Assemble a 1-D convolutional text classifier as a Keras ``Sequential``.

    Layer order: Embedding, then ``blocks - 1`` repetitions of
    (Dropout, Conv1D, Conv1D, MaxPooling1D), then two Conv1D layers with
    ``filters * 2`` filters, GlobalAveragePooling1D, Dropout and a softmax
    Dense layer with ``num_classes`` outputs.

    Args:
        blocks: Total number of convolution stages (the last one is the
            ``filters * 2`` stage without max pooling).
        filters: Filter count of the Conv1D layers in the repeated stages.
        kernel_size: Kernel size shared by all Conv1D layers.
        embedding_dim: Output dimension of the Embedding layer.
        dropout_rate: Rate used by every Dropout layer.
        pool_size: Pool size of the MaxPooling1D layers.
        input_shape: Shape of one input sample; only ``input_shape[0]`` is
            read (passed as the Embedding ``input_length``).
        num_classes: Number of output units.
        num_features: ``input_dim`` of the Embedding layer.
        use_pretrained_embedding: When true, the Embedding is initialised
            from ``embedding_matrix``.
        is_embedding_trainable: ``trainable`` flag of the Embedding; only
            applied when ``use_pretrained_embedding`` is true.
        embedding_matrix: Initial Embedding weights for the pretrained case.

    Returns:
        The uncompiled model.
    """
    embedding_kwargs = {
        "input_dim": num_features,
        "output_dim": embedding_dim,
        "input_length": input_shape[0],
    }
    if use_pretrained_embedding:
        embedding_kwargs["weights"] = [embedding_matrix]
        embedding_kwargs["trainable"] = is_embedding_trainable

    def make_conv(n_filters):
        # All convolutions share everything except the number of filters.
        return Conv1D(filters=n_filters,
                      kernel_size=kernel_size,
                      activation="relu",
                      bias_initializer="random_uniform",
                      padding="same")

    stack = [Embedding(**embedding_kwargs)]
    for _ in range(blocks - 1):
        stack.append(Dropout(rate=dropout_rate))
        stack.append(make_conv(filters))
        stack.append(make_conv(filters))
        stack.append(MaxPooling1D(pool_size=pool_size))
    stack.append(make_conv(filters * 2))
    stack.append(make_conv(filters * 2))
    stack.append(GlobalAveragePooling1D())
    stack.append(Dropout(rate=dropout_rate))
    stack.append(Dense(num_classes, activation="softmax"))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [36]:
def bidirectional_lstm_model(lstm_layers, units, embedding_dim, dropout_rate, input_shape, num_classes, num_features, use_pretrained_embedding=False, is_embedding_trainable=False, embedding_matrix=None):
    """Assemble a stacked bidirectional-LSTM text classifier as a Keras ``Sequential``.

    Layer order: Embedding, then ``lstm_layers - 1`` repetitions of
    (Dropout, Bidirectional LSTM returning sequences), then a final
    Bidirectional LSTM, Dropout and a softmax Dense layer with
    ``num_classes`` outputs.

    Args:
        lstm_layers: Total number of Bidirectional LSTM layers.
        units: Units of each LSTM.
        embedding_dim: Output dimension of the Embedding layer.
        dropout_rate: Rate used by every Dropout layer.
        input_shape: Shape of one input sample; only ``input_shape[0]`` is
            read (passed as the Embedding ``input_length``).
        num_classes: Number of output units.
        num_features: ``input_dim`` of the Embedding layer.
        use_pretrained_embedding: When true, the Embedding is initialised
            from ``embedding_matrix``.
        is_embedding_trainable: ``trainable`` flag of the Embedding; only
            applied when ``use_pretrained_embedding`` is true.
        embedding_matrix: Initial Embedding weights for the pretrained case.

    Returns:
        The uncompiled model.
    """
    embedding_kwargs = {
        "input_dim": num_features,
        "output_dim": embedding_dim,
        "input_length": input_shape[0],
    }
    if use_pretrained_embedding:
        embedding_kwargs["weights"] = [embedding_matrix]
        embedding_kwargs["trainable"] = is_embedding_trainable

    stack = [Embedding(**embedding_kwargs)]
    for _ in range(lstm_layers - 1):
        stack.append(Dropout(rate=dropout_rate))
        # Intermediate recurrent layers emit the full sequence for the next one.
        stack.append(Bidirectional(LSTM(units, return_sequences=True)))
    stack.append(Bidirectional(LSTM(units)))
    stack.append(Dropout(rate=dropout_rate))
    stack.append(Dense(num_classes, activation="softmax"))

    model = Sequential()
    for layer in stack:
        model.add(layer)
    return model

In [37]:
def train_sequence_model(data,
                         architechture = "cnn",
                         learning_rate=1e-3,
                         epochs=1000,
                         batch_size=128,
                         blocks=2,
                         filters=64,
                         dropout_rate=0.2,
                         embedding_dim=200,
                         kernel_size=3,
                         pool_size=3,
                         lstm_layers=2,
                         units=32,
                         use_pretrained_embedding=False,
                         is_embedding_trainable=False,
                         embedding_matrix=None,
                         logging = False):
    """Vectorize the texts, build the requested network, train it and print the final validation metrics.

    Reads the notebook-level ``vectorizer`` (to turn texts into integer
    sequences) and ``MAX_TOKENS`` (used as the embedding ``input_dim``).

    Args:
        data: ``((train_texts, train_labels), (val_texts, val_labels))``.
        architechture: ``"cnn"`` or ``"bidirectional_lstm"``.
        learning_rate: Learning rate of the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        blocks, filters, kernel_size, pool_size: Forwarded to ``cnn_model``.
        lstm_layers, units: Forwarded to ``bidirectional_lstm_model``.
        dropout_rate, embedding_dim: Forwarded to the selected builder.
        use_pretrained_embedding: Initialise the embedding from
            ``embedding_matrix``.
        is_embedding_trainable: Whether a pretrained embedding is trained.
        embedding_matrix: Pretrained weights; its second dimension must
            equal ``embedding_dim``.
        logging: When true, the validation accuracy is recorded after every
            training batch.

    Returns:
        ``(model, output_sequences_length)``, or
        ``(model, val_accs, output_sequences_length)`` when ``logging`` is
        true. ``output_sequences_length`` is the length of the vectorized
        training sequences.

    Raises:
        ValueError: If ``architechture`` is not supported, or a pretrained
            embedding is requested without ``embedding_matrix``.
        AssertionError: If ``embedding_matrix`` does not have
            ``embedding_dim`` columns.
    """
    # Validate the configuration before any expensive work. Previously an
    # unknown architecture left `model` unbound and failed much later.
    supported_architectures = ("cnn", "bidirectional_lstm")
    if architechture not in supported_architectures:
        raise ValueError(
            f"Unknown architechture {architechture!r}; expected one of {supported_architectures}")
    if use_pretrained_embedding and embedding_matrix is None:
        raise ValueError("use_pretrained_embedding=True requires an embedding_matrix")
    if embedding_matrix is not None:
        assert embedding_matrix.shape[1] == embedding_dim, "Embedding dimensions mismatch"

    (train_texts, train_labels), (val_texts, val_labels) = data
    num_features = MAX_TOKENS
    num_classes = 5
    x_train = vectorizer(train_texts)
    x_val = vectorizer(val_texts)
    output_sequences_length = x_train.shape[1]
    # Bring the validation sequences to the same length as the training ones.
    x_val = pad_sequences(x_val, maxlen = output_sequences_length, padding = "post", truncating = "post")

    # Arguments shared by both model builders.
    common_kwargs = dict(embedding_dim=embedding_dim,
                         dropout_rate=dropout_rate,
                         input_shape=x_train.shape[1:],
                         num_classes=num_classes,
                         num_features=num_features,
                         use_pretrained_embedding=use_pretrained_embedding,
                         is_embedding_trainable=is_embedding_trainable,
                         embedding_matrix=embedding_matrix)
    if architechture == "cnn":
        model = cnn_model(blocks=blocks,
                          filters=filters,
                          kernel_size=kernel_size,
                          pool_size=pool_size,
                          **common_kwargs)
    else:
        model = bidirectional_lstm_model(lstm_layers=lstm_layers,
                                         units=units,
                                         **common_kwargs)

    loss = 'sparse_categorical_crossentropy'
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

    val_accs = []
    callbacks = []
    if logging:
        def save_val_acc(batch, logs):
            # Runs a full validation pass after EVERY training batch, which
            # is expensive; only enable logging when the curve is needed.
            _, val_acc = model.evaluate(x_val, val_labels, verbose=0)
            val_accs.append(val_acc)
        callbacks.append(LambdaCallback(on_batch_end=save_val_acc))

    # Train and validate model.
    history = model.fit(
            x_train,
            train_labels,
            epochs=epochs,
            validation_data=(x_val, val_labels),
            verbose=2,  # Logs once per epoch.
            batch_size=batch_size,
            callbacks=callbacks)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))
    if logging:
        return model, val_accs, output_sequences_length
    return model, output_sequences_length

In [38]:
# First pair: training inputs (train + validation splits combined).
# Second pair: what train_sequence_model passes to Keras as validation_data.
# NOTE(review): the second pair is ds_test, so the "Validation accuracy"
# printed by train_sequence_model is measured on the test split - confirm
# that no model selection is done on these numbers.
data = [(ds_train_val["Text"], np.array(ds_train_val["Score"])), (ds_test["Text"], np.array(ds_test["Score"]))]

In [39]:
# get_embedding_matrix keeps only the first `embedding_dim` components of each
# pretrained vector.
# NOTE(review): the vectors file name suggests 300-dimensional embeddings;
# keeping 200 of them would discard components - confirm this is intended.
embedding_dim = 200
embedding_matrix = get_embedding_matrix(word_index, keyed_vectors, embedding_dim)

Converted 14336 words (5664 misses)


In [40]:
%%time
model, output_sequences_length = train_sequence_model(data, epochs = 5, architechture = "cnn", use_pretrained_embedding=True, is_embedding_trainable=True, embedding_matrix=embedding_matrix)

Epoch 1/5
400/400 - 70s - loss: 1.0045 - acc: 0.6604 - val_loss: 0.8899 - val_acc: 0.6712 - 70s/epoch - 175ms/step
Epoch 2/5
400/400 - 36s - loss: 0.8197 - acc: 0.7013 - val_loss: 0.8253 - val_acc: 0.6988 - 36s/epoch - 90ms/step
Epoch 3/5
400/400 - 24s - loss: 0.7194 - acc: 0.7349 - val_loss: 0.8234 - val_acc: 0.7044 - 24s/epoch - 60ms/step
Epoch 4/5
400/400 - 16s - loss: 0.6379 - acc: 0.7619 - val_loss: 0.8342 - val_acc: 0.7069 - 16s/epoch - 40ms/step
Epoch 5/5
400/400 - 15s - loss: 0.5748 - acc: 0.7853 - val_loss: 0.9151 - val_acc: 0.7039 - 15s/epoch - 36ms/step
Validation accuracy: 0.7039057016372681, loss: 0.9151288270950317
CPU times: user 2min, sys: 3.7 s, total: 2min 4s
Wall time: 2min 48s


In [None]:
# NOTE(review): `val_accs` is only returned by train_sequence_model when it is
# called with logging=True. The training cell above does not pass logging, so
# this name is undefined on a fresh Restart & Run All.
len(val_accs)

In [None]:
plt.figure(figsize=(20, 10))
plt.plot(val_accs)
plt.xlabel("Gradient descent steps")
plt.ylabel("Validation accuracy")

In [None]:
model.summary()

In [None]:
# Persist the recorded validation accuracies to disk.
# NOTE(review): the file name says "bidirectional_lstm", but the training call
# in this notebook uses architechture="cnn" - confirm the name is still right.
with open("bidirectional_lstm_pretrained_val_accs.pkl", "wb") as f:
    pickle.dump(val_accs, f)

In [None]:
FileLink("bidirectional_lstm_pretrained_val_accs.pkl")

In [41]:
# Rebuild the vectorizer with a fixed output length equal to the training
# sequence length returned by train_sequence_model, so the exported model
# produces sequences of the length the network was built for.
# NOTE: this rebinds the global `vectorizer` that train_sequence_model reads;
# the new layer has no vocabulary until it is adapted in the next cell.
vectorizer = tf.keras.layers.TextVectorization(max_tokens = MAX_TOKENS, output_sequence_length = output_sequences_length)

In [42]:
vectorizer.adapt(ds_train_val["Text"])

In [43]:
# Wrap vectorizer + trained network into one model that accepts raw strings.
string_input = tf.keras.Input(shape = (1, ), dtype = "string")  # one string per example
x = vectorizer(string_input)  # raw text -> integer token ids
preds = model(x)  # trained classifier applied to the token ids
end_to_end_model = tf.keras.Model(string_input, preds)

In [44]:
 end_to_end_model.save('final_model.tf', save_format="tf")

In [45]:
!zip -r final_model.zip /kaggle/working/final_model.tf

  adding: kaggle/working/final_model.tf/ (stored 0%)
  adding: kaggle/working/final_model.tf/keras_metadata.pb (deflated 93%)
  adding: kaggle/working/final_model.tf/variables/ (stored 0%)
  adding: kaggle/working/final_model.tf/variables/variables.data-00000-of-00001 (deflated 6%)
  adding: kaggle/working/final_model.tf/variables/variables.index (deflated 62%)
  adding: kaggle/working/final_model.tf/saved_model.pb (deflated 77%)
  adding: kaggle/working/final_model.tf/fingerprint.pb (stored 0%)
  adding: kaggle/working/final_model.tf/assets/ (stored 0%)


In [46]:
FileLink('final_model.zip')