In [4]:
import matplotlib.pyplot as plt
import tensorflow as tf
import zipfile
import pandas as pd
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [5]:
import matplotlib.pyplot as plt
def plotacc(history):
    """Show training-vs-validation curves: first loss, then accuracy.

    Args:
        history: object whose ``history`` attribute is a dict containing the
            per-epoch lists "loss", "val_loss", "accuracy" and "val_accuracy".

    Two figures are displayed with ``plt.show()``; nothing is returned.
    """
    metrics = history.history
    train_loss, val_loss = metrics["loss"], metrics["val_loss"]
    # Epochs are numbered from 1 on the x-axis.
    epochs = range(1, len(train_loss) + 1)

    def _draw(train_values, val_values, train_label, val_label, title, ylabel):
        # Dots for training, solid line for validation.
        plt.plot(epochs, train_values, "bo", label=train_label)
        plt.plot(epochs, val_values, "b", label=val_label)
        plt.title(title)
        plt.xlabel("Epochs")
        plt.ylabel(ylabel)
        plt.legend()
        plt.show()

    _draw(train_loss, val_loss,
          "Training loss", "Validation loss",
          "Training and validation loss", "Loss")

    # Clear the current figure before drawing the accuracy curves.
    plt.clf()

    _draw(metrics["accuracy"], metrics["val_accuracy"],
          "Training acc", "Validation acc",
          "Training and validation accuracy", "Accuracy")

### Model_0 Benchmark(BOW)

In [None]:
# Build the training and validation datasets from the 'train' directory

batch_size = 128
seed = 1337  # Same seed for both subsets so 'train' & 'validation' do not overlap

# Keyword arguments shared by both calls; only `subset` differs.
_split_kwargs = dict(
    batch_size=batch_size,
    label_mode='int',
    validation_split=0.2,
    seed=seed,
)

train_ds = keras.preprocessing.text_dataset_from_directory(
    "train", subset='training', **_split_kwargs)

val_ds = keras.preprocessing.text_dataset_from_directory(
    "train", subset='validation', **_split_kwargs)

# Training texts with the labels dropped
text_only_train_ds = train_ds.map(lambda text, label: text)

In [None]:
# Create a TextVectorization instance using 2-grams and 'count' mode
# Note: 'text_vectorization' can also be used as a Keras layer
# We will use this during the prediction on test data

# max_length = 50
max_tokens = 20000
text_vectorization = TextVectorization(
    ngrams=2,
    output_mode="count",
    max_tokens=max_tokens,
)

# Fit it on the train dataset
text_vectorization.adapt(text_only_train_ds)

# Map the vocabulary on the 'train' and 'validation' sets
# (labels are passed through unchanged)

count_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))


In [None]:
# Bag-of-words classifier: one hidden Dense layer over the max_tokens-wide
# input vector, dropout, then a 4-unit softmax output.
inputs = keras.Input(shape=(max_tokens,))
x = layers.Dense(256, activation="relu")(inputs)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)

# Sparse loss: the datasets are loaded with integer labels (label_mode='int').
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

In [None]:
# EarlyStopping: stop once val_loss has not improved for 2 epochs.
# ModelCheckpoint: write the model to "bow_2grams_1.keras" only on epochs
# where the monitored metric improves (save_best_only=True).
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2),
    keras.callbacks.ModelCheckpoint("bow_2grams_1.keras",
                                    save_best_only=True)
]

In [None]:
# Train the model and use validation ds for early stopping and model saving

history_0 = model.fit(
    count_train_ds,
    validation_data=count_val_ds,
    epochs=5,
    callbacks=callbacks,
)

# Reload the checkpoint file so `model` holds the saved (best) weights rather
# than whatever the last epoch produced.
model = keras.models.load_model("bow_2grams_1.keras")

# The metric below is computed on the validation split, so it is labelled as
# validation accuracy (it is not a held-out test score).
print(f"Validation acc: {model.evaluate(count_val_ds)[1]:.3f}")

In [None]:
# Loss and accuracy curves for the bag-of-words run
plotacc(history_0)

In [None]:
# Using the trained model to make prediction on unseen (test) data
# Here we use the 'adapted' text_vectorization layer and include it as part of a prediction_model
# so that the combined model can be fed raw strings.

prediction_model = tf.keras.Sequential(
    [text_vectorization, model])

# Compiled so evaluate() can report loss and accuracy; there is no fit() call in this cell.
prediction_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])

# Test it with `val_ds`, which yields raw strings
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Read the test data in the form of a dataframe

df_test_data = pd.read_csv('data_v2/data_test_df.csv')
# NOTE: rebinds `inputs` (previously the keras.Input tensor) to the 'data' column
inputs = df_test_data['data']

In [None]:
# Make sure you use the 'prediction_model' and not the trained 'model' alone
# If you use the 'model' object, you will run into an error, as the data is still in 'text' format and needs vectorization

predicted_scores = prediction_model.predict(inputs)
# Show the scores for the first five rows
predicted_scores[0:5]

### Model_1 Sequence model_1 Date8.12

In [6]:
# Build the training and validation datasets from the Kaggle 'train' directory

batch_size = 128
seed = 1337  # Same seed for both subsets so 'train' & 'validation' do not overlap

# Keyword arguments shared by both calls; only `subset` differs.
_split_kwargs = dict(
    batch_size=batch_size,
    label_mode='int',
    validation_split=0.2,
    seed=seed,
)

train_ds = keras.preprocessing.text_dataset_from_directory(
    "../input/hw2-ycbs-273-intro-to-prac-ml/train",
    subset='training',
    **_split_kwargs)

val_ds = keras.preprocessing.text_dataset_from_directory(
    "../input/hw2-ycbs-273-intro-to-prac-ml/train",
    subset='validation',
    **_split_kwargs)

# Training texts with the labels dropped
text_only_train_ds = train_ds.map(lambda text, label: text)

Found 120000 files belonging to 4 classes.
Using 96000 files for training.
Found 120000 files belonging to 4 classes.
Using 24000 files for validation.


In [None]:
# Integer-sequence vectorizer: output_mode="int" with output_sequence_length=max_length
max_length = 100
max_tokens = 20000
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Fit it on the train dataset
text_vectorization.adapt(text_only_train_ds)

# Map the vocabulary on the 'train' and 'validation' sets
# NOTE: the 'count_' prefix is kept from the bag-of-words section; here the
# vectorizer is configured with output_mode="int".

count_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))

In [None]:
# Print the first five raw examples of one training batch

for text_batch, label_batch in train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every iteration.
    texts = text_batch.numpy()
    labels = label_batch.numpy()
    for idx in range(5):
        print("News: ", texts[idx])
        print("Label:", labels[idx])

In [None]:
# Retrieve one batch (`batch_size` news items and labels) from the dataset and print 1 sample

text_batch, label_batch = next(iter(train_ds))
first_news, first_label = text_batch[0], label_batch[0]
print("News", first_news)
print("Label", first_label)

In [None]:
# Helper that applies 'text_vectorization' to a single example
def count_vectorize_text(text, label):
    """Vectorize one text example and pass its label through unchanged.

    A trailing axis is added to `text` with tf.expand_dims before it is given
    to the module-level `text_vectorization` layer.

    Returns:
        Tuple of (vectorized text, label).
    """
    expanded_text = tf.expand_dims(text, axis=-1)
    vectorized = text_vectorization(expanded_text)
    return vectorized, label

In [None]:
# Printing out vectorized text data using 'text_vectorization' layer
# NOTE: the printed label says 'count', but in this section the vectorizer is
# configured with output_mode="int".
print("'count' vectorized question:",
      count_vectorize_text(first_news, first_label)[0])

In [None]:
# NOTE(review): `layers` is already imported in the first cell, and the model
# cell that follows creates its own Embedding instead of using this one.
from tensorflow.keras import layers
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [None]:
# Sequence classifier: trainable embedding (mask_zero=True) -> bidirectional
# LSTM(32) -> dropout -> 4-unit softmax.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
    input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)

model = keras.Model(inputs, outputs)

# Sparse loss: the datasets are loaded with integer labels (label_mode='int').
model.compile(optimizer="rmsprop",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])

model.summary()

In [None]:
# Checkpoint on improvement only, and stop once val_loss has not improved for 2 epochs.
# NOTE(review): the file name mentions GRU, but the model built above uses an LSTM layer.
callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru_with_masking.keras",
                                    save_best_only=True),
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2),
]

In [None]:
# Train with the validation set driving early stopping and checkpointing.
history1 = model.fit(
    count_train_ds,
    validation_data=count_val_ds,
    epochs=10,
    callbacks=callbacks,
)

# Reload the checkpoint file so `model` holds the saved (best) weights rather
# than whatever the last epoch produced.
model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras")

# The metric below is computed on the validation split, so it is labelled as
# validation accuracy (it is not a held-out test score).
print(f"Validation acc: {model.evaluate(count_val_ds)[1]:.3f}")

In [None]:
# Loss and accuracy curves for the bidirectional-LSTM run
plotacc(history1)

In [None]:
# Chain the adapted vectorizer and the trained model so raw strings can be fed in.
prediction_model = tf.keras.Sequential(
    [text_vectorization, model])

# Compiled so evaluate() can report loss and accuracy; there is no fit() call in this cell.
prediction_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])

# Test it with `val_ds`, which yields raw strings
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Read the test data in the form of a dataframe
df_test_data = pd.read_csv('../input/newsdata/data_test_df.csv')
# NOTE: rebinds `inputs` (previously the keras.Input tensor) to the 'data' column
inputs = df_test_data['data']

In [None]:
# Make sure you use the 'prediction_model' and not the trained 'model' alone
# If you use the 'model' object, you will run into an error, as the data is still in 'text' format and needs vectorization
predicted_scores = prediction_model.predict(inputs)
# Show the scores for the first five rows
predicted_scores[0:5]

### Summary1  
Embedding layer + masking  
Score: 0.13983 -> 0.13738

## Model_2 Transformer architecture  Date 8.13-8.16

In [None]:
# Reload the datasets with a larger batch size for the Transformer section
batch_size = 512
seed = 1337  # Same seed for both subsets so the two splits do not overlap

# Keyword arguments shared by both calls; only `subset` differs.
_split_kwargs = dict(
    batch_size=batch_size,
    label_mode='int',
    validation_split=0.2,
    seed=seed,
)

train_ds = keras.preprocessing.text_dataset_from_directory(
    "../input/hw2-ycbs-273-intro-to-prac-ml/train",
    subset='training',
    **_split_kwargs)

val_ds = keras.preprocessing.text_dataset_from_directory(
    "../input/hw2-ycbs-273-intro-to-prac-ml/train",
    subset='validation',
    **_split_kwargs)

# Training texts with the labels dropped
text_only_train_ds = train_ds.map(lambda text, label: text)

In [None]:
# Earlier settings, kept for reference:
# max_length = 600
# max_tokens = 20000
max_length = 650
max_tokens = 25000
# Integer-sequence vectorizer; standardization and split mode are spelled out explicitly.
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)

In [None]:
# Fit the vocabulary on the training texts, then vectorize both splits
# (labels are passed through unchanged).
text_vectorization.adapt(text_only_train_ds)
count_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))

In [None]:
# NOTE(review): created without mask_zero, so its compute_mask() is expected to
# return None -- confirm before relying on this layer for masking.
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [None]:
# Transformer hyperparameters
# NOTE(review): vocab_size is not referenced in the visible cells; the
# vectorizer and embedding use max_tokens instead.
vocab_size = 20000
embed_dim = 256  # used as key_dim of the attention layer and output width of dense_proj
num_heads = 2
dense_dim = 32  # hidden width of the feed-forward projection

In [None]:
# Feed-forward projection: Dense(dense_dim, relu) followed by Dense(embed_dim).
dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )

In [None]:
#model 2
#-----------------
# max_length = 600
# max_tokens = 20000
#score 0.137
#-----------------
# max_length = 650
# max_tokens = 25000
#score 0.130

# One Transformer-encoder style block (self-attention + feed-forward, each with
# a residual connection and layer normalization) followed by global max pooling.
inputs = keras.Input(shape=(None,), dtype="int64")

# Keep a handle on the embedding layer that is actually part of the model, so
# the padding mask is derived from the layer that has mask_zero=True. Taking
# the mask from a separate Embedding created without mask_zero yields None,
# which silently disables attention masking.
token_embedding = layers.Embedding(
    input_dim=max_tokens, output_dim=embed_dim, mask_zero=True)
embedded = token_embedding(inputs)

# Boolean (batch, seq_len) mask: True where the token id is non-zero.
padding_mask = token_embedding.compute_mask(inputs)
# MultiHeadAttention expects a (batch, target_len, source_len) mask; adding an
# axis of size 1 lets the key mask broadcast over every query position.
mask = tf.expand_dims(padding_mask, axis=1)

attention_output = layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=embed_dim,
)(embedded, embedded, attention_mask=mask)

# Residual connection + normalization around the attention sub-layer.
proj_input = layers.LayerNormalization()(embedded + attention_output)
proj_output = dense_proj(proj_input)

# Residual connection + normalization around the feed-forward sub-layer.
x = layers.LayerNormalization()(proj_input + proj_output)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)

In [None]:
# Sparse loss: the datasets are loaded with integer labels (label_mode='int').
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

In [None]:
# Stop once val_loss has not improved for 2 epochs; checkpoint on improvement only.
# NOTE(review): the checkpoint file name is the same one used in the
# bag-of-words section, so this run overwrites that file.
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2),
    keras.callbacks.ModelCheckpoint("bow_2grams_1.keras",
                                    save_best_only=True)
]

In [None]:
# Train with the validation set driving early stopping and checkpointing.
history2 = model.fit(
    count_train_ds,
    validation_data=count_val_ds,
    epochs=10,
    callbacks=callbacks,
)

# Reload the checkpoint file so `model` holds the saved (best) weights rather
# than whatever the last epoch produced.
model = keras.models.load_model("bow_2grams_1.keras")

# The metric below is computed on the validation split, so it is labelled as
# validation accuracy (it is not a held-out test score).
print(f"Validation acc: {model.evaluate(count_val_ds)[1]:.3f}")

In [None]:
# Loss and accuracy curves for the Transformer run
plotacc(history2)

In [None]:
# Chain the adapted vectorizer and the trained model so raw strings can be fed in.
prediction_model = tf.keras.Sequential(
    [text_vectorization, model])

# Compiled so evaluate() can report loss and accuracy; there is no fit() call in this cell.
prediction_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])
# Evaluate on `val_ds`, which yields raw strings
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Read the test data in the form of a dataframe
df_test_data = pd.read_csv('../input/hw2-ycbs-273-intro-to-prac-ml/data_test_df.csv')
# NOTE: rebinds `inputs` (previously the keras.Input tensor) to the 'data' column
inputs = df_test_data['data']

In [None]:
# Use 'prediction_model' (vectorizer + model) because `inputs` holds raw text
predicted_scores = prediction_model.predict(inputs)
# Show the scores for the first five rows
predicted_scores[0:5]

## Summary2
Combine a Transformer encoder block with a pooling layer.
##### model 2  
 max_length = 600  
 max_tokens = 20000  
score 0.137  

 max_length = 650  
 max_tokens = 25000  
score 0.137-->0.130

## Model 4:Sequence model_2(pretrained word embeddings)   Date 8.17

In [None]:
# Integer-sequence vectorizer for the GloVe model; standardization and split
# mode are spelled out explicitly.
max_length = 650
max_tokens = 25000
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)

In [None]:
# Relies on text_only_train_ds / train_ds / val_ds created in an earlier cell.
# Fit the vocabulary on the training texts, then vectorize both splits.
text_vectorization.adapt(text_only_train_ds)
count_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))

In [None]:
# NOTE(review): `embedding_layer` is reassigned to a GloVe-initialized layer in a
# later cell of this section, so this trainable 256-d layer appears to be unused.
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [None]:
# Take one batch from the raw training dataset and print its first example
text_batch, label_batch = next(iter(train_ds))
first_news, first_label = text_batch[0], label_batch[0]
print("News", first_news)
print("Label", first_label)

In [None]:
def count_vectorize_text(text, label):
    """Vectorize one text example and pass its label through unchanged.

    A trailing axis is added to `text` with tf.expand_dims before it is given
    to the module-level `text_vectorization` layer.

    Returns:
        Tuple of (vectorized text, label).
    """
    expanded_text = tf.expand_dims(text, axis=-1)
    vectorized = text_vectorization(expanded_text)
    return vectorized, label

In [None]:
# Show the vectorized form of the sample example.
# NOTE: the printed label says 'count', but in this section the vectorizer is
# configured with output_mode="int".
print("'count' vectorized question:",
      count_vectorize_text(first_news, first_label)[0])

In [None]:
#use glove300
import numpy as np
path_to_glove_file = "../input/glove6b/glove.6B.300d.txt"

# Map each word to its embedding vector (float32 NumPy array).
embeddings_index = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split off the word once and
        # parse the remainder as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
embedding_dim = 300  # width of each row; the vectors are read from glove.6B.300d.txt

# Token -> integer index, in the order the vectorizer reports its vocabulary.
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Row i holds the pretrained vector for token i; rows for tokens without a
# pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i >= max_tokens:
        # Outside the matrix: skip, so a vector left over from a previous
        # iteration is never written to an out-of-range row.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding initialized from embedding_matrix and frozen (trainable=False);
# mask_zero=True so index 0 is treated as padding.
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [None]:
#model 4
#score 0.126
#use LSTM and glove
# Frozen GloVe embedding -> bidirectional LSTM(32) -> dropout -> 4-unit softmax.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)
# Sparse loss: the datasets are loaded with integer labels (label_mode='int').
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

In [None]:
# Stop once val_loss has not improved for 2 epochs; checkpoint on improvement only.
# NOTE(review): the checkpoint file name is the same one used in the
# bag-of-words and Transformer sections, so this run overwrites that file.
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2),
    keras.callbacks.ModelCheckpoint("bow_2grams_1.keras",
                                    save_best_only=True)
]

In [None]:
# Train with the validation set driving early stopping and checkpointing.
history4 = model.fit(
    count_train_ds,
    validation_data=count_val_ds,
    epochs=20,
    callbacks=callbacks,
)

# Reload the checkpoint file so `model` holds the saved (best) weights rather
# than whatever the last epoch produced.
model = keras.models.load_model("bow_2grams_1.keras")

# The metric below is computed on the validation split, so it is labelled as
# validation accuracy (it is not a held-out test score).
print(f"Validation acc: {model.evaluate(count_val_ds)[1]:.3f}")

In [None]:
# Loss and accuracy curves for the GloVe + LSTM(32) run
plotacc(history4)

In [None]:
# Chain the adapted vectorizer and the trained model so raw strings can be fed in.
prediction_model = tf.keras.Sequential(
    [text_vectorization, model])
# Compiled so evaluate() can report loss and accuracy; there is no fit() call in this cell.
prediction_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])
# Evaluate on `val_ds`, which yields raw strings
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Read the test data in the form of a dataframe
df_test_data = pd.read_csv('../input/hw2-ycbs-273-intro-to-prac-ml/data_test_df.csv')
# NOTE: rebinds `inputs` (previously the keras.Input tensor) to the 'data' column
inputs = df_test_data['data']

In [None]:
# Use 'prediction_model' (vectorizer + model) because `inputs` holds raw text
predicted_scores = prediction_model.predict(inputs)
# Show the scores for the first five rows
predicted_scores[0:5]

## Summary 3  
##### model 4    
use LSTM and glove300  
score 0.130-->0.126  

## Model 4 update Date 8.18

In [None]:
# Earlier settings, kept for reference:
# max_length = 650
# max_tokens = 25000
max_length = 700
max_tokens = 30000
# Integer-sequence vectorizer; standardization and split mode are spelled out explicitly.
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)

In [None]:
# Relies on text_only_train_ds / train_ds / val_ds created in an earlier cell.
# Fit the vocabulary on the training texts, then vectorize both splits.
text_vectorization.adapt(text_only_train_ds)
count_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
count_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))

In [None]:
# NOTE(review): `embedding_layer` is reassigned to a GloVe-initialized layer in a
# later cell of this section, so this trainable 256-d layer appears to be unused.
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [None]:
def count_vectorize_text(text, label):
    """Vectorize one text example and pass its label through unchanged.

    A trailing axis is added to `text` with tf.expand_dims before it is given
    to the module-level `text_vectorization` layer.

    Returns:
        Tuple of (vectorized text, label).
    """
    expanded_text = tf.expand_dims(text, axis=-1)
    vectorized = text_vectorization(expanded_text)
    return vectorized, label

In [None]:
# Show the vectorized form of the sample example.
# NOTE: first_news / first_label are not defined in this section; they come
# from an earlier cell.
print("'count' vectorized question:",
      count_vectorize_text(first_news, first_label)[0])

In [None]:
import numpy as np
path_to_glove_file = "../input/glove6b/glove.6B.300d.txt"

# Map each word to its embedding vector (float32 NumPy array).
embeddings_index = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default text encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ..."; split off the word once and
        # parse the remainder as space-separated floats.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

In [None]:
embedding_dim = 300  # width of each row; the vectors are read from glove.6B.300d.txt

# Token -> integer index, in the order the vectorizer reports its vocabulary.
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

# Row i holds the pretrained vector for token i; rows for tokens without a
# pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i >= max_tokens:
        # Outside the matrix: skip, so a vector left over from a previous
        # iteration is never written to an out-of-range row.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding initialized from embedding_matrix and frozen (trainable=False);
# mask_zero=True so index 0 is treated as padding.
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [None]:
#model 4 last update 2021/08/18

#1.score 0.126
#use LSTM and glove
#---------------
#2.score 0.120
#take LSTM node from 32 to 64
#---------------
#3.score 0.119
#enlarge the max tokens and max length
# Frozen GloVe embedding -> bidirectional LSTM(64) -> dropout -> 4-unit softmax.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(64))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(4, activation="softmax")(x)
model = keras.Model(inputs, outputs)
# Sparse loss: the datasets are loaded with integer labels (label_mode='int').
model.compile(optimizer="rmsprop",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

In [None]:
# Build fresh callbacks for this run instead of reusing the list left over from
# the previous section. A ModelCheckpoint created with save_best_only=True keeps
# its best-so-far value between fit() calls, so a reused instance would only
# save this model if it beat the previous model's best val_loss; otherwise the
# load below would return the previous run's model.
checkpoint_path = "glove_bilstm64.keras"
callbacks = [
    keras.callbacks.EarlyStopping(monitor="val_loss",
                                  patience=2),
    keras.callbacks.ModelCheckpoint(checkpoint_path,
                                    save_best_only=True)
]

history5 = model.fit(
    count_train_ds,
    validation_data=count_val_ds,
    epochs=20,
    callbacks=callbacks,
)

# Reload the checkpoint written during this run so `model` holds its best weights.
model = keras.models.load_model(checkpoint_path)

# The metric below is computed on the validation split, so it is labelled as
# validation accuracy (it is not a held-out test score).
print(f"Validation acc: {model.evaluate(count_val_ds)[1]:.3f}")

In [None]:
# Loss and accuracy curves for the GloVe + LSTM(64) run
plotacc(history5)

In [None]:
# Chain the adapted vectorizer and the trained model so raw strings can be fed in.
prediction_model = tf.keras.Sequential(
    [text_vectorization, model])
# Compiled so evaluate() can report loss and accuracy; there is no fit() call in this cell.
prediction_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'])
# Evaluate on `val_ds`, which yields raw strings
loss, accuracy = prediction_model.evaluate(val_ds)
print("Accuracy: {:2.2%}".format(accuracy))

In [None]:
# Read the test data in the form of a dataframe
df_test_data = pd.read_csv('../input/hw2-ycbs-273-intro-to-prac-ml/data_test_df.csv')
# NOTE: rebinds `inputs` (previously the keras.Input tensor) to the 'data' column
inputs = df_test_data['data']

In [None]:
# Use 'prediction_model' (vectorizer + model) because `inputs` holds raw text
predicted_scores = prediction_model.predict(inputs)
# Show the scores for the first five rows
predicted_scores[0:5]

### Summary 4  
##### model 4 update    
Increase the LSTM units from 32 to 64  
score 0.126-->0.120 

Enlarge (max_length, max_tokens) from (650, 25000) to (700, 30000)  
score 0.120-->0.119

## Save the prediction

In [None]:
# Build the dataframe for the Kaggle submission: columns solution_1 .. solution_4
solution_columns = ['solution_' + str(k) for k in range(1, 5)]
df_predictions = pd.DataFrame(predicted_scores, columns=solution_columns)
# Name the index 'Id' by assignment rather than an in-place rename call
df_predictions.index.name = 'Id'
df_predictions.head(30)

In [None]:
# Write the submission file; the index (named 'Id' above) is written as the first column
df_predictions.to_csv('df_predictions.csv')