In [1]:
!pip install wget --quiet

  Preparing metadata (setup.py) ... done
  Building wheel for wget (setup.py) ... done


In [2]:
!pip install --upgrade tensorflow==2.12.0 tensorflow_hub==0.13.0 --quiet

In [3]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py --quiet

In [4]:
!wget "https://storage.googleapis.com/ztm_tf_course/nlp_getting_started.zip" --quiet

In [5]:
# Import the archive helper from helper_functions.py (downloaded by an earlier cell)
from helper_functions import unzip_data

# Unpack the downloaded dataset archive.
# NOTE(review): presumably this extracts train.csv / test.csv into the working
# directory, where PreprocessData reads them — confirm against helper_functions.py
unzip_data("nlp_getting_started.zip")

In [6]:
# Importing TensorFlow and Keras libraries
import tensorflow as tf
from keras import Sequential
from keras.layers import Input, GlobalAveragePooling1D, Dense, LSTM, Conv1D, GlobalMaxPool1D
from keras.layers import TextVectorization, Embedding
from keras.optimizers import Adam
import tensorflow_hub as hub

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from pathlib import Path

In [7]:
class PreprocessData:
    """Load the train/test CSV files and keep shared preprocessing settings.

    The training rows are shuffled once, at construction time, with a fixed
    seed so that the row order is the same on every run.

    Args:
        train_path: Path (or any object ``pd.read_csv`` accepts) of the training
            CSV. Defaults to ``'train.csv'`` in the current working directory.
        test_path: Path of the test CSV. Defaults to ``'test.csv'`` in the
            current working directory.
        random_state: Seed used when shuffling the training rows. Defaults to 42.

    Attributes:
        train_df: Training rows exactly as read from ``train_path``.
        test_df: Test rows exactly as read from ``test_path``.
        train_df_shuffled: All rows of ``train_df`` in shuffled order.
        train_sentences, train_labels, val_sentences, val_labels: Initialised
            to ``None``; nothing in this class fills them in.
        max_vocab_length: Fixed at 10000.
        max_output_sequence_length: Fixed at 15.
    """

    def __init__(self, train_path='train.csv', test_path='test.csv', random_state=42):
        self.train_df = pd.read_csv(train_path)
        self.test_df = pd.read_csv(test_path)

        # frac=1 draws every row exactly once, i.e. a full shuffle of the frame
        self.train_df_shuffled = self.train_df.sample(frac=1, random_state=random_state)

        # Placeholders for a train/validation split; not populated by this class
        self.train_sentences = None
        self.train_labels = None
        self.val_sentences = None
        self.val_labels = None

        # NOTE(review): presumably limits for a text-vectorization layer
        # (vocabulary size / tokens per sequence) — not used in the visible code
        self.max_vocab_length = 10000
        self.max_output_sequence_length = 15

    def get_train_dataframe(self):
        """Return the shuffled training DataFrame."""
        return self.train_df_shuffled

    def get_test_dataframe(self):
        """Return the test DataFrame as it was read from disk."""
        return self.test_df


In [8]:
# Load both CSV files through the preprocessing helper
preprocess_data = PreprocessData()

# Shuffled training frame and the test frame
train_df, test_df = (
    preprocess_data.get_train_dataframe(),
    preprocess_data.get_test_dataframe(),
)

# Extract the tweet text and the target labels as NumPy arrays
train_sentences, train_labels = (train_df[column].to_numpy() for column in ("text", "target"))
test_sentences = test_df["text"].to_numpy()

In [9]:
# Wrap the pretrained Universal Sentence Encoder from TF Hub as a Keras layer
sentence_encoder_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    name="USE",
    trainable=False,  # pretrained weights stay fixed: the layer is a pure feature extractor
    dtype=tf.string,  # the layer consumes raw strings
    input_shape=[],  # each input example is a single (scalar) string
)

In [10]:
# Seed Python, NumPy and TensorFlow before any weights are created so that the
# Dense initialisation and the shuffling done during fit() are repeatable across
# "Restart & Run All" (hardware-level nondeterminism may still remain).
# 42 matches the seed used to shuffle the training DataFrame.
tf.keras.utils.set_random_seed(42)

# Binary classifier built with the Sequential API:
# frozen USE sentence embeddings -> 64-unit ReLU layer -> single sigmoid output
ex_model_6 = Sequential([
    sentence_encoder_layer,
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid")
], name="model_6_USE")

In [11]:
# Configure training: default Adam optimizer, binary cross-entropy loss, accuracy metric
ex_model_6.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# ex_model_6.summary()  # uncomment to print the layer summary

In [12]:
# Fit the classifier layers on top of the pretrained embeddings for five epochs
ex_model_6_history = ex_model_6.fit(
    x=train_sentences,
    y=train_labels,
    epochs=5,
    callbacks=[],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# Predicted probabilities for the test sentences; with the Dense(1) output layer
# the result has one column per row, i.e. shape (n_samples, 1)
ex_model_6_pred_probs = ex_model_6.predict(test_sentences)
# print(ex_model_6_pred_probs[:10])

# Round the probabilities to 0./1. labels and drop only the trailing unit axis,
# so the result stays 1-D even when a single sentence is predicted
# (a bare tf.squeeze would collapse that case to a scalar)
ex_model_6_preds = tf.squeeze(tf.round(ex_model_6_pred_probs), axis=-1)
# print(ex_model_6_preds[:10])



In [14]:
# Show the labels as floats, then again after converting them to int32
print(ex_model_6_preds)
ex_model_6_preds = tf.cast(ex_model_6_preds, dtype=tf.int32)
print(ex_model_6_preds)

# Pair every test id with its predicted label
pred_df = pd.DataFrame({
    "id": test_df["id"].to_numpy(),
    "target": ex_model_6_preds.numpy(),
})

# Preview the first rows of the result
print(pred_df.head())

tf.Tensor([1. 1. 1. ... 1. 1. 1.], shape=(3263,), dtype=float32)
tf.Tensor([1 1 1 ... 1 1 1], shape=(3263,), dtype=int32)
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1


In [15]:
    # Write the predictions to a CSV file, creating the target folder first when it is missing.
    # NOTE(review): this whole cell is indented one level; IPython appears to tolerate that,
    # but a plain .py export would likely fail with an IndentationError — consider dedenting.
    submission_path = Path("output/nlp_with_tf/nlp_submission.csv")
    submission_path.parent.mkdir(exist_ok=True, parents=True)
    pred_df.to_csv(path_or_buf=submission_path, index=False)