In [1]:
import keras
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub
from tensorflow.keras.layers import TextVectorization, Embedding
import io

**Typical NLP Architecture**

`Text → Tokenization → Embedding → RNN/LSTM/Transformer → Output`

In [2]:
# Small in-memory corpus of seven English sentences; it is the data the
# TextVectorization layer below is adapted on.
train_texts = [
    # statements about ML / programming
    "I love machine learning and artificial intelligence.",
    "TensorFlow makes building neural networks easy.",
    # everyday opinion
    "The weather is terrible today, I hate the rain.",
    # more technical statements
    "Python is a great programming language for data science.",
    "Deep learning requires a lot of computing power.",
    # reviews
    "This movie was fantastic, the plot was amazing.",
    "I did not enjoy the food at that restaurant.",
]

**TextVectorization Layer**

In [None]:
# Layer that turns raw strings into fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    output_mode='int',           # emit integer token ids
    max_tokens=10000,            # upper bound on the vocabulary size
    output_sequence_length=250,  # every sequence is padded/truncated to 250 ids
)

# Learn the vocabulary from the training sentences.
text_vectorizer.adapt(train_texts)

# Vectorize the same sentences and report how many tokens were learned.
text_vectorized = text_vectorizer(train_texts)
print(f"Vocabulary size: {text_vectorizer.vocabulary_size()}")

Vocabulary size: 49


**Creating Embeddings**

In [13]:
# Trainable lookup table mapping each integer token id to a dense vector.
# `input_length` is deliberately not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning), and none of the layers stacked on top
# of this embedding in the notebook (pooling, recurrent, Conv1D + global
# pooling) need a static sequence length.
embedding = Embedding(
    input_dim=10000,  # number of rows; matches the vectorizer's max_tokens (10000)
    output_dim=128,   # size of each token vector
)



**NLP Models with TensorFlow**

- Model 0: Baseline (Dense Layers Only):

In [15]:
# Model 0 (baseline): average the token embeddings over the sequence and
# classify with one hidden Dense layer.
model_0 = keras.Sequential([
    text_vectorizer,                         # raw strings -> integer token ids
    embedding,                               # token ids -> dense vectors (weights exported later for visualization)
    layers.GlobalAveragePooling1D(),         # collapse the sequence axis by averaging
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),   # single probability for binary classification
])

# Backward-compatible alias: the model was originally bound to the misspelled
# name `mode_0`; keep it so any cell still using that name keeps working.
mode_0 = model_0

model_0.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


- Model 1: Deep Dense Model:

In [16]:
# Model 1: deeper dense classifier with dropout after each hidden layer.
# It gets its own Embedding (same dimensions as the module-level `embedding`)
# instead of reusing that instance: a layer object shared between models also
# shares its weights, so training one model would silently change the others.
model_1 = keras.Sequential([
    text_vectorizer,                                                            # raw strings -> token ids
    Embedding(input_dim=embedding.input_dim, output_dim=embedding.output_dim),  # independent weights
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])

- Model 2: LSTM (Long Short-Term Memory):

In [17]:
# Model 2: two stacked LSTM layers.
# It gets its own Embedding (same dimensions as the module-level `embedding`)
# instead of reusing that instance: a layer object shared between models also
# shares its weights, so training one model would silently change the others.
model_2 = keras.Sequential([
    text_vectorizer,                                                            # raw strings -> token ids
    Embedding(input_dim=embedding.input_dim, output_dim=embedding.output_dim),  # independent weights
    layers.LSTM(64, return_sequences=True),  # emit the full sequence so the next LSTM can consume it
    layers.LSTM(32),                         # emit only the final state
    layers.Dense(1, activation='sigmoid'),
])

- Model 3: GRU (Gated Recurrent Unit):

In [None]:
# Model 3: two stacked GRU layers.
# Bound to `model_3` (matching the "Model 3" heading); the original assigned
# it to `model_2`, which overwrote the LSTM model defined in the previous cell.
# It gets its own Embedding (same dimensions as the module-level `embedding`)
# instead of reusing that instance: a layer object shared between models also
# shares its weights, so training one model would silently change the others.
model_3 = keras.Sequential([
    text_vectorizer,                                                            # raw strings -> token ids
    Embedding(input_dim=embedding.input_dim, output_dim=embedding.output_dim),  # independent weights
    layers.GRU(64, return_sequences=True),  # emit the full sequence so the next GRU can consume it
    layers.GRU(32),                         # emit only the final state
    layers.Dense(1, activation='sigmoid'),
])

- Model 4: Bidirectional RNN:

In [18]:
# Model 4: a single LSTM wrapped in Bidirectional, so the sequence is read in
# both directions.
# It gets its own Embedding (same dimensions as the module-level `embedding`)
# instead of reusing that instance: a layer object shared between models also
# shares its weights, so training one model would silently change the others.
model_4 = keras.Sequential([
    text_vectorizer,                                                            # raw strings -> token ids
    Embedding(input_dim=embedding.input_dim, output_dim=embedding.output_dim),  # independent weights
    layers.Bidirectional(layers.LSTM(64)),
    layers.Dense(1, activation='sigmoid'),
])

- Model 5: Conv1D for Text:

In [19]:
# Model 5: 1-D convolution over the token embeddings followed by global max
# pooling and a dense head.
# It gets its own Embedding (same dimensions as the module-level `embedding`)
# instead of reusing that instance: a layer object shared between models also
# shares its weights, so training one model would silently change the others.
model_5 = keras.Sequential([
    text_vectorizer,                                                            # raw strings -> token ids
    Embedding(input_dim=embedding.input_dim, output_dim=embedding.output_dim),  # independent weights
    layers.Conv1D(filters=64, kernel_size=5, activation='relu'),  # 64 filters, each spanning 5 consecutive tokens
    layers.GlobalMaxPooling1D(),                                  # keep the strongest response of each filter
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

**Transfer Learning for NLP with TensorFlow Hub**

- Using Pre-trained Embeddings:

In [22]:
# Universal Sentence Encoder v4 loaded from TensorFlow Hub. It takes string
# inputs and is frozen (trainable=False), so only the head below has trainable weights.
embedding_layer = hub.KerasLayer(
    "https://tfhub.dev/google/universal-sentence-encoder/4",
    trainable=False,
    dtype=tf.string,
)

# Classifier head stacked on the frozen sentence encoder, built layer by layer.
model_6 = keras.Sequential()
model_6.add(keras.Input(shape=[], dtype=tf.string))  # one scalar string per example
# NOTE(review): the hub layer is called through a Lambda rather than added
# directly — presumably a Keras-version compatibility workaround; confirm.
# The Lambda declares an output of 512 features per example.
model_6.add(layers.Lambda(lambda x: embedding_layer(x), output_shape=(512,)))
model_6.add(layers.Dense(64, activation='relu'))
model_6.add(layers.Dropout(0.5))
model_6.add(layers.Dense(1, activation='sigmoid'))

- Visualizing Word Embeddings:

In [28]:
# Get embedding weights
embedding_weights = embedding.get_weights()[0]

out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

vocab = text_vectorizer.get_vocabulary()
for index, word in enumerate(vocab):
    vec = embedding_weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

**Using tf.data API for Efficient Text Processing**

In [None]:
def create_text_dataset(texts, labels, batch_size=32, shuffle=True,
                        buffer_size=10000, seed=None):
    """Build a batched, prefetching ``tf.data.Dataset`` of ``(text, label)`` pairs.

    Args:
        texts: Sequence of input examples; sliced along its first axis.
        labels: Sequence of targets, aligned element-wise with ``texts``.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples before batching.
        buffer_size: Size of the shuffle buffer (only used when ``shuffle`` is
            true). The default keeps the previously hard-coded value; set it
            to at least the dataset size for a full shuffle.
        seed: Optional random seed forwarded to ``Dataset.shuffle`` so the
            shuffle order can be made reproducible.

    Returns:
        The dataset produced by slicing, optionally shuffling, batching and
        prefetching the inputs, in that order.
    """
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))
    if shuffle:
        # Shuffle before batching so individual examples, not whole batches, are mixed.
        dataset = dataset.shuffle(buffer_size=buffer_size, seed=seed)
    dataset = dataset.batch(batch_size)
    # Prefetch with an automatically tuned buffer.
    return dataset.prefetch(tf.data.AUTOTUNE)

# Input pipelines: the training set uses the default shuffle=True, the
# validation set keeps its original order.
# NOTE(review): `train_labels`, `val_texts` and `val_labels` are not defined in
# the visible cells — they must exist before this cell runs; TODO confirm.
train_dataset = create_text_dataset(texts=train_texts, labels=train_labels)
val_dataset = create_text_dataset(texts=val_texts, labels=val_labels, shuffle=False)

**Evaluating NLP Models**

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# NOTE(review): `model`, `test_texts` and `test_labels` are not defined in the
# visible cells — TODO confirm where they come from before running this cell.
y_pred_probs = model.predict(test_texts)  # each row is indexed as pred_prob[0] below
y_pred = tf.round(y_pred_probs)           # hard labels; tf.round rounds half to even, so exactly 0.5 -> 0

cm = confusion_matrix(test_labels, y_pred)
print(cm)  # previously computed but never displayed
print(classification_report(test_labels, y_pred))

# Collect the misclassified examples. "Predicted positive" is `prob > 0.5`,
# the same rule tf.round applies above, so this list always agrees with the
# report (the original skipped samples whose probability was exactly 0.5).
wrong_predictions = [
    {'text': text, 'true': true_label, 'pred': pred_prob[0]}
    for text, true_label, pred_prob in zip(test_texts, test_labels, y_pred_probs)
    if (pred_prob[0] > 0.5) != (true_label == 1)
]

# Most wrong first: largest gap between the true label and the predicted probability.
wrong_predictions.sort(key=lambda item: abs(item['true'] - item['pred']), reverse=True)