<a href="https://colab.research.google.com/github/nirajlondhe8/aiml/blob/main/exercise/discoverPatternsInTokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Sample text data
sentences = [
    "This is the first sentence.",
    "This is the second sentence.",
    "Third sentence. See?",
]

# Create a Tokenizer
tokenizer = Tokenizer(num_words=1000)  # Limit vocabulary size
tokenizer.fit_on_texts(sentences)

# Convert sentences to sequences of integers
sequences = tokenizer.texts_to_sequences(sentences)
print("Sequences:", sequences)

# Pad sequences to have the same length (optional if sentences are similar in length)
padded_sequences = pad_sequences(sequences, padding='post')
print("Padded Sequences:", padded_sequences)

# Create a simple neural network model (consider a more complex model for real-world tasks)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 128),  # 1000 words, 128-dimensional embeddings
    # Remove the extra dimension before GlobalAveragePooling1D
    tf.keras.layers.Reshape((-1, 128)),  # Reshape to (batch_size, embedding_dimension)
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(10, activation='relu'),  # Adjust units based on task complexity
    tf.keras.layers.Dense(len(sentences), activation='sigmoid')  # Output layer with one unit per sentence
])

# Compile the model for multi-label classification (assuming labels are binary)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Labels (replace with your actual labels, one for each sentence)
labels = np.array([[1, 0, 1]])  # Example labels (replace with actual labels)

# Reshape the input data for the model (ensure each sample has its own dimension)
reshaped_padded_sequences = np.expand_dims(padded_sequences, axis=0)  # Reshape for 1 sample
print("Reshaped Padded Sequences:", reshaped_padded_sequences.shape)

# Train the model (consider using a validation set for real-world tasks)
model.fit(reshaped_padded_sequences, labels, epochs=10, batch_size=1)

# Evaluate the model (on the same data for this example)
loss, accuracy = model.evaluate(reshaped_padded_sequences, labels)
print("Loss:", loss)
print("Accuracy:", accuracy)

Sequences: [[2, 3, 4, 5, 1], [2, 3, 4, 6, 1], [7, 1, 8]]
Padded Sequences: [[2 3 4 5 1]
 [2 3 4 6 1]
 [7 1 8 0 0]]
Reshaped Padded Sequences: (1, 3, 5)
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 1.0000 - loss: 0.6939
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.6885
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.6839
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.6798
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 1.0000 - loss: 0.6757
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.6714
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 1.0000 - loss: 0.6666
Epoch 8/10
[1m1/1[0m [32m━━━