In [None]:
# Import necessary libraries
import tensorflow as tf
import numpy as np
import os
import glob

# Step 1: Load Shakespeare dataset
# get_file downloads the corpus (or reuses a cached copy) and returns its local path.
path_to_file = tf.keras.utils.get_file("shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")
# Read through a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(path_to_file, 'rb') as shakespeare_file:
    text = shakespeare_file.read().decode(encoding='utf-8')

# Step 2: Preprocess the text
# The vocabulary is the sorted set of distinct characters in the corpus;
# char2idx / idx2char translate between characters and integer ids.
vocab = sorted(set(text))
char2idx = {
    character: position
    for position, character in enumerate(vocab)
}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[character] for character in text])

# Step 3: Create training sequences
# Chunks hold seq_length + 1 ids so that each one can later be split into an
# input and a target of seq_length ids each.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(
    seq_length + 1,
    drop_remainder=True,
)

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last; the target is every element
    except the first, so target[i] is the element that follows input[i].
    """
    input_part = chunk[:-1]
    target_part = chunk[1:]
    return input_part, target_part

# Step 4: Create training batches
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Turn each chunk into an (input, target) pair, shuffle the pairs, then group
# them into fixed-size batches (the incomplete final batch is dropped).
dataset = sequences.map(split_input_target)
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Step 5: Build the model
# Hyperparameters; the output layer has one unit per vocabulary character.
embedding_dim = 256
rnn_units = 1024
vocab_size = len(vocab)

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        vocab_size: Number of distinct characters; used as the embedding input
            dimension and as the number of output units.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Static batch size for the stateful LSTM. It was previously
            accepted but ignored, so the batch size was fixed only by whatever
            shape happened to build the model first. Pass ``None`` to keep that
            deferred behaviour.

    Returns:
        An uncompiled ``tf.keras.Sequential`` whose last layer has no
        activation, i.e. it outputs one unnormalised score per character.
    """
    layers = [
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.LSTM(rnn_units, return_sequences=True,
                             stateful=True, recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ]
    if batch_size is not None:
        # Declare (batch_size, any sequence length) up front so the stateful
        # LSTM's state is allocated for exactly this batch size.
        layers.insert(0, tf.keras.Input(batch_shape=(batch_size, None)))
    return tf.keras.Sequential(layers)

# Training model: built for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

# Step 6: Compile the model
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is required because the model's final Dense layer
    applies no activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
    return crossentropy

model.compile(loss=loss, optimizer='adam')

# Step 7: Train the model (checkpoints optional)
EPOCHS = 5  # You can increase this for better results

# One weights file per epoch; "{epoch}" is filled in by the checkpoint callback.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True
)

print("Starting model training...")
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)
print("Model training finished.")

# Diagnose: report what is in the checkpoint directory and pick the most
# recently modified .weights.h5 file (None when nothing usable is found).
print("\nChecking checkpoint directory:")
latest_checkpoint = None
if not os.path.exists(checkpoint_dir):
    print(f"Checkpoint directory does NOT exist: {checkpoint_dir}")
else:
    print(f"Checkpoint directory exists: {checkpoint_dir}")
    print("Contents of checkpoint directory:")
    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "*.weights.h5"))
    if checkpoint_files:
        print("Checkpoint files found:")
        for f in checkpoint_files:
            print(f)

        # "Latest" means newest modification time, not highest epoch number.
        latest_checkpoint = max(checkpoint_files, key=os.path.getmtime)
        print(f"\nManually identified latest checkpoint: {latest_checkpoint}")
    else:
        print("No .weights.h5 files found in checkpoint directory.")
        print("\nCould not identify latest checkpoint as no .weights.h5 files were found.")


# Step 8: Rebuild model for generation (batch_size = 1)
# Generation feeds a single sequence at a time, so a fresh model is created
# with the same architecture and a batch dimension of 1.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
)
model.build(tf.TensorShape([1, None]))  # Build model before loading weights

# Restore the trained weights from the checkpoint found above, if any.
if not latest_checkpoint:
    print("Could not load weights. No checkpoint found.")
else:
    print(f"Loading weights from: {latest_checkpoint}")
    model.load_weights(latest_checkpoint)
    print("Weights loaded successfully.")


# Step 9: Generate text function
def generate_text(model, start_string, temperature=1.0, num_generate=500):
    """Sample text from the model one character at a time.

    Args:
        model: Model built by ``build_model`` with a batch size of 1.
        start_string: Non-empty seed text; every character must be a key of
            ``char2idx``.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.
        num_generate: Number of characters to generate (default 500, the
            previously hard-coded value).

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    # Fail early with a clear message: an empty seed gives the model zero
    # timesteps, and a non-positive temperature breaks the logit scaling.
    if not start_string:
        raise ValueError("start_string must contain at least one character.")
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}.")

    # Encode the seed and add a leading batch dimension of 1.
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)
    text_generated = []

    # Clear the recurrent state left over from earlier calls. Index 1 is the
    # LSTM in the layer stack that build_model creates.
    model.layers[1].reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch dimension, leaving one row of logits per timestep.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample every row but keep only the draw for the last timestep.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # The LSTM is stateful, so only the newly sampled id is fed back in.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

# Step 10: Generate sample text
# Sampling is only attempted when trained weights were found above.
print("\nGenerated Text:\n")
if not latest_checkpoint:
    print("Cannot generate text without loaded weights.")
else:
    sample_text = generate_text(model, start_string="To be, or not to be", temperature=0.8)
    print(sample_text)

Starting model training...
Epoch 1/5
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 71ms/step - loss: 2.8418
Epoch 2/5
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - loss: 1.8275
Epoch 3/5
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 72ms/step - loss: 1.5823
Epoch 4/5
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - loss: 1.4642
Epoch 5/5
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - loss: 1.3964
Model training finished.

Checking checkpoint directory:
Checkpoint directory exists: ./training_checkpoints
Contents of checkpoint directory:
Checkpoint files found:
./training_checkpoints/ckpt_3.weights.h5
./training_checkpoints/ckpt_4.weights.h5
./training_checkpoints/ckpt_2.weights.h5
./training_checkpoints/ckpt_1.weights.h5
./training_checkpoints/ckpt_5.weights.h5

Manually identified latest checkpoint: ./training_checkpoints/ckpt_5.weights.h5
Loading we

In [28]:
# Install and download required NLTK resources
import nltk
import os

# Clear NLTK's resource cache and make sure /root/nltk_data is on its search path.
# The membership check keeps re-runs of this cell from appending duplicate entries.
nltk.data.clear_cache()
if '/root/nltk_data' not in nltk.data.path:
    nltk.data.path.append('/root/nltk_data')

# Import necessary modules
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# NLP preprocessing function
def nlp_preprocessing(sentence):
    """Tokenize a sentence, drop English stopwords, stem the rest, and print each stage.

    Nothing is returned; the three token lists are written to stdout.
    """
    # Stage 1: split the sentence into word and punctuation tokens.
    tokens = word_tokenize(sentence)

    # Stage 2: keep tokens whose lowercase form is not an English stopword.
    english_stopwords = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)

    # Stage 3: reduce each remaining token to its Porter stem.
    porter = PorterStemmer()
    stems = list(map(porter.stem, kept_tokens))

    # Report all three stages.
    print("Original Tokens:", tokens)
    print("Tokens Without Stopwords:", kept_tokens)
    print("Stemmed Words:", stems)

# Test the function
# Runs the preprocessing on one example sentence; the function prints the
# original tokens, the tokens without stopwords, and the stemmed words.
test_sentence = "NLP techniques are used in virtual assistants like Alexa and Siri."
nlp_preprocessing(test_sentence)

Original Tokens: ['NLP', 'techniques', 'are', 'used', 'in', 'virtual', 'assistants', 'like', 'Alexa', 'and', 'Siri', '.']
Tokens Without Stopwords: ['NLP', 'techniques', 'used', 'virtual', 'assistants', 'like', 'Alexa', 'Siri', '.']
Stemmed Words: ['nlp', 'techniqu', 'use', 'virtual', 'assist', 'like', 'alexa', 'siri', '.']


In [29]:
import spacy

# Sentence to analyse, then the spaCy pipeline that annotates it.
sentence = "Barack Obama served as the 44th President of the United States and won the Nobel Peace Prize in 2009."
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

# One line per named entity: its text, its label, and its start/end
# character offsets.
for ent in doc.ents:
    print(
        f"Text: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}"
    )


Text: Barack Obama, Label: PERSON, Start: 0, End: 12
Text: 44th, Label: ORDINAL, Start: 27, End: 31
Text: the United States, Label: GPE, Start: 45, End: 62
Text: the Nobel Peace Prize, Label: WORK_OF_ART, Start: 71, End: 92
Text: 2009, Label: DATE, Start: 96, End: 100


In [30]:
import numpy as np
import math

def softmax(x):
    """Numerically stable softmax over the last axis.

    Args:
        x: Array-like with at least one dimension.

    Returns:
        An array of the same shape whose entries along the last axis are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift by the maximum of each row, not the global maximum. Normalisation
    # is per row, so a row far below the global maximum would otherwise
    # underflow to all zeros and produce 0/0 = NaN.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=-1, keepdims=True)

def scaled_dot_product_attention(Q, K, V):
    """Compute softmax(Q K^T / sqrt(d)) V.

    Works on plain 2-D matrices and on inputs with leading batch dimensions;
    only the last two axes of each argument take part in the matrix products.

    Args:
        Q: Queries, shape (..., n_queries, d).
        K: Keys, shape (..., n_keys, d).
        V: Values, shape (..., n_keys, d_v).

    Returns:
        A ``(weights, output)`` tuple: the attention weights of shape
        (..., n_queries, n_keys) and the weighted values of shape
        (..., n_queries, d_v).
    """
    Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
    d = Q.shape[-1]
    # Swap only the last two axes of K. `K.T` reverses every axis and
    # `np.dot` does not batch, so that pair is correct for 2-D inputs only.
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / math.sqrt(d)
    weights = softmax(scores)
    output = np.matmul(weights, V)
    return weights, output

# Two queries and two keys of dimension 4, with one value row per key.
Q = np.array([
    [1, 0, 1, 0],
    [0, 1, 0, 1],
])
K = np.array([
    [1, 0, 1, 0],
    [0, 1, 0, 1],
])
V = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

# Run attention on the example and show both results.
weights, output = scaled_dot_product_attention(Q=Q, K=K, V=V)
print("Attention Weights:\n", weights)
print("Output Matrix:\n", output)


Attention Weights:
 [[0.73105858 0.26894142]
 [0.26894142 0.73105858]]
Output Matrix:
 [[2.07576569 3.07576569 4.07576569 5.07576569]
 [3.92423431 4.92423431 5.92423431 6.92423431]]


In [31]:
from transformers import pipeline

# Pin the model and revision explicitly. These are the values the pipeline
# otherwise falls back to by default, so results are unchanged, but the choice
# no longer depends on the library's current default.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
# The pipeline returns one result per input text; take the only one.
result = classifier("Despite the high price, the performance of the new MacBook is outstanding.")[0]

print(f"Sentiment: {result['label']}")
print(f"Confidence Score: {result['score']:.4f}")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Sentiment: POSITIVE
Confidence Score: 0.9998


In [32]:
# Manually download stopwords file
import os
import requests

# NOTE: on a fresh kernel this cell has to run before any cell that calls
# stopwords.words('english'), since it is what puts the corpus on disk.
stopwords_url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip'
stopwords_dir = '/root/nltk_data/corpora/stopwords/'
stopwords_file = os.path.join(stopwords_dir, 'english')

# Create directories if they don't exist
os.makedirs(stopwords_dir, exist_ok=True)

# Download the zip file in chunks. The context manager releases the streamed
# connection, the timeout stops a stalled request from hanging the cell, and
# raise_for_status() prevents an HTTP error page from being saved as
# stopwords.zip and failing later inside zipfile.
zip_path = os.path.join('/tmp', 'stopwords.zip')
with requests.get(stopwords_url, stream=True, timeout=30) as r:
    r.raise_for_status()
    with open(zip_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

# Extract the archive into the corpora directory.
import zipfile
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/root/nltk_data/corpora/')

# Confirm the English stopword list is where NLTK will look for it.
print(f"Stopwords file should be at: {stopwords_file}")
if os.path.exists(stopwords_file):
    print("Stopwords file found after manual download and extraction.")
else:
    print("Stopwords file NOT found after manual download and extraction.")

Stopwords file should be at: /root/nltk_data/corpora/stopwords/english
Stopwords file found after manual download and extraction.
