<a href="https://colab.research.google.com/github/osjayaprakash/deeplearning/blob/main/cnn_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


```
## Local setup (macOS)
python3 -m venv ~/cs230
source ~/cs230/bin/activate
pip3 install kagglehub kaggle tensorflow tensorflow-macos tensorflow-metal
brew install hdf5
pip install line_profiler

## AMAZON AWS
sudo apt update
sudo apt install nvidia-driver-535
reboot  # Restart the system after installation

nvidia-smi

## Verify that the GPUs are available
python3
import tensorflow as tf
tf.sysconfig.get_build_info()
python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices());"
```

In [None]:
!pip install line_profiler
!pip install memory_profiler
!pip install kagglehub
!pip install matplotlib pandas scikit-learn scipy tensorflow
!pip install tensorflow[and-cuda]
%load_ext line_profiler
%load_ext memory_profiler

In [None]:
import kagglehub

# Download the im2latex-100k dataset through kagglehub; the value it returns is used
# as the root directory for every dataset file read later in the notebook.
# Alternative dataset handle: "gregoryeritsyan/im2latex-230k"
DATASET_HANDLE = "shahrukhkhan/im2latex100k"
root_dir = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", root_dir)

In [None]:
import os
import platform
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Flatten,
                                     Dense, GRU, Embedding, Bidirectional,
                                     TimeDistributed, Concatenate, RepeatVector, LSTM)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Print the runtime environment (platform, interpreter and library versions, and the
# devices TensorFlow can see) so a run can be tied to a specific setup.
# A leading `tf.config.experimental.list_physical_devices('GPU')` call was removed:
# its return value was discarded, and the complete device list is printed below.
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")
print(tf.config.list_physical_devices())

# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
vocab_size = None  # Passed to Tokenizer(num_words=...); replaced by the value fit_tokenizer() returns
max_seq_length = 250  # Max length of output sequence
RESNET_MODEL = True  # True: ResNet50 encoder; False: small Conv2D encoder
IMG_SIZE = [50,224,1] # height, width, channels
# Directory that receives the saved model and its diagram. It is derived from the
# current user's home directory instead of being hard-coded to one machine, so the
# same notebook runs unchanged on macOS ("/Users/<name>/") and Linux ("/home/<name>/").
# The trailing "/" keeps the resulting paths identical to the previous literal form.
BASE_DIR = os.path.expanduser("~") + "/"

OUTPUT_MODEL_NAME = "latex_model_cnn_lstm"
EMBEDDING_DIM = 256
# NOTE(review): 265 may be a typo for 256 -- confirm before changing, since models
# already saved with this width would no longer match a rebuilt architecture.
lstm_units = 265
if RESNET_MODEL:
    # The ResNet50 variant uses a 224x224, 3-channel input and its own output name.
    IMG_SIZE = [224,224,3]
    OUTPUT_MODEL_NAME = "latex_model_resnet_lstm"

In [None]:
# Module-level tokenizer shared by fit_tokenizer(), text_to_sequence() and
# sequence_to_text() below.
# filters='' and lower=False are passed so that, per the Keras Tokenizer API, no
# characters are stripped and case is preserved.
# num_words receives the configured vocab_size (None in the configuration cell).
tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
def fit_tokenizer(texts):
    """Fit the shared tokenizer on the LaTeX text corpus and validate sequence lengths.

    Parameters:
    - texts: Collection of LaTeX strings. It is traversed twice (once to fit, once to
      measure token counts), so pass a list rather than a one-shot iterator.

    Returns:
    - (vocab_size, max_seq_length): ``vocab_size`` is ``len(tokenizer.word_index) + 1``;
      ``max_seq_length`` is the module-level setting, returned unchanged (it is NOT
      recomputed from the data, only checked against it).

    Raises:
    - ValueError: if ``texts`` yields no sequences (empty corpus).
    - AssertionError: if the longest tokenized text exceeds ``max_seq_length``.
    """
    tokenizer.fit_on_texts(texts)
    # Local name only: the module-level vocab_size is updated by the caller from the
    # returned tuple. The +1 leaves room for index 0, which pad_sequences pads with.
    vocab_size = len(tokenizer.word_index) + 1

    token_counts = [len(seq) for seq in tokenizer.texts_to_sequences(texts)]
    if not token_counts:
        # Previously surfaced as an opaque "max() arg is an empty sequence" ValueError.
        raise ValueError("fit_tokenizer() needs at least one text to fit on")
    actual_max_seq_length = max(token_counts)

    assert max_seq_length >= actual_max_seq_length,  f"max_seq_length ({max_seq_length}) should be greater than or equal to actual_max_seq_length ({actual_max_seq_length})"
    print(f"Vocabulary size: {vocab_size}, Max sequence length: {max_seq_length}")
    return vocab_size, max_seq_length

def text_to_sequence(text):
    """Tokenize a single LaTeX string with the shared tokenizer and return its token ids."""
    encoded_batch = tokenizer.texts_to_sequences([text])
    token_ids = encoded_batch[0]
    return token_ids

def sequence_to_text(sequence):
    """Decode one sequence of token ids back into LaTeX text with the shared tokenizer."""
    decoded_batch = tokenizer.sequences_to_texts([sequence])
    latex_text = decoded_batch[0]
    return latex_text

# Preprocess

In [None]:
def preprocess_image(image):
    """Resize an image to the configured height/width and divide its values by 255."""
    target_height, target_width = IMG_SIZE[0], IMG_SIZE[1]
    resized = tf.image.resize(image, (target_height, target_width))
    # Scale down by 255 (maps 8-bit pixel values into [0, 1]).
    return resized / 255.0

def load_and_preprocess_images(image_paths):
    """Load image files from disk and return them as one preprocessed, stacked batch.

    Parameters:
    - image_paths: Iterable of file paths readable by ``tf.io.read_file``.

    Returns:
    - A tensor stacking every preprocessed image along a new leading axis. Each image
      is decoded with ``IMG_SIZE[2]`` channels (1 = grayscale, 3 = RGB) and then
      resized/normalized by ``preprocess_image``.
    """
    batch = []
    for path in image_paths:
        raw_bytes = tf.io.read_file(path)
        # expand_animations=False guarantees a single 3-D (height, width, channels)
        # frame even for animated GIFs, which would otherwise decode to a 4-D tensor
        # and break the resize/stack steps that follow.
        decoded = tf.io.decode_image(
            raw_bytes, channels=IMG_SIZE[2], expand_animations=False)
        batch.append(preprocess_image(decoded))
    return tf.stack(batch)

def prepare_sequences(latex_texts, max_seq_length):
    """Tokenize every LaTeX string and post-pad the results to ``max_seq_length``."""
    token_lists = list(map(text_to_sequence, latex_texts))
    padded = pad_sequences(token_lists, maxlen=max_seq_length, padding='post')
    return padded


In [None]:
%%time
%%prun

import pandas as pd

# Read the first 1000 rows of the training index shipped with the dataset.
df = pd.read_csv(f"{root_dir}/im2latex_train.csv", nrows=1000)

# One image path and one <START>/<END>-wrapped formula per row of the index.
train_image_paths = [
    f"{root_dir}//formula_images_processed/formula_images_processed/{image_name}"
    for image_name in df["image"]
]
train_latex_texts = ["<START> " + formula + " <END>" for formula in df["formula"]]

# Enable Numpy behaviour of TF
# (presumably so TF tensors can be passed to NumPy-style calls later -- TODO confirm).
tf.experimental.numpy.experimental_enable_numpy_behavior()

# Fit the tokenizer first: it yields the vocabulary size and validates max_seq_length.
vocab_size, max_seq_length = fit_tokenizer(train_latex_texts)

train_images = load_and_preprocess_images(train_image_paths)
# Padded token ids with a trailing axis of size 1 appended.
train_sequences = np.expand_dims(
    prepare_sequences(train_latex_texts, max_seq_length), -1)
print("train_images:", train_images.shape)
print("train_sequences:", train_sequences.shape)

In [None]:
# Sanity check: show the array shapes/types, then round-trip the first sample's
# token ids back through the tokenizer.
print(train_sequences.shape, train_images.shape)
print(type(train_sequences))
first_token_ids = train_sequences[0].reshape(1,-1).tolist()[0]
print(first_token_ids)
print(sequence_to_text(first_token_ids))

In [None]:
# Spot-check the first training sample: print its image path, list that file on
# disk, and print its <START>/<END>-wrapped LaTeX label.
print(train_image_paths[0])
# IPython substitutes the {...} expression before the line is handed to the shell;
# the surrounding quotes keep a path containing spaces as a single argument.
!ls -lart "{train_image_paths[0]}"
print(train_latex_texts[0])

## Model

In [None]:
# Decoder length: training feeds tokens[:-1] as input and tokens[1:] as target, so
# both are one step shorter than the padded sequences.
max_seq_len_1 = max_seq_length-1

from tensorflow import keras
from tensorflow.keras.layers import Layer
from tensorflow.keras.applications import ResNet50

# --- Image encoder -----------------------------------------------------------
image_input = Input(shape=(IMG_SIZE[0], IMG_SIZE[1], IMG_SIZE[2]), name="image_input")
if not RESNET_MODEL:
    # Small CNN: three conv + max-pool stages (32, 64, 128 filters), then flatten.
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(image_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
else:
    # ImageNet-pretrained ResNet50 used as a frozen feature extractor.
    # NOTE(review): preprocess_image() scales pixels to [0, 1]; confirm whether the
    # pretrained weights expect tf.keras.applications.resnet50.preprocess_input instead.
    resnet = ResNet50(include_top=False, weights="imagenet", input_tensor=image_input)
    # Freeze ResNet layers
    for layer in resnet.layers:
        layer.trainable = False
    # Pooling to reduce dimensions
    x = tf.keras.layers.GlobalAveragePooling2D()(resnet.output)
# Project the image features to the embedding width.
x = Dense(EMBEDDING_DIM, activation='relu')(x)
# Repeat encoder output for each time step
encoder_output = RepeatVector(max_seq_len_1)(x)

# --- LSTM decoder (teacher forcing; this model has no attention layer) --------
decoder_input = Input(shape=(max_seq_len_1,), name="decoder_input")  # Sequence input for teacher forcing
# `input_length` is deprecated in current Keras; the sequence length is already
# fixed by decoder_input's shape, so it is not passed here.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)
embedded_seq = embedding_layer(decoder_input)

# At every time step the LSTM sees the image features next to the token embedding.
decoder_lstm_input = Concatenate(axis=-1)([encoder_output, embedded_seq])
decoder_lstm = LSTM(lstm_units, return_sequences=True)(decoder_lstm_input)
# Per-step probability distribution over the vocabulary.
output_layer = TimeDistributed(Dense(vocab_size, activation="softmax"))(decoder_lstm)

# Build Model
model = Model(inputs=[image_input, decoder_input], outputs=output_layer)
# Targets are integer token ids, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
#dot_img_file =
import keras
# Draw the layer graph with shapes, dtypes, layer names, nested models and
# activations; the returned object is the cell's last expression, so it is
# displayed inline.
plot_options = dict(
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    expand_nested=True,
    show_layer_activations=True,
)
keras.utils.plot_model(model, **plot_options)

In [None]:
# Teacher forcing: the decoder is fed tokens[:-1] and trained to predict tokens[1:].
# train_sequences carries a trailing axis of size 1 (added by np.expand_dims when the
# data was prepared). The decoder's Input layer is declared 2-D, so that axis is
# indexed away explicitly instead of relying on Keras to adjust the rank implicitly.
decoder_inputs = train_sequences[:, :-1, 0]
# The targets keep the trailing axis, exactly as before.
decoder_targets = train_sequences[:, 1:]

model.fit([train_images, decoder_inputs],
          decoder_targets,
          epochs=20,
          batch_size=128,
          validation_split=0.2)

from tensorflow.keras.models import load_model
# Persist the trained model in the native .keras format under BASE_DIR.
model.save(f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.keras')

In [None]:
from tensorflow.keras.models import load_model
# Reload the model from the same path the training cell saves to (this rebinds the
# notebook-level `model`), then print its layer summary.
model = load_model(f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.keras')
model.summary()

In [None]:
import keras
# Same diagram settings as the inline plot, but written to a PNG named after the
# model under BASE_DIR.
diagram_path = f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.png'
diagram_options = dict(
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    expand_nested=True,
    show_layer_activations=True,
    to_file=diagram_path,
)
keras.utils.plot_model(model, **diagram_options)

# Predict

In [None]:
import numpy as np

def predict_latex_sequence(model, image, tokenizer, max_seq_len=None):
    """
    Greedily decode a LaTeX sequence from a single image.

    Starting from the "<START>" token, the model is queried once per generated token;
    the most probable next token is appended until "<END>" is predicted or the
    length limit is reached.

    Parameters:
    - model: Trained Keras model for predicting LaTeX sequence. It is called as
      ``model.predict([image_batch, token_batch])`` and must return an array indexed
      as ``[batch, position, token_id]``.
    - image: Input image (preprocessed to match training dimensions), without a
      batch dimension.
    - tokenizer: Tokenizer fitted on LaTeX sequences for decoding predictions. Its
      ``word_index`` must contain "<START>" and "<END>".
    - max_seq_len: Length of the token input the model expects, which is also the
      maximum number of decoding steps. Defaults to the notebook-level
      ``max_seq_len_1`` when omitted.

    Returns:
    - latex_sequence: Predicted LaTeX sequence as a string (start/end tokens excluded).

    Raises:
    - KeyError: if the tokenizer has no "<START>" or "<END>" entry.
    """
    if max_seq_len is None:
        # Fall back to the decoder length the notebook's model was built with.
        max_seq_len = max_seq_len_1

    # Prepare input image and initialize the sequence
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    start_token = tokenizer.word_index["<START>"]
    end_token = tokenizer.word_index["<END>"]

    # Initial sequence with the start token
    sequence = [start_token]

    for _ in range(max_seq_len):
        # Right-pad with zeros so the input always has the fixed length the model expects
        padded_sequence = np.pad(sequence, (0, max_seq_len - len(sequence)), mode='constant')
        padded_sequence = np.expand_dims(padded_sequence, axis=0)  # Add batch dimension

        # verbose=0: this runs once per generated token, so the default per-call
        # progress bar would flood the cell output.
        preds = model.predict([image, padded_sequence], verbose=0)
        # The prediction at the last filled position is the next token; cast to a
        # plain int so the sequence holds ordinary Python integers.
        next_token = int(np.argmax(preds[0, len(sequence) - 1, :]))

        # Break if end token is reached
        if next_token == end_token:
            break

        # Add the predicted token to the sequence
        sequence.append(next_token)

    # Decode the token sequence to a string
    latex_sequence = tokenizer.sequences_to_texts([sequence[1:]])[0]  # Skip the start token
    return latex_sequence

# Decode one training image and print the prediction next to its ground-truth formula.
sample_idx = 12
predicted_latex = predict_latex_sequence(model, train_images[sample_idx], tokenizer)
print("Predicted LaTeX:", predicted_latex)
print("Original Seq:", train_latex_texts[sample_idx])