<a href="https://colab.research.google.com/github/osjayaprakash/deeplearning/blob/main/cnn_lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


```
## This is formatted as code
python3 -m venv ~/cs230
source ~/cs230/bin/activate
pip3 install kagglehub kaggle tensorflow tensorflow-macos tensorflow-metal
brew install hdf5
pip install line_profiler

## AMAZON AWS
sudo apt update
sudo apt install nvidia-driver-535
reboot  # Restart the system after installation

nvidia-smi

## Verify that the GPUs are available
python3
import tensorflow as tf
tf.sysconfig.get_build_info()
python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices());"
```

In [None]:
!pip install line_profiler
!pip install memory_profiler
!pip install kagglehub
!pip install matplotlib pandas scikit-learn scipy tensorflow
!pip install tensorflow[and-cuda]
%load_ext line_profiler
%load_ext memory_profiler

In [35]:
import kagglehub

# Download latest version
root_dir = kagglehub.dataset_download("shahrukhkhan/im2latex100k")
# path = kagglehub.dataset_download("gregoryeritsyan/im2latex-230k")

print("Path to dataset files:", root_dir)

Path to dataset files: /home/ubuntu/.cache/kagglehub/datasets/shahrukhkhan/im2latex100k/versions/7


In [49]:
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Flatten,
                                     Dense, GRU, Embedding, Bidirectional,
                                     TimeDistributed, Concatenate, RepeatVector, LSTM)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import matplotlib.pyplot as plt
import platform
import sys
import pandas as pd
import sklearn as sk
import scipy as sp

tf.config.experimental.list_physical_devices('GPU')
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
#print(f"Keras Version: {tf.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print(f"SciPy {sp.__version__}")
print(tf.config.list_physical_devices())

# Initialize Tokenizer (Configure it with LaTeX vocabulary)
vocab_size = None  # Adjust based on your dataset
max_seq_length = 149  # Max length of output sequence
RESNET_MODEL = True
IMG_SIZE = [50,224,1] # height, width, channels
BASE_DIR = "/Users/jayaprakash/"
#BASE_DIR = "/home/ubuntu/"

OUTPUT_MODEL_NAME = "latex_model_cnn_lstm"
EMBEDDING_DIM = 256
lstm_units = 265
if RESNET_MODEL:
    IMG_SIZE = [224,224,3]
    OUTPUT_MODEL_NAME = "latex_model_resnet_lstm"

Python Platform: Linux-5.15.0-1044-aws-x86_64-with-glibc2.29
Tensor Flow Version: 2.13.1

Python 3.8.10 (default, Nov  7 2024, 13:10:47) 
[GCC 9.4.0]
Pandas 2.0.3
Scikit-Learn 1.3.2
SciPy 1.10.1
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [50]:
tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
def fit_tokenizer(texts):
    """Fit the tokenizer on the LaTeX text corpus."""
    tokenizer.fit_on_texts(texts)
    vocab_size = len(tokenizer.word_index) + 1
    # max_seq_length = max(len(seq) for seq in tokenizer.texts_to_sequences(texts))
    acutal_max_seq_length = max(len(seq) for seq in tokenizer.texts_to_sequences(texts))
    assert max_seq_length >= acutal_max_seq_length,  f"max_seq_length ({max_seq_length}) should be greater than or equal to acutal_max_seq_length ({acutal_max_seq_length})"
    print(f"Vocabulary size: {vocab_size}, Max sequence length: {max_seq_length}")
    return vocab_size, max_seq_length

def text_to_sequence(text):
    """Convert LaTeX text to a sequence of tokens."""
    return tokenizer.texts_to_sequences([text])[0]

def sequence_to_text(sequence):
    """Convert token sequence back to LaTeX text."""
    return tokenizer.sequences_to_texts([sequence])[0]

# Preprocess

In [51]:
def preprocess_image(image):
    """Preprocess the input image: Resize and normalize."""
    image = tf.image.resize(image, (IMG_SIZE[0], IMG_SIZE[1]))
    image = image / 255.0  # Normalize to [0, 1]
    return image

def load_and_preprocess_images(image_paths):
    """Load and preprocess a batch of images."""
    # Use Gray scale
    images = [
        preprocess_image(
            tf.io.decode_image(
                tf.io.read_file(path), channels=IMG_SIZE[2]))
              for path in image_paths]
    return tf.stack(images)

def prepare_sequences(latex_texts, max_seq_length):
    """Convert LaTeX texts to padded sequences of tokens."""
    sequences = [text_to_sequence(text) for text in latex_texts]
    return pad_sequences(sequences, maxlen=max_seq_length, padding='post')


In [52]:
%%time
%%prun

import pandas as pd

df = pd.read_csv(f"{root_dir}/im2latex_train.csv", nrows=1000)

train_image_paths = []
train_latex_texts = []

for index, row in df.iterrows():
    train_image_paths += [f"{root_dir}//formula_images_processed/formula_images_processed/{row.image}"]
    train_latex_texts += ["<START> " + row.formula + " <END>"]

# Enable Numpy behaviour of TF
tf.experimental.numpy.experimental_enable_numpy_behavior()

# vocab_size, max_seq_length = fit_tokenizer(train_latex_texts)

tokenizer = tf.keras.layers.TextVectorization(max_tokens=max_seq_length, standardize = None)
train_images = load_and_preprocess_images(train_image_paths)
# train_sequences = prepare_sequences(train_latex_texts, max_seq_length)
# train_sequences = np.expand_dims(train_sequences, -1)
tokenizer.adapt(train_latex_texts)
latex_labels = tokenizer(train_latex_texts)
train_sequences = np.asarray(latex_labels)
print("train_images:", train_images.shape)
print("train_sequences:", train_sequences.shape)

train_images: (1000, 224, 224, 3)
train_sequences: (1000, 149)
 CPU times: user 1.61 s, sys: 82.3 ms, total: 1.69 s
Wall time: 1.65 s


         960541 function calls (945895 primitive calls) in 1.621 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     6133    0.927    0.000    0.927    0.000 {built-in method tensorflow.python._pywrap_tfe.TFE_Py_FastPathExecute}
     3039    0.079    0.000    0.084    0.000 constant_op.py:65(convert_to_eager_tensor)
     1035    0.053    0.000    0.053    0.000 {built-in method tensorflow.python._pywrap_tfe.TFE_Py_Execute}
197310/197308    0.021    0.000    0.033    0.000 {built-in method builtins.isinstance}
     1000    0.020    0.000    0.923    0.001 image_ops_impl.py:1446(_resize_images_common)
7131/4065    0.019    0.000    1.226    0.000 dispatch.py:1162(op_dispatch_handler)
    48382    0.017    0.000    0.035    0.000 tensor_shape.py:725(as_dimension)
     6210    0.015    0.000    0.161    0.000 tensor_conversion_registry.py:164(convert)
     7176    0.013    0.000    0.044    0.000 tensor_shape.py:817(__init__)


In [53]:
print(train_sequences.shape, train_images.shape)
print(type(train_sequences))
print(train_sequences[0].reshape(1,-1).tolist()[0])
print(sequence_to_text(train_sequences[0].reshape(1,-1).tolist()[0]))

(1000, 149) (1000, 224, 224, 3)
<class 'numpy.ndarray'>
[13, 1, 65, 4, 3, 51, 3, 81, 97, 29, 74, 2, 2, 1, 88, 4, 3, 21, 1, 18, 2, 1, 3, 99, 2, 4, 3, 21, 2, 3, 16, 3, 7, 11, 20, 8, 5, 3, 21, 2, 2, 3, 6, 5, 3, 6, 21, 11, 10, 2, 2, 2, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


AttributeError: 'TextVectorization' object has no attribute 'sequences_to_texts'

In [None]:
print(train_image_paths[0])
!ls -lart "{train_image_paths[0]}"
print(train_latex_texts[0])

## Model

In [54]:
max_seq_len_1 = max_seq_length-1
from tensorflow import keras
from tensorflow.keras.layers import Layer
from tensorflow.keras.applications import ResNet50

vocab_size = tokenizer.vocabulary_size()

# CNN Encoder
image_input = Input(shape=(IMG_SIZE[0], IMG_SIZE[1], IMG_SIZE[2]), name="image_input")
if RESNET_MODEL is False:
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(image_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
else:
    #rgb_image = tf.keras.layers.Lambda(lambda x: tf.image.grayscale_to_rgb(x), output_shape=(None, ))(image_input)
    resnet = ResNet50(include_top=False, weights="imagenet", input_tensor=image_input)
    # Freeze ResNet layers
    for layer in resnet.layers:
        layer.trainable = False
    # Pooling to reduce dimensions
    x = tf.keras.layers.GlobalAveragePooling2D()(resnet.output)
x = Dense(EMBEDDING_DIM, activation='relu')(x)
# Repeat encoder output for each time step
encoder_output = RepeatVector(max_seq_len_1)(x)

# LSTM Decoder with Attention
decoder_input = Input(shape=(max_seq_len_1,), name="decoder_input")  # Sequence input for teacher forcing
embedding_layer = Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM, input_length=max_seq_len_1)
embedded_seq = embedding_layer(decoder_input)

decoder_lstm_input = tf.keras.layers.Concatenate(axis=-1)([encoder_output, embedded_seq])
decoder_lstm = LSTM(lstm_units, return_sequences=True)(decoder_lstm_input)
output_layer = TimeDistributed(Dense(vocab_size, activation="softmax"))(decoder_lstm)

# Build Model
model = Model(inputs=[image_input, decoder_input], outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 image_input (InputLayer)    [(None, 224, 224, 3)]        0         []                            
                                                                                                  
 conv1_pad (ZeroPadding2D)   (None, 230, 230, 3)          0         ['image_input[0][0]']         
                                                                                                  
 conv1_conv (Conv2D)         (None, 112, 112, 64)         9472      ['conv1_pad[0][0]']           
                                                                                                  
 conv1_bn (BatchNormalizati  (None, 112, 112, 64)         256       ['conv1_conv[0][0]']          
 on)                                                                                        

In [55]:
# #dot_img_file =
# import keras
# keras.utils.plot_model(model,
#                        show_shapes=True,
#                        show_dtype=True,
#                        show_layer_names=True,
#                        expand_nested=True,
#                        show_layer_activations=True,
#                        )

In [56]:
model.fit([train_images, train_sequences[:, :-1]],
          train_sequences[:, 1:],
          epochs=20,
          batch_size=128,
          validation_split=0.2)

from tensorflow.keras.models import load_model
model.save(f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.keras')

Epoch 1/20


2024-12-04 07:46:33.832222: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8700
2024-12-04 07:46:34.235469: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-12-04 07:46:35.752281: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f674657d120 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-04 07:46:35.752305: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA L4, Compute Capability 8.9
2024-12-04 07:46:35.756183: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-04 07:46:35.853345: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


FileNotFoundError: [Errno 2] No such file or directory: '/Users/jayaprakash//latex_model_resnet_lstm.keras'

In [28]:
from tensorflow.keras.models import load_model
model = load_model(f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.keras')
model.summary()

OSError: No file or directory found at /Users/jayaprakash//latex_model_resnet_lstm.keras

In [None]:
import keras
keras.utils.plot_model(model,
                       show_shapes=True,
                       show_dtype=True,
                       show_layer_names=True,
                       expand_nested=True,
                       show_layer_activations=True,
                       to_file=f'{BASE_DIR}/{OUTPUT_MODEL_NAME}.png'
                       )

# Metrics for Accuracy

In [None]:
import collections
import math

def lev_distance(sequence_one, sequence_two):
    rows = len(sequence_one)
    cols = len(sequence_two)
    dist_tab = np.zeros((rows + 1, cols + 1), dtype=int)
    for i in range(1, rows + 1):
      dist_tab[i][0] = i
    for i in range(1, cols + 1):
      dist_tab[0][i] = i
    for r in range(1, rows + 1):
      for c in range(1, cols + 1):

        #if tokens match
        if sequence_one[r - 1] == sequence_two[c - 1]:

          #same cost as min cost from prev tokens
          dist_tab[r][c] = dist_tab[r - 1][c - 1]
        else:

          #min of deletion, insertion, or substitution respectively
          dist_tab[r][c] = 1 + min(dist_tab[r - 1][c], dist_tab[r][c - 1], dist_tab[r - 1][c - 1])
    return dist_tab[rows][cols] #return top right corner of table: min edit distance

def bleu_n_score(generated_sequence, true_sequence, n):
    gen_len = len(generated_sequence)
    true_len = len(true_sequence)
    scores = []

    #calculate and store precision for 1-grams to n-grams
    for gram_size in range(1,n+1):

      #calculate grams
      gen_ngrams = [tuple(generated_sequence[i:i+gram_size]) for i in range(gen_len - gram_size + 1)]
      true_ngrams = [tuple(true_sequence[i:i+gram_size]) for i in range(true_len - gram_size + 1)]

      gen_grams_count = collections.Counter(gen_ngrams) #freq dicts of grams
      true_grams_count = collections.Counter(true_ngrams)

      #sum of how many grams appear in both the gen sequence and the true
      matching_grams_sum = sum(min(gen_grams_count[gram], true_grams_count[gram]) for gram in gen_grams_count)

      #divide sum of matching grams by total number of grams in the gen sequence (precision)
      gram_score = 0
      if len(gen_grams_count) > 0:
        gram_score = matching_grams_sum / len(gen_grams_count)
      scores.append(gram_score)

    #calculate geometric mean of scores for each 1-ngram
    geo_mean = 0.0
    for gram_score in scores:
      if gram_score == 0.0:
        #return 0 early: a score of 0 zeroes out mean and thus bleu score
        return 0.0
      geo_mean += math.log(gram_score)
    geo_mean = math.exp(geo_mean/n)

    #include brevity penalty in cases where gen sequence is longer than true sequence
    if gen_len < true_len:
      return math.exp(1 - true_len / gen_len) * geo_mean
    return geo_mean #no penalty otherwise

# Predict and Evaluate

In [None]:
import collections
import math

def lev_distance(sequence_one, sequence_two):
    rows = len(sequence_one)
    cols = len(sequence_two)
    dist_tab = np.zeros((rows + 1, cols + 1), dtype=int)
    for i in range(1, rows + 1):
      dist_tab[i][0] = i
    for i in range(1, cols + 1):
      dist_tab[0][i] = i
    for r in range(1, rows + 1):
      for c in range(1, cols + 1):

        #if tokens match
        if sequence_one[r - 1] == sequence_two[c - 1]:

          #same cost as min cost from prev tokens
          dist_tab[r][c] = dist_tab[r - 1][c - 1]
        else:

          #min of deletion, insertion, or substitution respectively
          dist_tab[r][c] = 1 + min(dist_tab[r - 1][c], dist_tab[r][c - 1], dist_tab[r - 1][c - 1])
    return 1 - (dist_tab[rows][cols] / max(rows, cols)) #return top right corner of table: min edit distance, normalized by length of max sequence

def bleu_n_score(generated_sequence, true_sequence, n):
    gen_len = len(generated_sequence)
    true_len = len(true_sequence)
    scores = []

    #calculate overlap for 1-grams to n-grams
    for gram_size in range(1,n+1):

      #generate grams
      gen_ngrams = [tuple(generated_sequence[i:i+gram_size]) for i in range(gen_len - gram_size + 1)]
      true_ngrams = [tuple(true_sequence[i:i+gram_size]) for i in range(true_len - gram_size + 1)]

      gen_grams_count = collections.Counter(gen_ngrams) #freq dictionaries of grams
      true_grams_count = collections.Counter(true_ngrams)

      #sum of how many grams appear in both the gen sequence and the true
      matching_grams_sum = sum(min(gen_grams_count[gram], true_grams_count[gram]) for gram in gen_grams_count)

      #divide sum of matching grams by total number of grams in the gen sequence (precision)
      gram_score = 0
      if len(gen_grams_count) > 0:
        gram_score = matching_grams_sum / len(gen_grams_count)
      scores.append(gram_score)

    #calculate geometric mean of scores for each gram 1-n
    geo_mean = 0.0
    for gram_score in scores:
      if gram_score == 0.0:
        #return 0 early: a score of 0 zeroes out mean and thus bleu score
        return 0.0
      geo_mean += math.log(gram_score)
    geo_mean = math.exp(geo_mean/n)

    #include brevity penalty in cases where gen sequence is longer than true sequence
    if gen_len < true_len:
      return math.exp(1 - true_len / gen_len) * geo_mean
    return geo_mean #no penalty otherwise

def decode_text(encoded_text, tokenizer):
    vocabulary = tokenizer.get_vocabulary()
    decoded_text = [vocabulary[index] if index < len(vocabulary) else "[UNK]" for index in encoded_text]
    return " ".join(decoded_text)

def predict_latex_sequence_cnn(model, image, tokenizer):
    """
    Predict LaTeX sequence from a single image.

    Parameters:
    - model: Trained Keras model for predicting LaTeX sequence.
    - image: Input image (preprocessed to match training dimensions).
    - tokenizer: Tokenizer fitted on LaTeX sequences for decoding predictions.
    - max_seq_len: Maximum sequence length for the predicted sequence.

    Returns:
    - latex_sequence: Predicted LaTeX sequence as a string.
    """
    # Prepare input image and initialize the sequence
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    vocab_dict = {name: id for id, name in enumerate(tokenizer.get_vocabulary())}
    start_token = vocab_dict["<START>"]#word_index["[START]"]
    end_token = vocab_dict["<END>"]

    # Initial sequence with the start token
    sequence = [start_token]
    # print(max_seq_len_1)
    for _ in range(max_seq_len_1):
        # Pad the current sequence to match input length
        padded_sequence = np.pad(sequence, (0, max_seq_len_1 - len(sequence)), mode='constant')
        padded_sequence = np.expand_dims(padded_sequence, axis=0)  # Add batch dimension

        # Predict next token
        preds = model.predict([image, padded_sequence], verbose = 0)
        next_token = np.argmax(preds[0, len(sequence) - 1, :])

        # Break if end token is reached
        if next_token == end_token:
            break

        # Add the predicted token to the sequence
        sequence.append(next_token)

    # print(sequence)
    # Decode the token sequence to a string
    latex_sequence = decode_text(sequence[1:], tokenizer) #tokenizer.sequences_to_texts([sequence[1:]])[0]  # Skip the start token
    return latex_sequence

def compute_mean_metrics(predicted, truth):
   num_sequences = len(predicted)
   #uses BLEU-4 score
   mean_bleu = sum(bleu_n_score(predicted[i], truth[i], 4) for i in range(num_sequences)) / num_sequences
   mean_lev = sum(lev_distance(predicted[i], truth[i]) for i in range(num_sequences)) / num_sequences
   print("Mean BLEU-4 score: ", mean_bleu)
   print("Mean Levenshtein similarity: ", mean_lev)

def predict_latex_sequences_cnn(model, images, tokenizer):
    result = []
    i = 0
    for image in images:
        print(i)
        i+=1
        sequence_str = predict_latex_sequence_cnn(model, image, tokenizer)
        #print(sequence_str)
        result.append(sequence_str.split())
    return result

def predict_latex_sequences_transformer(images):
    result = []
    i = 0
    for image in images:
        print(i)
        i+=1
        sequence_str = transformer_model.simple_gen(image, temperature = 0)
        #print(sequence_str)
        result.append(sequence_str.split())
    return result  

test_images = train_images[:40]
print(len(test_images))
predicted_sequences = predict_latex_sequences_cnn(model, test_images, tokenizer)
#predicted_sequences = predict_latex_sequences_transformer(test_images)
test_sequences = [str.split() for str in train_latex_texts[:40]]
print("Metrics for CNN-RNN: ")
compute_mean_metrics(predicted_sequences, test_sequences)

# #predicted_sequences = predict_latex_sequences_transform(test_images)
# print("Metrics for CNN-RNN: ")
# compute_mean_metrics(predicted_sequences, test_sequences)