In [None]:
import os
import pandas as pd
from PIL import Image
import torch
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torch.utils.data import Dataset, DataLoader

class MathExprDataset(Dataset):
    """Dataset of math-expression images paired with their LaTeX annotation.

    Args:
        images_dir: Directory holding the ``<filename>.bmp`` image files.
        annotations_file: Tab-separated, header-less text file. The first
            column is the image filename without extension, the second column
            is the LaTeX string.
        transform: Optional callable applied to the PIL image before it is
            returned.
    """

    def __init__(self, images_dir, annotations_file, transform=None):
        self.images_dir = images_dir
        # dtype=str keeps both columns as text. Without it pandas infers an
        # integer dtype for an all-numeric column, which strips leading zeros
        # from filenames and makes the `+ '.bmp'` concatenation below fail.
        self.annotations = pd.read_csv(
            annotations_file,
            delimiter='\t',
            header=None,
            names=['filename', 'latex'],
            dtype=str,
        )
        self.transform = transform

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return ``(image, latex)`` for the sample at position ``idx``.

        The image is always converted to three-channel RGB so that a
        three-channel ``Normalize`` transform works regardless of how the
        bitmap was stored on disk.
        """
        filename = self.annotations.iloc[idx, 0]
        latex = self.annotations.iloc[idx, 1]
        img_name = os.path.join(self.images_dir, filename + '.bmp')
        # convert() returns a fully loaded copy, so the file can be closed
        # as soon as the with-block ends.
        with Image.open(img_name) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, latex

# Preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalize each of the three channels.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match what the image encoder used later expects.
transform = Compose(
    [
        Resize((224, 224)),
        ToTensor(),
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Image folder and annotation file both live under the mounted Drive path.
dataset = MathExprDataset(
    images_dir='/content/drive/MyDrive/Major_Project_Final-Yr/off_image_train',
    annotations_file='/content/drive/MyDrive/Major_Project_Final-Yr/trainLatex.txt',
    transform=transform,
)


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under /content/drive/MyDrive.
# NOTE(review): this cell has to run before any cell that reads those paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import ViTFeatureExtractor, GPT2Tokenizer, GPT2LMHeadModel
from transformers import ViTModel, ViTConfig
from transformers import ViTImageProcessor


# Initialize the image processor and tokenizer.
# ViTImageProcessor is the replacement for the deprecated ViTFeatureExtractor
# and is the class the last section of this notebook already uses; the
# variable keeps its old name so later cells continue to work.
feature_extractor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Load the pre-trained encoder (ViT) and language model (GPT-2).
vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
## Alternative code for importing the pretrained GPT-2 and ViT checkpoints.
## Performs the same four loads as the previous cell, but reports a failure
## with a printed message instead of a traceback.

from transformers import ViTModel, GPT2LMHeadModel, GPT2Tokenizer, ViTFeatureExtractor

try:
    feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    vit = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
    gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
except Exception as e:
    # NOTE(review): the error is only printed. Any name whose load did not
    # complete keeps whatever value (if any) an earlier cell gave it, so later
    # cells that use these names can fail or silently run with stale objects.
    print(f"Failed to load models from Hugging Face: {e}")
    # Implement alternative logic here, if any


Failed to load models from Hugging Face: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like google/vit-base-patch16-224-in21k is not the path to a directory containing a file named preprocessor_config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.


In [None]:
def predict_latex(image_path, vit_model, gpt2_model, feature_extractor, tokenizer, max_new_tokens=64):
    """Generate text for one image by feeding ViT features into GPT-2.

    The ViT hidden states are passed to GPT-2 as input embeddings
    (``inputs_embeds``) rather than as token ids: they are floating-point
    vectors, so handing them to ``generate`` as ``input_ids`` cannot work.

    NOTE(review): this only wires the two models together. Without a trained
    alignment between the ViT features and GPT-2's embedding space the decoded
    text is not expected to be meaningful LaTeX.

    Args:
        image_path: Path of the image file to read.
        vit_model: Vision encoder whose output exposes ``last_hidden_state``.
        gpt2_model: Causal language model providing ``generate``.
        feature_extractor: Image processor producing the encoder inputs.
        tokenizer: Tokenizer used to decode the generated ids.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded string for the first (only) item of the batch.

    Raises:
        ValueError: If the encoder feature width differs from the language
            model's embedding width, so the features cannot be used directly.
    """
    # Force three channels so single-channel or palette bitmaps are accepted.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        features = vit_model(**inputs).last_hidden_state

        embed_dim = gpt2_model.get_input_embeddings().embedding_dim
        if features.shape[-1] != embed_dim:
            raise ValueError(
                f"Encoder feature size {features.shape[-1]} does not match "
                f"decoder embedding size {embed_dim}; a projection layer is required."
            )

        # Every encoder position is a real (non-padding) prompt position.
        attention_mask = torch.ones(features.shape[:2], dtype=torch.long, device=features.device)
        outputs = gpt2_model.generate(
            inputs_embeds=features,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            # GPT-2 has no dedicated pad token; reuse EOS for padding.
            pad_token_id=tokenizer.eos_token_id,
        )
        predicted_latex = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_latex

# Example usage.
# NOTE(review): 'path/to/image.bmp' is a placeholder; point it at a real image
# file before running this cell.
image_path = 'path/to/image.bmp'
predicted_latex = predict_latex(image_path, vit, gpt2, feature_extractor, tokenizer)
print(predicted_latex)


In [None]:
#

### Another Approach

Preprocess the Data


In [None]:
import os
import pandas as pd
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torch.utils.data import Dataset

class MathExprDataset(Dataset):
    """Image/LaTeX pairs read from a tab-separated annotation file.

    Args:
        images_dir: Directory that contains the ``<filename>.bmp`` images.
        annotations_file: Header-less TSV with two columns: image filename
            (no extension) and LaTeX string.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, images_dir, annotations_file, transform=None):
        self.images_dir = images_dir
        # Read both columns as text. With dtype inference an all-numeric
        # filename column becomes integers, which loses leading zeros and
        # breaks the string concatenation in __getitem__.
        self.annotations = pd.read_csv(
            annotations_file,
            delimiter='\t',
            names=['filename', 'latex'],
            header=None,
            dtype=str,
        )
        self.transform = transform

    def __len__(self):
        """Return how many annotated samples are available."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return the ``(image, latex)`` pair stored at row ``idx``."""
        row = self.annotations.iloc[idx]
        img_path = os.path.join(self.images_dir, row.iloc[0] + '.bmp')
        # convert() yields a loaded RGB copy, so the underlying file handle
        # can be released immediately by the context manager.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        latex = row.iloc[1]
        if self.transform:
            image = self.transform(image)
        return image, latex

# Resize to 224x224, convert to a tensor, then normalize the three channels.
# NOTE(review): the statistics look like the common ImageNet values — confirm
# they are what the chosen image encoder expects.
transform = Compose(
    [
        Resize((224, 224)),
        ToTensor(),
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


Model Setup

In [None]:
from transformers import ViTModel, GPT2LMHeadModel, ViTFeatureExtractor, GPT2Tokenizer
import torch
import torch.nn as nn

class ViTGPT2Model(nn.Module):
    """ViT encoder with a linear head that scores GPT-2 vocabulary entries.

    The module loads a ViT encoder, a GPT-2 language model and the GPT-2
    tokenizer. ``forward`` currently uses only the encoder and the linear
    head; GPT-2 itself is loaded but not yet involved in the computation.

    Args:
        vit_model_name: Checkpoint name or path for ``ViTModel``.
        gpt2_model_name: Checkpoint name or path for ``GPT2LMHeadModel`` and
            ``GPT2Tokenizer``.
    """

    def __init__(self, vit_model_name, gpt2_model_name):
        super().__init__()
        self.vit = ViTModel.from_pretrained(vit_model_name)
        self.gpt2 = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
        # Linear map from the encoder's hidden size to the GPT-2 vocabulary
        # size, used to turn the CLS representation into token scores.
        self.hidden_size = self.vit.config.hidden_size
        self.latex_embeddings = nn.Linear(self.hidden_size, self.gpt2.config.vocab_size)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, labels=None):
        """Score vocabulary entries from the image's CLS representation.

        Args:
            input_ids: Batch of pixel values for the ViT encoder. The name is
                kept for compatibility with existing callers that pass the
                image tensor as the first positional argument.
            attention_mask: Accepted for interface compatibility; unused,
                because the ViT encoder takes no attention mask.
            decoder_input_ids: Accepted for interface compatibility; unused.
            labels: Accepted for interface compatibility; unused.

        Returns:
            Tensor with one row of vocabulary logits per image.
        """
        # ViTModel.forward expects `pixel_values`; it has no `input_ids` or
        # `attention_mask` keyword, so passing those raised a TypeError.
        outputs = self.vit(pixel_values=input_ids)
        # Position 0 of the sequence is taken as the CLS token's output.
        cls_outputs = outputs.last_hidden_state[:, 0, :]
        logits = self.latex_embeddings(cls_outputs)

        # Sequence decoding with GPT-2 is not implemented yet; only the
        # single-step logits are returned.
        return logits


Data Loading

In [None]:
from torch.utils.data import DataLoader

# Example locations on the mounted Drive; replace with your actual paths.
images_dir = "/content/drive/MyDrive/Major_Project_Final-Yr/off_image_train"
annotations_file = "/content/drive/MyDrive/Major_Project_Final-Yr/trainLatex.txt"

dataset = MathExprDataset(
    images_dir=images_dir,
    annotations_file=annotations_file,
    transform=transform,
)
# Shuffled batches of two samples each.
data_loader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)


Training Loop

In [None]:
# Build the combined model from the pretrained ViT and GPT-2 checkpoints and
# hand all of its parameters to an AdamW optimizer.
model = ViTGPT2Model('google/vit-base-patch16-224-in21k', 'gpt2')
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# NOTE(review): the loop body is still a placeholder. It iterates over every
# batch for five epochs but performs no forward pass, loss computation or
# optimizer step, so no training happens yet.
for epoch in range(5):
    for images, latex_codes in data_loader:
        # Process images through ViT
        # Convert LaTeX codes to token IDs for GPT-2
        # Calculate loss (requires custom implementation)
        # Backpropagation and optimizer step
        pass  # Placeholder for actual implementation


Prediction Result

In [None]:
from PIL import Image
from transformers import ViTFeatureExtractor, GPT2Tokenizer

def predict_latex_code(model, image_path, feature_extractor, tokenizer):
    """Decode the highest-scoring token(s) the model predicts for one image.

    Args:
        model: Module called with the image's pixel values; returns logits.
        image_path: Path of the image file to read.
        feature_extractor: Image processor that produces ``pixel_values``.
        tokenizer: Tokenizer used to turn predicted ids into text.

    Returns:
        The decoded string.

    NOTE(review): taking an argmax over the logits is a placeholder decoding
    step; it may need to change once the model produces token sequences.
    """
    model.eval()
    # Drop colour information first, then expand back to three channels
    # because the ViT processor expects RGB input.
    rgb_image = Image.open(image_path).convert("L").convert("RGB")
    pixel_values = feature_extractor(images=rgb_image, return_tensors="pt").pixel_values
    with torch.no_grad():
        scores = model(pixel_values)
        best_ids = scores.argmax(-1)
        decoded = tokenizer.decode(best_ids, skip_special_tokens=True)
    return decoded


### Another Approach 3

In [None]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Load the pretrained vision encoder-decoder checkpoint together with its
# image processor and tokenizer.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Generation settings forwarded to model.generate() by predict_step.
max_length = 16
num_beams = 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
def predict_step(image_paths):
    """Generate one text prediction per image path.

    Uses the module-level ``model``, ``feature_extractor``, ``tokenizer``,
    ``device`` and ``gen_kwargs``.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        List of stripped prediction strings, one per input path, in input
        order. An empty input yields an empty list.
    """
    image_paths = list(image_paths)
    # Nothing to do for an empty batch; the image processor cannot handle
    # an empty list of images.
    if not image_paths:
        return []

    images = []
    for image_path in image_paths:
        # convert() returns a loaded RGB copy, so the file can be closed as
        # soon as the with-block exits.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]


# predict_step(['doctor.e16ba4e4.jpg'])


A