### Deliverables 2

goal: predict simple latex formula

原始論文用 CNN，但暴力作法感覺不太可行，所以我改用 CNN + RNN，前者作為特徵擷取，後者用 LSTM。

In [1]:
import json
import itertools

data preprocessing

In [2]:
with open("./image_formula_mapping.json", "r") as f:
    img2labels: dict = json.load(f)

with open("./archive/PRINTED_TEX_230k/230k.json", "r") as f:
    token2enc: dict = json.load(f)
enc2token = { v: k for k, v in token2enc.items() }

vocab_size = len(token2enc)

loading = 0.05 # used training data for model testing, ~ 11.5k
img2labels = dict(itertools.islice(
    img2labels.items(), int(len(img2labels.keys()) * loading)
))

dataset = list(img2labels.keys()) # containing pic names

In [3]:
training_ratio = 0.8
split_ind = int(len(dataset) * training_ratio)

train_key = dataset[:split_ind]
valid_key = dataset[split_ind:]

train_data = dict(itertools.islice(img2labels.items(), split_ind))
valid_data = dict(itertools.islice(img2labels.items(), split_ind, len(img2labels)))

print(len(train_data), len(valid_data))

9372 2343


making dataset

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
from torch.nn.utils.rnn import pad_sequence

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [6]:
trans = transforms.Compose([
	transforms.Resize((80, 300)), # height, width
	transforms.Grayscale(),
	transforms.ToTensor(),
	transforms.Normalize((0.5), (0.5)),
])

def encode_formula(formula: str) -> list[int]:
	padding = token2enc["<P>"]

	return [
		int(token2enc.get(t, padding)) for t in formula.split()
	]

def decode_label(label: torch.Tensor) -> list[str]:
	return [
		enc2token.get(str(c)) for c in label.tolist()
	]


class LaTexDataset(Dataset):
	def __init__(self, img2labels: dict, token2enc: dict, transform: transforms.Compose):
		self.transform = transform
		
		self.images = list(img2labels.keys())
		self.formulas = list(img2labels.values())
		
		self.token2enc = token2enc

		# const
		self.root_dir = "./archive/PRINTED_TEX_230k/generated_png_images/"

	def __len__(self):
		return len(self.images)

	def __getitem__(self, idx):
		img_name = self.images[idx]
		formula = self.formulas[idx]
		label = torch.tensor(encode_formula(formula))
		
		# get img and apply transform
		img = Image.open(f'{self.root_dir}{img_name}')
		img = self.transform(img)
		
		return img, label

In [7]:
def collate_fn(batch):
    img, label = zip(*batch)
    
    img = torch.stack(img)
    label = pad_sequence(label, batch_first=True, padding_value=2)
    label = label.long()
    
    return img.to(device), label.to(device)

train_dataset = LaTexDataset(train_data, token2enc, transform=trans)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_dataset = LaTexDataset(valid_data, token2enc, transform=trans)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [8]:
img, label = train_dataset[0]

In [9]:
print(img.size())
print(img, label)

print(' '.join(decode_label(label)))

torch.Size([1, 80, 300])
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]]) tensor([ 57, 543, 575,  21,  24, 577,  50, 543, 575,  21, 577,  57, 543, 575,
         24,  21, 577, 549,  50, 543, 575,  24, 577,  36, 549,  50, 543, 575,
         24, 577,  57, 543, 575,  21,  24, 577,  50, 543, 575,  21, 577,  57,
        543, 575,  21,  24, 577, 542, 575,  11,  21, 577,  10])
R _ { 1 2 } K _ { 1 } R _ { 2 1 } d K _ { 2 } = d K _ { 2 } R _ { 1 2 } K _ { 1 } R _ { 1 2 } ^ { - 1 } ,


In [12]:
class EncoderCNN(nn.Module):
	def __init__(self, feature_dim) -> None:
		super(EncoderCNN, self).__init__()

		# input size: (1, 80, 300)

		self.conv1 = nn.Conv2d(1, 64, 5)  # (64, 76, 296)
		self.pool = nn.MaxPool2d(2)  # (64, 38, 148)
		self.conv2 = nn.Conv2d(64, 128, 5)  # (128, 34, 144)
		# max pool -> (128, 17, 72)
		self.conv3 = nn.Conv2d(128, 256, 5)  # (256, 13, 68)
		# max pool -> (256, 6, 34)

		# flatten -> (256 * 6 * 34)
		self.dense1 = nn.Linear(256 * 6 * 34, feature_dim * 2)
		self.dense2 = nn.Linear(feature_dim * 2, feature_dim)

	def forward(self, x):
		x = self.pool(F.relu(self.conv1(x)))
		x = self.pool(F.relu(self.conv2(x)))
		x = self.pool(F.relu(self.conv3(x)))

		x = x.view(x.size(0), -1)
		x = F.relu(self.dense1(x))
		x = self.dense2(x)
		
		return x

class DecoderRNN(nn.Module):
	def __init__(self, embedding_dim, hidden_dim, vocab_size):
		super(DecoderRNN, self).__init__()
		self.embedding = nn.Embedding(vocab_size, embedding_dim)
		self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
		self.fc = nn.Linear(hidden_dim, vocab_size)

	def forward(self, features, formulas):
		# Embed the input formula tokens
		embeddings = self.embedding(formulas)

		# Concatenate features and embeddings along the sequence dimension
		embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
		
		# Pass through LSTM and then through the final linear layer
		lstm_out, _ = self.lstm(embeddings)
		outputs = self.fc(lstm_out)
		return outputs

class ImageToLaTeXModel(nn.Module):
	def __init__(self, encoder, decoder):
		super(ImageToLaTeXModel, self).__init__()
		self.encoder = encoder
		self.decoder = decoder

	def forward(self, images, formulas):
		# Encode the images
		features = self.encoder(images)  # Shape: [batch_size, feature_dim]
		
		# Decode to generate the LaTeX expression
		outputs = self.decoder(features, formulas[:, :-1])
		return outputs

In [13]:
# Hyperparameters
embed_size = 256
hidden_size = 512
num_epochs = 20
learning_rate = 0.001

# Model, loss, and optimizer
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)
model = ImageToLaTeXModel(encoder, decoder).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=2)  # 2 is assumed as <P> token
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
	print(f"epoch {epoch}")

	running_loss = 0.0
	
	for i, data in enumerate(train_loader):
		imgs, labels = data

		# Set targets: shift formula by one for teacher forcing
		targets = labels[:, 1:]

		# Forward, loss, and optimize
		outputs = model(imgs, labels[:, :-1])
		loss = criterion(outputs.view(-1, vocab_size), targets.contiguous().view(-1))
		
		optimizer.zero_grad()
		loss.backward()
		optimizer.step()

		running_loss += loss.item()

	print(f"Loss: {running_loss / len(train_loader):.4f}")

epoch 0
Loss: 3.1988
epoch 1
Loss: 2.8262
epoch 2
Loss: 2.6768
epoch 3
Loss: 2.5749
epoch 4
Loss: 2.4905
epoch 5
Loss: 2.4183
epoch 6
Loss: 2.3525
epoch 7
Loss: 2.2906
epoch 8
Loss: 2.2283
epoch 9
Loss: 2.1668
epoch 10
Loss: 2.1071
epoch 11
Loss: 2.0506
epoch 12
Loss: 1.9911
epoch 13
Loss: 1.9346
epoch 14
Loss: 1.8786
epoch 15
Loss: 1.8189
epoch 16
Loss: 1.7607
epoch 17
Loss: 1.7080
epoch 18
Loss: 1.6600
epoch 19
Loss: 1.6077


In [16]:
model.eval()  # Set the model to evaluation mode
total_loss = 0.0
correct_predictions = 0
total_samples = 0

with torch.no_grad():  # Disable gradient calculation
	for images, formulas in valid_loader:
		images, formulas = images.to(device), formulas.to(device)
		outputs = model(images, formulas[:, :-1])  # Pass images and input sequence

		# Calculate loss
		loss = criterion(outputs.view(-1, outputs.size(-1)), formulas[:, 1:].contiguous().view(-1))
		total_loss += loss.item()

		# Calculate accuracy (if applicable)
		predicted_indices = torch.argmax(outputs, dim=2)  # Get the index of the max log-probability
		correct_predictions += (predicted_indices == formulas[:, 1:].contiguous()).sum().item()
		total_samples += formulas[:, 1:].numel()  # Total number of tokens in the validation batch

		# Print images and predictions
		for i in range(len(images)):
			# Decode the actual and predicted formulas

			actual_formula = decode_label(formulas[i, :])  # Skip <S> token
			predicted_formula = decode_label(predicted_indices[i, :])

			print(f'A: {' '.join(actual_formula)}')
			print(f'P: {' '.join(predicted_formula)}')
			print()

			# break
		
		break

avg_loss = total_loss / len(valid_loader)
accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0

print(avg_loss, accuracy)

A: \frac { 1 } { b ^ { 2 } } [ l ( \omega ) - l ( \omega _ { s } ) ] < I _ { 0 } ( \kappa ) + I _ { 2 } ( \kappa ) + J ( \kappa ) . <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P> <P>
P: _ 1 } { 2 } { 2 } } ( _ _ { ) { { ( { ) ] 0 } ) ] \omega \omega \omega \omega n } ( \omega ) = 0 _ { c } ( \omega ) = 0 { { ) 0 0 { 0 { { \; \; \; \; \; \; \; f f { { { { { { { \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; \; 0 0 0 0 \phi \phi \phi \phi \phi \phi \phi \phi 0 0 0 0 0 0 0 \langle 0 0 0 0 0 0 