In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
import nltk
from PIL import Image
import os
import json
import numpy as np

# Download NLTK Tokenizer
torch.manual_seed(42)
nltk.download('punkt')

# Hyperparameters
EMBED_SIZE = 256
HIDDEN_SIZE = 512
NUM_LAYERS = 1
LEARNING_RATE = 1e-3
NUM_EPOCHS = 5
BATCH_SIZE = 32

# Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# Custom Dataset
class CocoDataset(Dataset):
    def __init__(self, img_folder, captions_file, transform=None):
        self.img_folder = img_folder
        self.transform = transform
        
        with open(captions_file, 'r') as f:
            self.captions_data = json.load(f)
        
        self.images = list(self.captions_data.keys())
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        img_name = self.images[idx]
        img_path = os.path.join(self.img_folder, img_name)
        image = Image.open(img_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
        
        caption = self.captions_data[img_name]
        return image, caption

# Encoder (CNN Feature Extractor)
class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet50(pretrained=True)
        self.resnet = nn.Sequential(*list(resnet.children())[:-1])
        self.fc = nn.Linear(resnet.fc.in_features, embed_size)
        self.relu = nn.ReLU()
    
    def forward(self, images):
        features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.fc(features)
        return self.relu(features)

# Decoder (LSTM for Caption Generation)
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, features, captions):
        embeddings = self.embed(captions)
        inputs = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        lstm_out, _ = self.lstm(inputs)
        outputs = self.fc(lstm_out)
        return outputs

# Training Loop
def train_model():
    dataset = CocoDataset("images/", "captions.json", transform)
    data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    
    vocab_size = 1000  # Assume a fixed vocab size for simplicity
    encoder = EncoderCNN(EMBED_SIZE).cuda()
    decoder = DecoderRNN(EMBED_SIZE, HIDDEN_SIZE, vocab_size, NUM_LAYERS).cuda()
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=LEARNING_RATE)
    
    for epoch in range(NUM_EPOCHS):
        for i, (images, captions) in enumerate(data_loader):
            images = images.cuda()
            captions = captions.cuda()
            
            features = encoder(images)
            outputs = decoder(features, captions[:, :-1])
            loss = criterion(outputs.view(-1, vocab_size), captions[:, 1:].reshape(-1))
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if i % 10 == 0:
                print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Step [{i}/{len(data_loader)}], Loss: {loss.item():.4f}')
    
    torch.save(encoder.state_dict(), 'encoder.pth')
    torch.save(decoder.state_dict(), 'decoder.pth')


    train_model()
    print("done")
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mayank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
