In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class DevanagariDataset(Dataset):
    """Grayscale image dataset indexed from a two-level folder tree.

    Expected layout::

        root_dir/<group folder>/<prefix>_<label>[_...]/<image files>

    The text between the first and second underscore of each innermost
    folder name is parsed with ``int`` and used as the class label exactly
    as written (no re-indexing is performed).

    Args:
        root_dir: Directory that contains the group folders.
        transform: Optional callable applied to each PIL image in
            ``__getitem__``.
        max_images_per_character: Maximum number of images indexed per
            character folder. ``None`` means no limit; values ``<= 0`` index
            nothing.
    """

    def __init__(self, root_dir, transform=None, max_images_per_character=100):
        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []  # file path of every indexed image
        self.labels = []       # integer label parallel to image_paths
        self.max_images_per_character = max_images_per_character
        self.load_images()

    def load_images(self):
        """Walk ``root_dir`` and append image paths and labels to the index.

        Directory listings are sorted so that the subset selected by
        ``max_images_per_character`` is the same on every run and platform.
        Non-directory entries at either folder level are ignored.

        Raises:
            IndexError: If a non-empty character folder name contains no
                underscore.
            ValueError: If the label part of a non-empty character folder
                name is not an integer.
        """
        limit = self.max_images_per_character
        for outer_folder_name in sorted(os.listdir(self.root_dir)):
            outer_folder_path = os.path.join(self.root_dir, outer_folder_name)
            if not os.path.isdir(outer_folder_path):
                continue
            for folder_name in sorted(os.listdir(outer_folder_path)):
                folder_path = os.path.join(outer_folder_path, folder_name)
                # Check this first: a stray file next to the character
                # folders must not reach the label parsing below.
                if not os.path.isdir(folder_path):
                    continue

                candidates = (
                    os.path.join(folder_path, img_name)
                    for img_name in sorted(os.listdir(folder_path))
                )
                # Nested directories cannot be opened as images; skip them.
                img_paths = [path for path in candidates if os.path.isfile(path)]
                if limit is not None:
                    img_paths = img_paths[:max(limit, 0)]
                if not img_paths:
                    continue

                label = int(folder_name.split('_')[1])
                self.image_paths.extend(img_paths)
                self.labels.extend([label] * len(img_paths))

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``.

        The image is opened, converted to single-channel mode ``"L"`` and,
        when a transform was supplied, passed through it.
        """
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        # convert() returns a fully loaded copy, so the underlying file can be
        # closed right away instead of staying open until garbage collection.
        with Image.open(img_path) as img:
            image = img.convert("L")
        if self.transform:
            image = self.transform(image)
        return image, label


# Define transformations for preprocessing
# --- Configuration -----------------------------------------------------------
# Root of the training images. Set the DEVANAGARI_TRAIN_DIR environment
# variable to run on another machine; the fallback is the original local path.
TRAIN_DIR = os.environ.get(
    "DEVANAGARI_TRAIN_DIR",
    r'C:\Users\asus\Desktop\chatbot\IIT ROORKEE\Train',
)
IMAGE_SIZE = (128, 128)        # (height, width) passed to transforms.Resize
BATCH_SIZE = 32
MAX_IMAGES_PER_CHARACTER = 10  # cap on images indexed per character folder

# Resize, convert to a tensor, then normalize the single channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Create the dataset with a limit on the number of images per character
train_dataset = DevanagariDataset(
    root_dir=TRAIN_DIR,
    transform=transform,
    max_images_per_character=MAX_IMAGES_PER_CHARACTER,
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Sanity check: print the shape of images and labels in the first batch only
for images, labels in train_loader:
    print("Batch of images shape:", images.shape)
    print("Batch of labels:", labels)
    break


Batch of images shape: torch.Size([32, 1, 128, 128])
Batch of labels: tensor([ 7,  1,  7, 19,  4, 33, 33, 19, 16, 26, 32, 22, 17, 28, 28, 13,  4,  6,
         1,  7,  9,  1,  5, 31, 25,  4, 16, 16, 31,  2,  7, 18])


In [3]:
# NOTE(review): `model` is not defined by any cell above this one in the
# visible notebook; it is created by the model-loading cell further down.
# Run that cell first, otherwise this cell raises NameError on a fresh kernel.
device = torch.device("cpu")  # or "cuda" if you want to use GPU
model.to(device)


GOTQwenForCausalLM(
  (model): GOTQwenModel(
    (embed_tokens): Embedding(151860, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
    (vision_tower

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from torch import nn, optim


# Load the pretrained GOT-OCR 2.0 tokenizer and model. `trust_remote_code=True`
# executes Python code shipped with the checkpoint repository.
tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True)

# Choose ONE device and place every module on it. Previously the model was
# moved to CUDA and then back to CPU while the classifier head stayed on CUDA,
# leaving the parameters split across two devices.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.eval().to(device)  # train_model() switches to train mode later

num_classes = 46
# Assigning an nn.Module attribute registers it as a submodule, so its weights
# are included in model.parameters() and therefore in the optimizer below.
# NOTE(review): nothing visible here makes the model's forward pass call
# `classifier` — confirm the head is actually applied to the outputs.
model.classifier = nn.Linear(model.config.hidden_size, num_classes).to(device)


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


def train_model(model, train_loader, criterion, optimizer, num_epochs=10, device=None):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module called as ``model(images)``; its output is passed to
            ``criterion`` together with the labels.
        train_loader: Iterable yielding ``(images, labels)`` tensor pairs.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        num_epochs: Number of full passes over ``train_loader``.
        device: Device that each batch is moved to. When ``None``, the device
            of the model's first parameter is used (CPU if the model has no
            parameters).

    Raises:
        ValueError: If ``train_loader`` yields no batches in an epoch.
    """
    if device is None:
        # Follow the model instead of assuming CUDA, so the batches always
        # land on the same device as the weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader produced no batches")
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/num_batches}")


# Start training with the default num_epochs=10; the epoch losses are printed.
train_model(model, train_loader, criterion, optimizer)


In [4]:
%pip install tiktoken

Collecting tiktokenNote: you may need to restart the kernel to use updated packages.

  Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Downloading tiktoken-0.7.0-cp311-cp311-win_amd64.whl (799 kB)
   ---------------------------------------- 0.0/799.0 kB ? eta -:--:--
   --------- ------------------------------ 184.3/799.0 kB 5.4 MB/s eta 0:00:01
   -------------------- ------------------- 409.6/799.0 kB 4.2 MB/s eta 0:00:01
   ------------------------------- -------- 624.6/799.0 kB 4.9 MB/s eta 0:00:01
   ---------------------------------------- 799.0/799.0 kB 4.2 MB/s eta 0:00:00
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [6]:
%pip install verovio

Collecting verovioNote: you may need to restart the kernel to use updated packages.

  Downloading verovio-4.3.1-cp311-cp311-win_amd64.whl.metadata (9.2 kB)
Downloading verovio-4.3.1-cp311-cp311-win_amd64.whl (4.8 MB)
   ---------------------------------------- 0.0/4.8 MB ? eta -:--:--
   - -------------------------------------- 0.2/4.8 MB 5.0 MB/s eta 0:00:01
   --- ------------------------------------ 0.4/4.8 MB 4.6 MB/s eta 0:00:01
   ---- ----------------------------------- 0.6/4.8 MB 4.5 MB/s eta 0:00:01
   ------ --------------------------------- 0.8/4.8 MB 4.7 MB/s eta 0:00:01
   ------ --------------------------------- 0.8/4.8 MB 4.7 MB/s eta 0:00:01
   ------- -------------------------------- 0.8/4.8 MB 3.0 MB/s eta 0:00:02
   -------- ------------------------------- 1.0/4.8 MB 3.2 MB/s eta 0:00:02
   -------- ------------------------------- 1.0/4.8 MB 2.8 MB/s eta 0:00:02
   -------- ------------------------------- 1.0/4.8 MB 2.8 MB/s eta 0:00:02
   -------- -----------------