In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od

od.download('https://www.kaggle.com/datasets/ingbiodanielh/vizwiz')

In [None]:
!ls /content/vizwiz/data/Images | wc -l

In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
[0mCollecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-0rrjo_x5
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-0rrjo_x5
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25ldone
[?25h  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369410 sha256=a317cc0e15a20e0805963f7e3cb8b59264ebb4079c3ed37ac02b9541fd21b833
  Stored in directory: /tmp/pip-ephem-wheel-cache-w8_h42qs/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b11

In [2]:
import clip
import json
import numpy as np
import torch
import torchtext
from PIL import Image
from IPython.display import Image as IM
from IPython.display import display
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

device = "cuda" if torch.cuda.is_available() else "cpu"

In [54]:
model, preprocess = clip.load('ViT-B/32', device=device)

In [None]:
img = Image.open('/content/vizwiz/data/Images/VizWiz_train_000000000000.jpg')

In [None]:
img.show()

In [None]:
display(IM(filename='/kaggle/input/vizwiz/data/Images/VizWiz_train_000000000001.jpg'))

In [4]:
data = None

with open('/kaggle/input/vizwiz/data/Annotations/train.json') as f:
  data = json.load(f)

In [52]:
with open('/kaggle/input/vizwiz/data/Annotations/val.json') as f:
  val_data = json.load(f)

In [5]:
def get_answers_from_dict(answers_dict):
    answers = [entry['answer'] for entry in answers_dict]
    return answers

In [6]:
def get_answer(model, preprocess, img_path, answers):
    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
    text = clip.tokenize(answers).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        logits_per_image, logits_per_text = model(image, text)
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    return np.argmax(probs)

In [None]:
get_answer(model, preprocess, '/kaggle/input/vizwiz/data/Images/VizWiz_train_000000000001.jpg', get_answers_from_dict(data[1]['answers']))

In [None]:
get_answers_from_dict(data[1]['answers'])

In [None]:
df = []

for i,entry in enumerate(data):
    idx = get_answer(model, preprocess, '/kaggle/input/vizwiz/data/Images/'+entry['image'], get_answers_from_dict(entry['answers']))
    df.append([entry['image'], entry['answerable'], entry['question'], entry['answer_type'], entry['answers'][idx]['answer']])
    if i%1000==0:
        print('Checkpoint at: ' + str(i))
        
df = np.array(df)

In [None]:
with open('/kaggle/working/df.npy', 'wb') as f:
    np.save(f,df)

In [7]:
with open('/kaggle/input/dfasdasd/df.npy', 'rb') as f: 
    df = np.load(f)

In [8]:
answers = np.array([entry[4] for entry in df])

In [9]:
df.shape

(20000, 5)

In [12]:
answer_types = np.array([entry[3] for entry in df])

In [55]:
val_df = []

for i,entry in enumerate(val_data):
    idx = get_answer(model, preprocess, '/kaggle/input/vizwiz/data/Images/'+entry['image'], get_answers_from_dict(entry['answers']))
    val_df.append([entry['image'], entry['answerable'], entry['question'], entry['answer_type'], entry['answers'][idx]['answer']])
    if i%1000==0:
        print('Checkpoint at: ' + str(i))
        
val_df = np.array(val_df)

Checkpoint at: 0
Checkpoint at: 1000
Checkpoint at: 2000
Checkpoint at: 3000


In [62]:
with open('/kaggle/working/val_df1.npy', 'wb') as f:
    np.save(f,val_df)

In [None]:
with open('/kaggle/input/dfasdasd/val_df.npy', 'rb') as f: 
    val_df = np.load(f)

In [57]:
val_answers = np.array([entry[4] for entry in val_df])

In [58]:
val_answer_types = np.array([entry[3] for entry in val_df])

In [60]:
def create_samples(model, preprocess, img_path, question, device, rotate=True):
    samples = []
    img = Image.open(img_path)
    
    encoded_q = model.encode_text(clip.tokenize(question).to(device))
    encoded_img = model.encode_image(preprocess(img).unsqueeze(0).to(device))
    samples.append(torch.hstack([encoded_img, encoded_q]))
    if not rotate:
        return torch.vstack(samples)

    # Rotate Image By 90 Degree
    rotated_image1 = img.rotate(90)
    encoded_img = model.encode_image(preprocess(rotated_image1).unsqueeze(0).to(device))
    samples.append(torch.hstack([encoded_img, encoded_q]))

    # Rotate Image By 180 Degree
    rotated_image2 = img.rotate(180)
    encoded_img = model.encode_image(preprocess(rotated_image2).unsqueeze(0).to(device))
    samples.append(torch.hstack([encoded_img, encoded_q]))

    # Rotate Image By 270 Degree
    rotated_image3 = img.rotate(270)
    encoded_img = model.encode_image(preprocess(rotated_image3).unsqueeze(0).to(device))
    samples.append(torch.hstack([encoded_img, encoded_q]))
        
    return torch.vstack(samples)
        

In [None]:
X = []

with torch.no_grad():
  for i, entry in enumerate(df):
    X.extend(create_samples(model, preprocess, '/kaggle/input/vizwiz/data/Images/'+entry[0], entry[2], device))
    if i%1000==0:
        print('Checkpoint at: ' + str(i))

X = torch.vstack(X)

In [None]:
torch.save(X,'/kaggle/working/X.pt')

In [14]:
X = torch.load('/kaggle/input/dfasdasd/X.pt')

In [61]:
X_val = []

with torch.no_grad():
  for i, entry in enumerate(val_df):
    X_val.extend(create_samples(model, preprocess, '/kaggle/input/vizwiz/data/Images/'+entry[0], entry[2], device, rotate=False))
    if i%1000==0:
        print('Checkpoint at: ' + str(i))

X_val = torch.vstack(X_val)

Checkpoint at: 0
Checkpoint at: 1000
Checkpoint at: 2000
Checkpoint at: 3000


In [63]:
torch.save(X_val,'/kaggle/working/X_val.pt')

In [None]:
X_val = torch.load('/kaggle/input/dfasdasd/X_val.pt')

In [15]:
lb_types = preprocessing.LabelBinarizer()

encoded_answer_types = lb_types.fit_transform(answer_types)

print(encoded_answer_types[10])

[0 1 0 0]


In [16]:
lb_types.classes_

array(['number', 'other', 'unanswerable', 'yes/no'], dtype='<U12')

In [17]:
lb_answers = preprocessing.LabelBinarizer()

encoded_answers = lb_answers.fit_transform(answers)

print(encoded_answers[10])

[0 0 0 ... 0 0 0]


In [42]:
lb_answers.classes_.shape

(11778,)

In [18]:
y = [(a_type, a) for a_type, a in zip(encoded_answer_types, encoded_answers)]

In [37]:
tmp_y = y
y = []
for t in tmp_y:
    for i in range(4):
        y.append(t)

In [65]:
val_encoded_answer_types = lb_types.transform(val_answer_types)
val_encoded_answers = lb_answers.transform(val_answers)
y_val = [(a_type, a) for a_type, a in zip(val_encoded_answer_types, val_encoded_answers)]

# This is a problem :)

In [73]:
c = 0
for ans in val_encoded_answers:
    c += ans.sum()
print(c)
print(len(val_encoded_answers))

1553
3173


In [74]:
c = 0
for ans in val_encoded_answer_types:
    c += ans.sum()
print(c)
print(len(val_encoded_answer_types))

3173
3173


In [25]:
X = X.cpu().detach().numpy()

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [39]:
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class
class VQADataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.atypes = []
        self.answers = []
        for atype, ans in labels:
          self.atypes.append(atype)
          self.answers.append(ans)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        x = torch.Tensor(self.data[index])
        at, a = torch.Tensor(self.atypes[index]), torch.Tensor(self.answers[index])

        return x, at, a

In [71]:
train_set = DataLoader(VQADataset(X_train, y_train), batch_size=512)
test_set = DataLoader(VQADataset(X_test, y_test), batch_size=512)
val_set = DataLoader(VQADataset(X_val, y_val), batch_size=512)

In [67]:
class VQA_Network(nn.Module):
    def __init__(self, num_classes, vocab_size, hidden_dim, embedding_dim):
        super(VQA_Network, self).__init__()

        self.fc = nn.Linear(embedding_dim, hidden_dim)
        self.fc2_answers = nn.Linear(hidden_dim, vocab_size)

        self.fc2_aux = nn.Linear(hidden_dim, num_classes)
        self.fc3_aux = nn.Linear(num_classes, vocab_size)

        self.norm = nn.LayerNorm(hidden_dim)
        self.dropout = nn.Dropout()
        self.softmax = nn.Softmax(dim=0)

    def forward(self, x):
        x = x.to(dtype=torch.float32)
        x = self.fc(x)
        x = self.norm(x)
        x = self.dropout(x)

        atype = self.softmax(self.fc2_aux(x))
        aux = self.fc3_aux(atype)

        answers = self.fc2_answers(x)

        answers = answers * aux

        # answers = answers.to(dtype=torch.int64)

        return self.softmax(answers), atype


In [72]:
num_classes = 4
vocab_size = 11778
hidden_dim = 512
embedding_dim = 1024

num_epochs = 200

# Instantiate the model
model = VQA_Network(num_classes, vocab_size, hidden_dim, embedding_dim).to(device)

# Define loss function
criterion = nn.CrossEntropyLoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss_a = 0.0
    running_loss_at = 0.0
    acc_a = 0
    acc_at = 0
    val_running_loss_a = 0.0
    val_running_loss_at = 0.0
    val_acc_a = 0
    val_acc_at = 0

    for xs, atypeset, answerset in train_set:
        # Move data to the device
        xs = xs.to(device)
        atypeset = atypeset.to(device)
        answerset = answerset.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass
        answers, atypes = model(xs)
        # print(xs.size())
        # print(labelset.size())
        # Compute loss
        answer_loss = criterion(answers, answerset)

        # Backward pass
        answer_loss.backward(retain_graph=True)

        # Compute loss
        atype_loss = criterion(atypes, atypeset)

        # Backward pass
        atype_loss.backward()

        # Update weights
        optimizer.step()

        # Update running loss
        running_loss_a += answer_loss.item()
        running_loss_at += atype_loss.item()
        acc_at += (torch.argmax(atypes, 1) == torch.argmax(atypeset, 1)).float().sum()
        acc_a += (torch.argmax(answers, 1) == torch.argmax(answerset, 1)).float().sum()

    acc_a = acc_a / len(y_train)
    acc_at = acc_at / len(y_train)
    # Calculate average loss for the epoch
    average_loss_a = running_loss_a / len(y_train)
    average_loss_at = running_loss_at / len(y_train)

    # Print progress
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss_a: {average_loss_a:.4f}, Loss_at: {average_loss_at:.4f}", end=' ')
    print(f"Acc_a: {acc_a:.4f}, Acc_at: {acc_at:.4f}", end='\t')
    
    with torch.no_grad():
        model.eval()
        for xs, atypeset, answerset in val_set:
            # Move data to the device
            xs = xs.to(device)
            atypeset = atypeset.to(device)
            answerset = answerset.to(device)

            # Forward pass
            answers, atypes = model(xs)
            # print(xs.size())
            # print(labelset.size())
            # Compute loss
            answer_loss = criterion(answers, answerset)

            # Compute loss
            atype_loss = criterion(atypes, atypeset)

            # Update running loss
            val_running_loss_a += answer_loss.item()
            val_running_loss_at += atype_loss.item()
            val_acc_at += (torch.argmax(atypes, 1) == torch.argmax(atypeset, 1)).float().sum()
            val_acc_a += (torch.argmax(answers, 1) == torch.argmax(answerset, 1)).float().sum()

        val_acc_a = val_acc_a / len(y_val)
        val_acc_at = val_acc_at / len(y_val)
        # Calculate average loss for the epoch
        val_average_loss_a = val_running_loss_a / len(y_val)
        val_average_loss_at = val_running_loss_at / len(y_val)

        # Print progress
        print(f"Validation: Loss_a: {val_average_loss_a:.4f}, Loss_at: {val_average_loss_at:.4f}", end=' ')
        print(f"Acc_a: {val_acc_a:.4f}, Acc_at: {val_acc_at:.4f}")


# Training complete


Epoch [1/200], Loss_a: 0.0184, Loss_at: 0.0027 Acc_a: 0.0088, Acc_at: 0.5099	Validation: Loss_a: 0.0102, Loss_at: 0.0030 Acc_a: 0.0552, Acc_at: 0.5979
Epoch [2/200], Loss_a: 0.0184, Loss_at: 0.0027 Acc_a: 0.0419, Acc_at: 0.5653	Validation: Loss_a: 0.0102, Loss_at: 0.0030 Acc_a: 0.0637, Acc_at: 0.5830
Epoch [3/200], Loss_a: 0.0184, Loss_at: 0.0027 Acc_a: 0.0498, Acc_at: 0.5694	Validation: Loss_a: 0.0101, Loss_at: 0.0030 Acc_a: 0.0561, Acc_at: 0.6061
Epoch [4/200], Loss_a: 0.0184, Loss_at: 0.0027 Acc_a: 0.0590, Acc_at: 0.5708	Validation: Loss_a: 0.0101, Loss_at: 0.0030 Acc_a: 0.0539, Acc_at: 0.6120
Epoch [5/200], Loss_a: 0.0184, Loss_at: 0.0027 Acc_a: 0.0754, Acc_at: 0.5806	Validation: Loss_a: 0.0101, Loss_at: 0.0030 Acc_a: 0.0596, Acc_at: 0.5846
Epoch [6/200], Loss_a: 0.0183, Loss_at: 0.0027 Acc_a: 0.0984, Acc_at: 0.5566	Validation: Loss_a: 0.0101, Loss_at: 0.0030 Acc_a: 0.0608, Acc_at: 0.5736
Epoch [7/200], Loss_a: 0.0183, Loss_at: 0.0027 Acc_a: 0.1280, Acc_at: 0.5501	Validation: Loss_

KeyboardInterrupt: 

In [None]:
y_test[3]

In [None]:
print(model(X_test[3]))