In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [1]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
class Vocabulary:
    def __init__(self, freq_threshold=2, max_len=100):
        self.freq_threshold = freq_threshold
        self.max_len = max_len
        self.itos = {0: "<pad>", 1: "<sos>", 2: "<eos>", 3: "<unk>"}
        self.stoi = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
        
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenize_en(text):
        return [tok.lower() for tok in araby.tokenize(text)]
    
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4
        
        for sentence in sentence_list:
            for word in self.tokenize_en(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1
    def numericalize(self, tokens, cap_len):
        return [self.stoi['<sos>']] + [self.stoi[token] if token in self.stoi else self.stoi["<unk>"]
                                       for token in tokens] + [self.stoi['<eos>']] + [self.stoi['<pad>']] * (self.max_len - cap_len)

    def indextostring(self, idx):
        sent_tokens = []
        for sent in idx:
            sent_tokens.append([self.itos[i] for i in sent if i not in {self.stoi['<sos>'], self.stoi['<pad>'], self.stoi['<eos>']}])
        return sent_tokens

In [4]:
# Model parameters
encoder_dim = 2048 # resnet101
emb_dim = 512  # dimension of word embeddings
attention_dim = 512  # dimension of attention linear layers
decoder_dim = 512  # dimension of decoder RNN
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # sets device for model and PyTorch tensors
cudnn.benchmark = True  # set to true only if inputs to model are fixed size; otherwise lot of computational overhead

# training parameters
epochs = 30  # number of epochs to train for (if early stopping is not triggered)
batch_size = 256
workers = 2
encoder_lr = 1e-4  # learning rate for encoder if fine-tuning
decoder_lr = 4e-4  # learning rate for decoder
fine_tune_encoder = False  # fine-tune encoder?
pretrained_embeddings = False
fine_tune_embeddings = False
checkpoint = None  # path to checkpoint, None if none


In [None]:
DATA_NAME = 'flickr8k_ar'

# local
# DATA_JSON_PATH = 'data.json'
# IMGS_PATH = 'flickr/Images/'
# kaggle paths
# DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
# IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
#colab
DATA_JSON_PATH = 'Image-Captioning/data.json'
IMGS_PATH = 'flickr8k/images/'

In [5]:
import pandas as pd

df = pd.read_csv("ar_data.csv")
df.head()

Unnamed: 0,file_name,caption,split
0,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق إلى مسرح خشبي,train
1,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق الدرج إلى منزلها,train
2,1000268201_693b08cb0e.jpg,فتاة صغيرة في ثوب وردي تذهب إلى المقصورة الخشبية,train
3,1001773457_577c3a7d70.jpg,كلب أسود وكلب ثلاثي الألوان يلعبان مع بعضهما ا...,train
4,1001773457_577c3a7d70.jpg,كلب أسود وكلب أبيض ببقع بنية يحدقان في بعضهما ...,train


In [11]:
df.sample(10)

Unnamed: 0,file_name,caption,split
10630,2969380952_9f1eb7f93b.jpg,رجل يسير في طريق السباق,train
16941,3437693401_202afef348.jpg,كلب أبيض وكلب بني يلعب بعنف,train
16083,3380407617_07b53cbcce.jpg,طفل يقف في الثلج,train
13762,3223709894_97824ba76f.jpg,كلب رقيق يجري على العشب,val
3302,2151300603_248a9fe715.jpg,امرأة جالسة في الخلفية على طاولة,train
20001,3640329164_20cb245fd5.jpg,راكب الأمواج يركب الأمواج,train
7316,2616284322_b13e7c344e.jpg,الكلب صاحب الرقم أربعة أمام الكلاب الأخرى,train
16822,3430782104_34da558eba.jpg,فتاة على حصان يحاول التقاط ثور بحبل,
4038,2258662398_2797d0eca8.jpg,طفل على لوح ركوب الأمواج يركب موجة صغيرة,train
12355,3122497129_d08f5729b8.jpg,كلبان على الثلج يلعبان برباط أحمر,train


In [12]:
vocab = Vocabulary()
vocab.build_vocabulary()

TypeError: build_vocabulary() missing 1 required positional argument: 'sentence_list'