In [1]:
import pandas as pd
import json
from tqdm import tqdm

### Karpathy Splits 

In [2]:
file_path = 'dataset_flickr8k.json'
with open(file_path, 'r') as f:
    data = json.load(f)

In [3]:
file_names = []
splits = []
captions = []

for img in data['images']:
    for sent in img['sentences']:
        file_names.append(img['filename'])
        captions.append(sent['raw'])
        splits.append(img['split'])

In [4]:
df = pd.DataFrame({
    'file_name': file_names,
    'split': splits,
    'caption': captions
})

In [5]:
df.head(10)

Unnamed: 0,file_name,split,caption
0,2513260012_03d33305cf.jpg,train,A black dog is running after a white dog in th...
1,2513260012_03d33305cf.jpg,train,Black dog chasing brown dog through snow
2,2513260012_03d33305cf.jpg,train,Two dogs chase each other across the snowy gro...
3,2513260012_03d33305cf.jpg,train,Two dogs play together in the snow .
4,2513260012_03d33305cf.jpg,train,Two dogs running through a low lying body of w...
5,2903617548_d3e38d7f88.jpg,train,A little baby plays croquet .
6,2903617548_d3e38d7f88.jpg,train,A little girl plays croquet next to a truck .
7,2903617548_d3e38d7f88.jpg,train,The child is playing croquette by the truck .
8,2903617548_d3e38d7f88.jpg,train,The kid is in front of a car with a put and a ...
9,2903617548_d3e38d7f88.jpg,train,The little boy is playing with a croquet hamme...


In [6]:
df['split'].unique()

array(['train', 'val', 'test'], dtype=object)

In [7]:
df.shape

(40000, 3)

In [8]:
df.file_name.nunique()

8000

In [9]:
print(df[df['split'] == 'train'].file_name.nunique())
print(df[df['split'] == 'val'].file_name.nunique())
print(df[df['split'] == 'test'].file_name.nunique())

6000
1000
1000


In [10]:
df.to_csv("data.csv", index=False)

### Arabic

In [2]:
# make sure they have the images for the testset
df = pd.read_csv('data.csv')

In [3]:
# getting the arabic data
ar_df = pd.read_csv('ar_captions_full.txt', sep='\t', names=['file_name', 'caption'])
ar_df.head()

Unnamed: 0,file_name,caption
0,1000268201_693b08cb0e.jpg#0,طفلة صغيرة تتسلق إلى مسرح خشبي
1,1000268201_693b08cb0e.jpg#1,طفلة صغيرة تتسلق الدرج إلى منزلها
2,1000268201_693b08cb0e.jpg#2,فتاة صغيرة في ثوب وردي تذهب إلى المقصورة الخشبية
3,1001773457_577c3a7d70.jpg#0,كلب أسود وكلب ثلاثي الألوان يلعبان مع بعضهما ا...
4,1001773457_577c3a7d70.jpg#1,كلب أسود وكلب أبيض ببقع بنية يحدقان في بعضهما ...


In [4]:
ar_df.file_name = ar_df.file_name.apply(lambda x: x.split("#")[0])
ar_df.head()

Unnamed: 0,file_name,caption
0,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق إلى مسرح خشبي
1,1000268201_693b08cb0e.jpg,طفلة صغيرة تتسلق الدرج إلى منزلها
2,1000268201_693b08cb0e.jpg,فتاة صغيرة في ثوب وردي تذهب إلى المقصورة الخشبية
3,1001773457_577c3a7d70.jpg,كلب أسود وكلب ثلاثي الألوان يلعبان مع بعضهما ا...
4,1001773457_577c3a7d70.jpg,كلب أسود وكلب أبيض ببقع بنية يحدقان في بعضهما ...


In [5]:
ar_df['split'] = 'other'

In [6]:
def split(x):
    
    if x in test_names:
        return 'test'
    elif x in val_names:
        return 'val'
    elif x in train_names:
        return 'train'
    else:
        print(x)

In [7]:
test_names = df[df['split'] == 'test'].file_name.unique()
val_names = df[df['split'] == 'val'].file_name.unique()
train_names = df[df['split'] == 'train'].file_name.unique()
len(train_names), len(val_names), len(test_names)

(6000, 1000, 1000)

In [8]:
ar_df['split'] = ar_df['file_name'].apply(lambda row: split(row))
ar_df.sample(5)

1155138244_859fd6e079.jpg
1155138244_859fd6e079.jpg
1155138244_859fd6e079.jpg
1468103286_96a6e07029.jpg
1468103286_96a6e07029.jpg
1468103286_96a6e07029.jpg
1479857177_9d4a6f38fd.jpg
1479857177_9d4a6f38fd.jpg
1479857177_9d4a6f38fd.jpg
1643915227_9f48068772.jpg
1643915227_9f48068772.jpg
1643915227_9f48068772.jpg
1797554350_20998753c0.jpg
1797554350_20998753c0.jpg
1797554350_20998753c0.jpg
1808504612_3508f3c9bb.jpg
1808504612_3508f3c9bb.jpg
1808504612_3508f3c9bb.jpg
199463720_329a802206.jpg
199463720_329a802206.jpg
199463720_329a802206.jpg
2058091220_2087270068.jpg
2058091220_2087270068.jpg
2058091220_2087270068.jpg
2087317114_cf06df5aa5.jpg
2087317114_cf06df5aa5.jpg
2087317114_cf06df5aa5.jpg
2136455112_202c093ba4.jpg
2136455112_202c093ba4.jpg
2136455112_202c093ba4.jpg
2221818690_9003756d33.jpg
2221818690_9003756d33.jpg
2221818690_9003756d33.jpg
2319197581_94f807b204.jpg
2319197581_94f807b204.jpg
2319197581_94f807b204.jpg
236095031_5cb17dc54a.jpg
236095031_5cb17dc54a.jpg
236095031_5cb17dc

Unnamed: 0,file_name,caption,split
9276,2844846111_8c1cbfc75d.jpg,شخص يتسلق صخرة بينما يقف الآخرون ويراقبون,train
434,115684808_cb01227802.jpg,ثلاثة اشخاص وزلاجة,test
21612,414773731_c3f5bf43d5.jpg,كلب أسود يركض في الثلج,val
7331,261737543_b8fdc24671.jpg,شخص يقفز من صخرة عالية,train
22958,532036676_e88b13e0a1.jpg,طفل في قميص أبيض وجينز,train


In [9]:
ar_df[ar_df['split'] == 'train'].file_name.nunique() , ar_df[ar_df['split'] == 'train'].shape

(6000, (18000, 3))

In [10]:
df[df['split'] == 'train'].file_name.nunique() , df[df['split'] == 'train'].shape

(6000, (30000, 3))

In [11]:
ar_df[ar_df['split'] == 'val'].file_name.nunique() , ar_df[ar_df['split'] == 'val'].shape

(1000, (3000, 3))

In [12]:
df[df['split'] == 'val'].file_name.nunique() , df[df['split'] == 'val'].shape

(1000, (5000, 3))

In [13]:
ar_df[ar_df['split'] == 'test'].file_name.nunique() , ar_df[ar_df['split'] == 'test'].shape

(1000, (3000, 3))

In [14]:
df[df['split'] == 'test'].file_name.nunique() , df[df['split'] == 'test'].shape

(1000, (5000, 3))

In [17]:
# save arabic data file
ar_df.to_csv('ar_data.csv', index=False)