In [None]:
import numpy as np
import pandas as pd
import glob
import os
import requests
from docarray import DocumentArray
from docarray import dataclass
from docarray.typing import Image, Text
from docarray import Document

In [None]:
# Download Unsplash lite dataset
# !wget https://unsplash.com/data/lite/latest

In [None]:
# Load the Unsplash dataset tables. Each table may be split across several
# tab-separated files named '<table>.tsv*'; all parts of a table are
# concatenated into a single DataFrame stored in `datasets[<table>]`.

path = './'
documents = ['photos', 'colors']
datasets = {}

for doc in documents:
    # glob returns matches in arbitrary order; sort them so the row order of
    # the concatenated table (and the positional train/eval split made later
    # in this notebook) is the same on every run and machine.
    files = sorted(glob.glob(os.path.join(path, doc + ".tsv*")))
    if not files:
        # Without this check pd.concat fails with "No objects to concatenate",
        # which does not say which file is missing.
        raise FileNotFoundError(
            f"no files matching '{doc}.tsv*' found in '{path}'"
        )

    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]

    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [None]:
# Display the concatenated photos table.
datasets['photos']

#### Data Cleaning

In [None]:
# Count missing values per column of the photos table.
datasets['photos'].isna().sum()

In [None]:
# Keep only the photos whose 'ai_description' is present.
df = datasets['photos'].loc[lambda frame: frame['ai_description'].notna()]

In [None]:
# Number of rows left after dropping photos without a description.
print(df.shape[0])

In [None]:
# Count missing values per column of the filtered table.
df.isna().sum()

In [None]:
# get photo id's, in DataFrame row order
# NOTE: do not pass these through set(): a set has no defined order, and this
# list is later zipped position-by-position with `ai_description_list`, so any
# reordering would pair each image with another photo's description.
photo_id_list = df['photo_id'].tolist()
print(len(photo_id_list))
# preview only; the full list is one id per remaining row
photo_id_list[:10]

In [None]:
# Pull the 'ai_description' column out of the filtered table, in row order.
ai_description_list = df.loc[:, 'ai_description'].values
print(ai_description_list.shape[0])
ai_description_list

#### Dataset (DocumentArray) Creation

In [None]:
@dataclass
class Photo:
    """Schema for one dataset entry: an image paired with its text description."""
    # image field, typed with docarray's Image
    image: Image
    # text field, typed with docarray's Text
    description: Text

In [None]:
# Build one Document per (photo id, description) pair and collect them in `da`.
# Pairs whose Document cannot be built are skipped on purpose (best effort),
# but they are recorded so the loss is visible instead of silent.
da = DocumentArray()
skipped_ids = []
for photo_id, ai_description in zip(photo_id_list, ai_description_list):
    try:
        photo = Photo(
            image = f'resize_images/{photo_id}.jpg',
            description = f'{ai_description}',
        )
        doc = Document(photo)
        da.append(doc)
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop this long-running loop.
        # Presumably failures come from a missing or unreadable image file
        # under resize_images/ -- TODO confirm.
        skipped_ids.append(photo_id)

print(f'built {len(da)} documents, skipped {len(skipped_ids)}')

In [None]:
# Print a summary of the full DocumentArray.
da.summary()

In [None]:
# Show the first document of the array.
da[0]

In [None]:
# create train and eval da
# Both slices share one split index so they are contiguous: the previous
# bounds (da[:22599] and da[22600:]) left the document at index 22599 out of
# both splits.
TRAIN_SIZE = 22599
train_da = da[:TRAIN_SIZE]
eval_da = da[TRAIN_SIZE:]

In [None]:
# Print a summary of the training split.
train_da.summary()

In [None]:
# Print a summary of the evaluation split.
eval_da.summary()

In [None]:
# push the local evaluation split to jina cloud for storage,
# under the name 'unsplash-lite-clean-eval-data-clip'
eval_da.push(name='unsplash-lite-clean-eval-data-clip', show_progress=True)

In [None]:
# push the local training split to jina cloud for storage,
# under the name 'unsplash-lite-clean-train-data-clip'
train_da.push(name='unsplash-lite-clean-train-data-clip', show_progress=True)

#### Finish