In [None]:
import numpy as np
import pandas as pd
import glob
import os
import requests
from docarray import DocumentArray
from docarray import dataclass
from docarray.typing import Image, Text
from docarray import Document

In [None]:
# Load the Unsplash dataset tables. Every file matching "<table>.tsv*" under
# `path` is read as a tab-separated file with a header row, and all parts of a
# table are concatenated into one DataFrame stored in `datasets[<table>]`.

path = './'
documents = ['photos', 'colors']
datasets = {}

for doc in documents:
    # glob returns matches in arbitrary order; sort so the row order of the
    # concatenated table is the same on every run.
    files = sorted(glob.glob(os.path.join(path, doc + ".tsv*")))
    if not files:
        # Fail with an explicit message instead of pd.concat's
        # "No objects to concatenate".
        raise FileNotFoundError(f"no files matching '{doc}.tsv*' found in '{path}'")

    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [None]:
# Count the missing values in each column of the photos table.
datasets['photos'].isna().sum()

In [None]:
# Discard every photo row whose 'ai_description' is missing.
photos_raw = datasets['photos']
df = photos_raw.dropna(subset=['ai_description'], axis='index')

In [None]:
# Number of photo rows remaining in df.
print(df.shape[0])

In [None]:
# Per-column missing-value counts for the filtered frame df.
df.isna().sum()

In [None]:
# Unique photo ids, in order of first appearance in df.
# Series.unique() keeps this order stable across kernel restarts, whereas
# list(set(...)) of strings can come out in a different order on every run
# (and with it the order of everything built from this list).
photo_id_list = df['photo_id'].unique().tolist()
print(len(photo_id_list))

In [None]:
# Unique image URLs, in order of first appearance in df (Series.unique() gives
# a reproducible order, unlike iterating a set of strings).
photo_image_url_list = df['photo_image_url'].unique().tolist()
print(len(photo_image_url_list))

In [None]:
# Map each photo id to its image URL. If a photo id occurs in several rows,
# the URL of the last such row is kept.
photo_id_to_img_url_dict = dict(zip(df['photo_id'], df['photo_image_url']))
print(len(photo_id_to_img_url_dict))

In [None]:
# Load the resized image of every photo id from 'resize_images/<photo_id>.jpg'
# into a DocumentArray, one Document (with its image tensor) per photo.
unsplash_lite_img_emb_da = DocumentArray()
failed_photo_ids = []  # ids whose image could not be loaded

for photo_id in photo_id_list:
    image_path = f'resize_images/{photo_id}.jpg'
    try:
        image_doc = Document(uri=image_path).load_uri_to_image_tensor()
    except Exception:
        # Best-effort load: skip images that are missing or unreadable, but
        # remember them. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt through so this long loop can still be stopped.
        failed_photo_ids.append(photo_id)
        continue
    unsplash_lite_img_emb_da.append(image_doc)

print(f"loaded {len(unsplash_lite_img_emb_da)} images, skipped {len(failed_photo_ids)}")

In [None]:
# Show an overview of the loaded image documents.
unsplash_lite_img_emb_da.summary()

In [None]:
# Inspect the first loaded document.
unsplash_lite_img_emb_da[0]

### Embedding creation by model inference with ONNX

In [None]:
# Run and artifact names; together they form the local path of the model
# artifact zip ("<artifact_name>/<run_name>.zip").
run_name = 'unsplash-lite-clip-run-onnx-11132022-2145'
artifact_name = 'unsplash-lite-clean-clip-onnx-model'

In [None]:
# Local path of the model artifact archive: "<artifact_name>/<run_name>.zip".
artifact_local_path = "{}/{}.zip".format(artifact_name, run_name)

In [None]:
import finetuner

In [None]:
# Quick test: encode a single online image with the CLIP vision encoder loaded
# from the local ONNX artifact, then check that an embedding was produced.
image_da = DocumentArray([Document(uri='https://upload.wikimedia.org/wikipedia/commons/4/4e/Single_apple.png')])

clip_image_encoder = finetuner.get_model(artifact=artifact_local_path, select_model='clip-vision', is_onnx=True)

# encode() writes the embeddings onto the documents of `image_da`.
finetuner.encode(model=clip_image_encoder, data=image_da)

# summary() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
image_da.summary()
print(image_da.embeddings.shape)

##### Fill `unsplash_lite_img_emb_da` with embeddings created by our fine-tuned CLIP image model

In [None]:
# Encode every loaded image with the CLIP image encoder. As in the quick test,
# the result is not returned; the embeddings are read back from the
# DocumentArray that was passed as `data`.
finetuner.encode(model=clip_image_encoder, data=unsplash_lite_img_emb_da)

In [None]:
# Check the summary again after encoding.
unsplash_lite_img_emb_da.summary()

##### Remove all image tensors to reduce the size of the saved DocumentArray

In [None]:
# Delete the 'tensor' attribute of every document. The remaining cells only
# touch the URIs and save the array, so dropping the image tensors keeps the
# saved DocumentArray small.
del unsplash_lite_img_emb_da[:, 'tensor']

unsplash_lite_img_emb_da.summary()

#### Convert local URIs to online URLs so images can be shown directly from the web

In [None]:
# Replace each document's local file URI with the photo's online image URL.
for doc in unsplash_lite_img_emb_da:
    try:
        # Local URIs were built as 'resize_images/<photo_id>.jpg': take the
        # file name without directory and extension rather than relying on
        # fixed split positions.
        photo_id = os.path.splitext(os.path.basename(str(doc.uri)))[0]
        doc.uri = photo_id_to_img_url_dict[photo_id]
    except Exception:
        # Best-effort: leave the document unchanged and report it, e.g. when
        # no URL is known for the id. `Exception` (rather than a bare
        # `except`) still lets KeyboardInterrupt stop the loop.
        print("couldn't convert=>", doc.uri)

In [None]:
# Summary after the URI rewrite.
unsplash_lite_img_emb_da.summary()

In [None]:
# Inspect the first document to check its rewritten URI.
unsplash_lite_img_emb_da[0]

In [None]:
# Save the DocumentArray as an LZ4-compressed binary file in the frontend folder.
unsplash_lite_img_emb_da.save_binary("../frontend/unsplash_lite_img_emb_da.bin", compress='lz4')

In [None]:
# To load the saved DocumentArray back later, run:
# data_da = DocumentArray.load_binary("../frontend/unsplash_lite_img_emb_da.bin", compress='lz4')