In [1]:
import numpy as np
import pandas as pd
import glob
import os
import requests
from docarray import DocumentArray
from docarray import dataclass
from docarray.typing import Image, Text
from docarray import Document



In [2]:
# explore unsplash dataset, if needed all csv files

path = '/home/aswin/data/unsplash-research-dataset-lite-latest/'
documents = ['photos', 'colors']
datasets = {}

for doc in documents:
    files = glob.glob(path + doc + ".tsv*")
    
    subsets = []
    for filename in files:
        df = pd.read_csv(filename, sep='\t', header=0)
        subsets.append(df)
    
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [3]:
datasets['photos'].isna().sum()

photo_id                              0
photo_url                             0
photo_image_url                       0
photo_submitted_at                    0
photo_featured                        0
photo_width                           0
photo_height                          0
photo_aspect_ratio                    0
photo_description                 14098
photographer_username                 0
photographer_first_name               0
photographer_last_name             1582
exif_camera_make                   2812
exif_camera_model                  2852
exif_iso                           3192
exif_aperture_value                3600
exif_focal_length                  3503
exif_exposure_time                 3215
photo_location_name               15309
photo_location_latitude           17853
photo_location_longitude          17856
photo_location_country            16211
photo_location_city               18856
stats_views                           0
stats_downloads                       0


In [4]:
df = datasets['photos'].dropna(axis=0, subset=['ai_description'])

In [None]:
print(len(df))

In [5]:
df.isna().sum()

photo_id                              0
photo_url                             0
photo_image_url                       0
photo_submitted_at                    0
photo_featured                        0
photo_width                           0
photo_height                          0
photo_aspect_ratio                    0
photo_description                 13325
photographer_username                 0
photographer_first_name               0
photographer_last_name             1472
exif_camera_make                   2631
exif_camera_model                  2668
exif_iso                           2984
exif_aperture_value                3371
exif_focal_length                  3281
exif_exposure_time                 3008
photo_location_name               14457
photo_location_latitude           16824
photo_location_longitude          16827
photo_location_country            15302
photo_location_city               17807
stats_views                           0
stats_downloads                       0


In [6]:
# get unique photo id's
photo_id_list = list(set(df['photo_id'].values))
print(len(photo_id_list))
# photo_id_list

23641


In [7]:
photo_image_url_list = list(set(df['photo_image_url'].values))
print(len(photo_image_url_list))
# photo_image_url_list

23641


In [8]:
photo_id_to_img_url_dict = {}

photo_id_to_img_url_dict = dict(zip(df.photo_id, df.photo_image_url))
print(len(photo_id_to_img_url_dict))
# photo_id_to_img_url_dict

23641


In [9]:
unsplash_lite_img_emb_da = DocumentArray()

for photo_id in photo_id_list:
    try:
        image_path = f'resize_images/{photo_id}.jpg'
        doc = Document(uri=image_path).load_uri_to_image_tensor()
        unsplash_lite_img_emb_da.append(doc)
    except:
        continue

In [10]:
unsplash_lite_img_emb_da.summary()

In [11]:
unsplash_lite_img_emb_da[0]

IndexError: list index out of range

### Embedding Creation by Model Inference  with ONNX

In [12]:
run_name = 'unsplash-lite-clip-run-onnx-11132022-2145'
artifact_name = 'unsplash-lite-clean-clip-onnx-model'

In [13]:
artifact_local_path = f"{artifact_name}/{run_name}.zip"

In [14]:
import finetuner

In [15]:
# Quick Test
image_da = DocumentArray([Document(uri='https://upload.wikimedia.org/wikipedia/commons/4/4e/Single_apple.png')])

clip_image_encoder = finetuner.get_model(artifact=artifact_local_path, select_model='clip-vision', is_onnx=True)

finetuner.encode(model=clip_image_encoder, data=image_da)

print(image_da.summary())
print(image_da.embeddings.shape)

2022-11-18 15:39:40.882775: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-18 15:39:41.625517: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-18 15:39:41.907831: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-18 15:39:44.669831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

ArtifactNotFound: Artifact unsplash-lite-clean-clip-onnx-model/unsplash-lite-clip-run-onnx-11132022-2145.zip not found, details: 40001: ParamValidationError: 40001

##### FillUp unsplash_lite_img_da with Embeddings created by our finetuned CLIP Image model

In [None]:
finetuner.encode(model=clip_image_encoder, data=unsplash_lite_img_emb_da)

In [None]:
unsplash_lite_img_emb_da.summary()

##### remove all tensors to decrease the size of embeddings

In [None]:
#remove all tensors to decrease the size of embeddings
del unsplash_lite_img_emb_da[:, 'tensor']

unsplash_lite_img_emb_da.summary()

#### Convert local URI to Online URI, so we can show images directly from Online 

In [None]:
for doc in unsplash_lite_img_emb_da:
    try:
#         print("original:", doc.uri)
        photo_id = str(str(doc.uri).split('/')[1]).split('.')[0]
#         print("photo_id:",photo_id)
#         print("photo_image_url:", photo_id_to_img_url_dict[photo_id])
        doc.uri = photo_id_to_img_url_dict[photo_id]
#         src_uri_unsplash_lite_emb_da.append(doc)
    except:
        print("couldn't convert=>", doc.uri)
        continue

In [None]:
unsplash_lite_img_emb_da.summary()

In [None]:
unsplash_lite_img_emb_da[0]

In [None]:
unsplash_lite_img_emb_da.save_binary("../frontend/unsplash_lite_img_emb_da.bin", compress='lz4')

In [None]:
# data_da = DocumentArray.load_binary("../frontend/unsplash_lite_img_emb_da.bin", compress='lz4')