In [1]:
import sys
sys.path.append('/app/')  # noqa
sys.path.append('/app/loopa')  # noqa

from jina import Document, DocumentArray, Flow, Executor, requests
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import argparse
from loopa.executors.cache import SBERTEncoderCache
from loopa.executors.index import FaissIndexer, IndexMerger
from loopa.config import CUDA_IF_AVAILABLE
import torch
import hashlib

def my_hash(s):
    """Return the MD5 digest of ``str(s)`` as a decimal string.

    The value is converted with ``str``, encoded as UTF-8 and hashed with
    MD5; the 128-bit digest is then read as one big-endian unsigned integer
    and rendered in base 10.
    """
    payload = str(s).encode('utf-8')
    digest = hashlib.md5(payload).digest()
    return str(int.from_bytes(digest, 'big'))

In [2]:
# Force the 'spawn' start method for worker processes, overriding any method
# that was already set (force=True).
# NOTE(review): presumably needed so CUDA can be used in child processes — confirm.
torch.multiprocessing.set_start_method(method='spawn', force=True)

In [3]:
# Directory holding the imgflip language/image pair data.
OUT_DIR = Path("/data/imgflip/scrap_language_image_pairs_20220209/")
# CSV with the template texts, located directly inside OUT_DIR.
OUT_TEMPLATE_TEXTS_CSV = OUT_DIR.joinpath("template_texts.csv")

In [47]:
# Collection name handed to the encoder cache as "collection_name" in cache_params.
JINA_SBERT_EMBEDDING_TEMPLATE_TEXT_COLLECTION = "01_sbert_all_mpnet_base_v2_imgflip_template_100k_embeddings"
# Database name handed to the encoder cache as "db_name" in cache_params
# (a MongoDB database, judging by the constant's name — confirm).
MONGO_EMBEDDING_DB_NAME = 'memlish_db'

In [46]:
def text_input_docs(df):
    """Lazily build one jina ``Document`` per row of ``df``.

    Each document uses the row's ``text`` value as its text and gets the id
    ``"<template_img_name>_<my_hash(text)>"``.

    Args:
        df: DataFrame with ``template_img_name`` and ``text`` columns.

    Yields:
        Document: one document for every row that could be converted.

    Rows for which the id or the document cannot be built are reported on
    stdout (id if available, text, error message) and skipped.
    """
    # Iterating the two needed columns directly avoids building a Series per
    # row, which is what iterrows() does.
    for template_name, template_text in zip(df["template_img_name"], df["text"]):
        # Pre-bind the id so the error report below never hits an unbound name
        # when hashing is the step that failed.
        document_id = None
        try:
            document_id = f"{template_name}_{my_hash(template_text)}"
            doc = Document(id=document_id, text=template_text)
        except Exception as e:
            print(document_id)
            print(template_text)
            print(str(e))
            # Skip the row: falling through to the yield would either raise
            # UnboundLocalError (first row) or emit the previous row's
            # document a second time.
            continue

        yield doc

In [6]:
# Load the template-text table from OUT_TEMPLATE_TEXTS_CSV.
templates_df = pd.read_csv(OUT_TEMPLATE_TEXTS_CSV)

In [7]:
# Keep only the rows that actually have a text value.
templates_df = templates_df[templates_df['text'].notna()]

In [8]:
# Keep the first occurrence of every (text, template image) pair.
templates_df = templates_df.drop_duplicates(subset=["text", "template_img_name"])

In [37]:
# Preview the first two rows of the filtered table.
templates_df.head(2)

Unnamed: 0,url,text,alt,views,upvotes,comments,meme_home_page,template_name,template_img_name
0,i.imgflip.com/61ov27.jpg,accepting that ur child made a good point; MOM...,Moms be like | accepting that ur child made a...,7208,106,7,/i/61ov27,Drake-Hotline-Bling,Drake-Hotline-Bling.jpg
1,i.imgflip.com/61soau.jpg,Ground children by sending them to bed early a...,Mexican meme | Ground children by sending the...,5271,65,5,/i/61soau,Drake-Hotline-Bling,Drake-Hotline-Bling.jpg


In [48]:
# Upper bound on the number of texts kept per template image.
k_to_take = 100

# One frame per template image: its rows ordered by descending view count and
# truncated to the first k_to_take entries.
groups = [
    group.sort_values(by=['views'], ascending=False).head(k_to_take)
    for _, group in templates_df.groupby(['template_img_name'])
]

In [49]:
# Stack the per-template selections into one frame with a fresh 0..n-1 index.
df = pd.concat(groups, ignore_index=True)
df.head(2)

Unnamed: 0,url,text,alt,views,upvotes,comments,meme_home_page,template_name,template_img_name
0,i.imgflip.com/25p6ij.jpg,"I WON $10,000,000 IN THE LOTTERY AND DECIDED T...","10 Guy | I WON $10,000,000 IN THE LOTTERY AND...",733523,305,107,/i/25p6ij,10-Guy,10-Guy.jpg
1,i.imgflip.com/kbovc.jpg,I NEED TO REFILL MY CHILD REPELLENT,10 Guy | I NEED TO REFILL MY CHILD REPELLENT |...,684545,12,0,/i/kbovc,10-Guy,10-Guy.jpg


In [50]:
# Settings for the embedder; the device value is imported from loopa.config.
embedder_params = dict(device=CUDA_IF_AVAILABLE)

# Keyword arguments for SBERTEncoderCache (handed over via ``uses_with`` when
# the Flow is built).
cache_params = dict(
    embedder_params=embedder_params,
    db_name=MONGO_EMBEDDING_DB_NAME,
    collection_name=JINA_SBERT_EMBEDDING_TEMPLATE_TEXT_COLLECTION,
    embedding_field_name='emb',
    megabatch_size=8192,
)

In [51]:
# Single-executor Flow: SBERTEncoderCache configured with cache_params.
flow_text_encode = Flow().add(
    name="SBERT_encoder",
    uses=SBERTEncoderCache,
    uses_with=cache_params,
)
flow_text_encode

In [52]:
# Run the Flow as a context manager and stream the generated documents
# through it in requests of 10,000 documents each.
with flow_text_encode:
    flow_text_encode.post(
        on='',
        inputs=text_input_docs(df),
        request_size=10_000,
        show_progress=True,
    )

[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.48MB/s]


[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 10.1k/10.1k [00:00<00:00, 9.30MB/s]


[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 571/571 [00:00<00:00, 690kB/s]


[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 116/116 [00:00<00:00, 167kB/s]


[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 39.3k/39.3k [00:00<00:00, 23.8MB/s]


[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 349/349 [00:00<00:00, 390kB/s]


[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:   2%|▏         | 7.83M/438M [00:00<00:10, 41.0MB/s]

[32m⠹[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:   5%|▌         | 22.9M/438M [00:00<00:08, 48.3MB/s]

[32m⠼[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:   8%|▊         | 32.9M/438M [00:00<00:08, 49.4MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  11%|█         | 48.0M/438M [00:01<00:07, 50.0MB/s]

[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  13%|█▎        | 58.2M/438M [00:01<00:07, 50.4MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  17%|█▋        | 73.3M/438M [00:01<00:07, 50.3MB/s]

[32m⠼[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  19%|█▉        | 83.4M/438M [00:01<00:07, 50.3MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  21%|██▏       | 93.4M/438M [00:01<00:06, 50.0MB/s]

[32m⠇[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  25%|██▍       | 109M/438M [00:02<00:06, 50.2MB/s] 

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  27%|██▋       | 119M/438M [00:02<00:06, 49.8MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  31%|███       | 134M/438M [00:02<00:06, 49.6MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  33%|███▎      | 144M/438M [00:02<00:06, 48.8MB/s]

[32m⠇[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  36%|███▌      | 158M/438M [00:03<00:05, 49.2MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  38%|███▊      | 169M/438M [00:03<00:05, 49.8MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  42%|████▏     | 184M/438M [00:03<00:05, 49.8MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  44%|████▍     | 194M/438M [00:03<00:04, 49.3MB/s]

[32m⠇[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  48%|████▊     | 208M/438M [00:04<00:04, 49.5MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  50%|████▉     | 218M/438M [00:04<00:04, 49.6MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  53%|█████▎    | 233M/438M [00:04<00:04, 48.7MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  56%|█████▌    | 243M/438M [00:04<00:03, 49.2MB/s]

[32m⠇[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  59%|█████▉    | 258M/438M [00:05<00:03, 49.9MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  61%|██████▏   | 268M/438M [00:05<00:03, 49.2MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  65%|██████▍   | 283M/438M [00:05<00:03, 49.3MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  68%|██████▊   | 298M/438M [00:06<00:02, 50.0MB/s]

[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  70%|███████   | 309M/438M [00:06<00:02, 50.1MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  73%|███████▎  | 319M/438M [00:06<00:02, 50.1MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  76%|███████▌  | 334M/438M [00:06<00:02, 49.4MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  80%|███████▉  | 349M/438M [00:07<00:01, 49.7MB/s]

[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  82%|████████▏ | 359M/438M [00:07<00:01, 49.9MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  84%|████████▍ | 369M/438M [00:07<00:01, 50.1MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  88%|████████▊ | 384M/438M [00:07<00:01, 49.9MB/s]

[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  90%|████████▉ | 394M/438M [00:07<00:00, 49.8MB/s]

[32m⠇[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  93%|█████████▎| 409M/438M [00:08<00:00, 49.7MB/s]

[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  96%|█████████▌| 419M/438M [00:08<00:00, 49.3MB/s]

[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading:  98%|█████████▊| 429M/438M [00:08<00:00, 48.7MB/s]

[32m⠴[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 438M/438M [00:08<00:00, 49.3MB/s]


[32m⠏[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 63.0kB/s]


[32m⠸[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 239/239 [00:00<00:00, 320kB/s]


[32m⠦[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 466k/466k [00:00<00:00, 2.70MB/s]


[32m⠙[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 363/363 [00:00<00:00, 483kB/s]


[32m⠼[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 13.1k/13.1k [00:00<00:00, 11.9MB/s]


[32m⠧[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 1.66MB/s]


[32m⠹[0m 1/2 waiting [33mSBERT_encoder[0m to be ready...                                        

Downloading: 100%|██████████| 190/190 [00:00<00:00, 243kB/s]


           Flow@15[I]:[32m🎉 Flow is ready to use![0m                                             
	🔗 Protocol: 		[1mGRPC[0m
	🏠 Local access:	[4m[36m0.0.0.0:38587[0m
	🔒 Private network:	[4m[36m172.20.0.6:38587[0m
	🌐 Public address:	[4m[36m35.224.116.253:38587[0m[0m
[32m⠦[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:01[0m estimating... 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠇[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:31[0m estimating... 

 50%|█████     | 1/2 [00:29<00:29, 29.48s/it]

[32m⠏[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:38[0m estimating... 

100%|██████████| 2/2 [00:36<00:00, 18.34s/it]


[32m⠼[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:38[0m  0.0 step/s . 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠧[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:08[0m  0.0 step/s 

 50%|█████     | 1/2 [00:29<00:29, 29.62s/it]

[32m⠼[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:14[0m  0.0 step/s 

100%|██████████| 2/2 [00:35<00:00, 17.71s/it]


[32m⠏[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:14[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠇[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:40[0m  0.0 step/s 

 50%|█████     | 1/2 [00:25<00:25, 25.14s/it]

[32m⠋[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:45[0m  0.0 step/s 

100%|██████████| 2/2 [00:30<00:00, 15.19s/it]


[32m⠦[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:01:45[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠹[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:16[0m  0.0 step/s 

 50%|█████     | 1/2 [00:30<00:30, 30.91s/it]

[32m⠏[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:22[0m  0.0 step/s 

100%|██████████| 2/2 [00:36<00:00, 18.30s/it]


[32m⠧[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:23[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠧[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:56[0m  0.0 step/s 

 50%|█████     | 1/2 [00:33<00:33, 33.25s/it]

[32m⠏[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:05[0m  0.0 step/s 

100%|██████████| 2/2 [00:41<00:00, 20.80s/it]


[32m⠏[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:06[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠼[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:39[0m  0.0 step/s 

 50%|█████     | 1/2 [00:33<00:33, 33.86s/it]

[32m⠦[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:47[0m  0.0 step/s 

100%|██████████| 2/2 [00:41<00:00, 20.53s/it]


[32m⠇[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:48[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠴[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:04:17[0m  0.0 step/s 

 50%|█████     | 1/2 [00:29<00:29, 29.04s/it]

[32m⠇[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:04:23[0m  0.0 step/s 

100%|██████████| 2/2 [00:35<00:00, 17.71s/it]


[32m⠸[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:04:25[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠸[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:04:56[0m  0.0 step/s 

 50%|█████     | 1/2 [00:31<00:31, 31.37s/it]

[32m⠏[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:03[0m  0.0 step/s 

100%|██████████| 2/2 [00:38<00:00, 19.01s/it]


[32m⠴[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:04[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠋[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:31[0m  0.0 step/s 

 50%|█████     | 1/2 [00:26<00:26, 26.81s/it]

[32m⠸[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:41[0m  0.0 step/s 

100%|██████████| 2/2 [00:36<00:00, 18.12s/it]


[32m⠙[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:43[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠸[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:06:12[0m  0.0 step/s 

 50%|█████     | 1/2 [00:29<00:29, 29.48s/it]

[32m⠦[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:06:18[0m  0.0 step/s 

100%|██████████| 2/2 [00:35<00:00, 17.94s/it]


[32m⠦[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:06:20[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠹[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:06:52[0m  0.0 step/s 

 50%|█████     | 1/2 [00:31<00:31, 31.84s/it]

[32m⠙[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:06:59[0m  0.0 step/s 

100%|██████████| 2/2 [00:38<00:00, 19.42s/it]


[32m⠹[0m Working... [32m━━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:02[0m  0.0 step/s 

  0%|          | 0/1 [00:00<?, ?it/s]

[32m⠦[0m Working... [32m━━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:21[0m  0.0 step/s 

100%|██████████| 1/1 [00:19<00:00, 19.61s/it]


[32m⠧[0m       DONE [33m━━━━━━━━━━━━━[0m[33m╸[0m[2m[33m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:21[0m  0.0 step/s [K13 steps done in 7 minutes and 21 seconds
