In [4]:
import sys
sys.path.append('/app/')  # noqa

from jina import Document, DocumentArray, Flow, Executor, requests
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import argparse
from memlish.executors.cache import RealSBERTEncoderCache
from memlish.executors.index import FaissIndexer
import torch
import hashlib

def my_hash(s):
    """Return the MD5 digest of ``str(s)`` rendered as a base-10 integer string.

    The value is stringified and UTF-8 encoded before hashing, so any object
    with a ``str()`` representation is accepted.
    """
    digest_bytes = hashlib.md5(str(s).encode('utf-8')).digest()
    # Reading the 16 digest bytes big-endian gives the same integer as
    # parsing the hexadecimal digest in base 16.
    return str(int.from_bytes(digest_bytes, byteorder='big'))

In [5]:
# Use the 'spawn' start method for worker processes; force=True replaces any
# start method that was already selected in this kernel.
torch.multiprocessing.set_start_method(method='spawn', force=True)

In [10]:
# Data directory and the CSV of template texts that is read from it below.
OUT_DIR = Path("/data/imgflip/")
OUT_TEMPLATE_TEXTS_CSV = OUT_DIR.joinpath("template_texts.csv")

In [11]:
# Database name handed to the encoder cache as `db_name`.
MONGO_EMBEDDING_DB_NAME = "memlish_db"
# Collection name handed to the encoder cache as `collection_name`.
JINA_SBERT_EMBEDDING_TEMPLATE_TEXT_COLLECTION = (
    "01_sbert_all_mpnet_base_v2_imgflip_template_100k_embeddings"
)

In [12]:
def text_input_docs(df):
    """Yield one jina ``Document`` per usable row of ``df``.

    Each document carries the row's ``text`` value as its text and the id
    ``"<template_img_name>_<my_hash(text)>"``.

    Rows for which the document cannot be built are reported on stdout and
    skipped, so a bad row never aborts the stream and never causes a
    document from an earlier row to be emitted a second time.

    Args:
        df: DataFrame with ``template_img_name`` and ``text`` columns.

    Yields:
        Document: one document per row that was built successfully.
    """
    for _, row in df.iterrows():
        template_name = row["template_img_name"]
        template_text = row["text"]
        try:
            document_id = f"{template_name}_{my_hash(template_text)}"
            doc = Document(id=document_id, text=template_text)
        except Exception as e:
            # Best-effort ingestion: log enough to find the offending row.
            # Only names bound before the `try` are printed, so the handler
            # itself cannot fail with an unbound variable.
            print(f"Skipping row for template {template_name!r}")
            print(template_text)
            print(str(e))
            continue

        yield doc

In [13]:
# Load the template-text table from the CSV path configured above.
templates_df = pd.read_csv(filepath_or_buffer=OUT_TEMPLATE_TEXTS_CSV)

In [14]:
# Keep only the rows that actually have a `text` value.
has_text = templates_df['text'].notna()
templates_df = templates_df[has_text]

In [15]:
# Drop repeated (text, template image) pairs, keeping the first occurrence.
templates_df = templates_df.drop_duplicates(subset=["text", "template_img_name"])

In [16]:
# Preview the first two rows of the cleaned table.
templates_df.head(n=2)

Unnamed: 0,url,text,alt,views,upvotes,comments,meme_home_page,template_name,template_img_name
0,i.imgflip.com/61ov27.jpg,accepting that ur child made a good point; MOM...,Moms be like | accepting that ur child made a...,7208,106,7,/i/61ov27,Drake-Hotline-Bling,Drake-Hotline-Bling.jpg
1,i.imgflip.com/61soau.jpg,Ground children by sending them to bed early a...,Mexican meme | Ground children by sending the...,5271,65,5,/i/61soau,Drake-Hotline-Bling,Drake-Hotline-Bling.jpg


In [17]:
# Number of rows to keep per template image.
k_to_take = 100

# For every template image, keep the `k_to_take` rows with the most views,
# ordered from most to least viewed.
groups = [
    template_rows.sort_values(by='views', ascending=False).head(k_to_take)
    for _, template_rows in templates_df.groupby('template_img_name')
]

In [18]:
# Stack the per-template selections into one frame with a fresh 0..n-1 index.
df = pd.concat(groups, ignore_index=True)
df.head(n=2)

Unnamed: 0,url,text,alt,views,upvotes,comments,meme_home_page,template_name,template_img_name
0,i.imgflip.com/25p6ij.jpg,"I WON $10,000,000 IN THE LOTTERY AND DECIDED T...","10 Guy | I WON $10,000,000 IN THE LOTTERY AND...",733523,305,107,/i/25p6ij,10-Guy,10-Guy.jpg
1,i.imgflip.com/kbovc.jpg,I NEED TO REFILL MY CHILD REPELLENT,10 Guy | I NEED TO REFILL MY CHILD REPELLENT |...,684545,12,0,/i/kbovc,10-Guy,10-Guy.jpg


In [19]:
# Settings forwarded to the embedder inside the cache executor.
embedder_params = dict(device='cpu')

# Constructor arguments for RealSBERTEncoderCache (passed through `uses_with`).
cache_params = dict(
    embedder_params=embedder_params,
    db_name=MONGO_EMBEDDING_DB_NAME,
    collection_name=JINA_SBERT_EMBEDDING_TEMPLATE_TEXT_COLLECTION,
    embedding_field_name='emb',
    megabatch_size=8192,
)

In [21]:
# Single-executor flow: the SBERT encoder cache configured with `cache_params`.
flow_text_encode = Flow().add(
    name="SBERT_encoder",
    uses=RealSBERTEncoderCache,
    uses_with=cache_params,
)
flow_text_encode

In [23]:
# Start the flow and stream the generated documents through it; the flow is
# shut down when the `with` block exits.
with flow_text_encode:
    flow_text_encode.post(
        on='',
        inputs=text_input_docs(df),
        request_size=10_000,
        show_progress=True,
    )

           Flow@18[I]:[32m🎉 Flow is ready to use![0m                                             
	🔗 Protocol: 		[1mGRPC[0m
	🏠 Local access:	[4m[36m0.0.0.0:57901[0m
	🔒 Private network:	[4m[36m172.18.0.2:57901[0m
	🌐 Public address:	[4m[36m63.35.187.56:57901[0m[0m
[32m⠧[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:01[0m estimating... 

  0%|          | 0/1 [00:00<?, ?it/s]

[32m⠸[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:30[0m estimating... {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 15:58:44.382928', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 29.276824474334717}
[32m⠦[0m Working... [32m━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:30[0m estimating... 

100%|██████████| 1/1 [00:29<00:00, 29.35s/it]


[32m⠇[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:00:31[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠸[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:05[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:00:19.406408', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 94.70445322990417}
[32m⠇[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:06[0m  0.0 step/s 

 50%|█████     | 1/2 [01:35<01:35, 95.03s/it]

[32m⠼[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:22[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:00:36.659554', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 16.92940068244934}
[32m⠦[0m Working... [32m━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:23[0m  0.0 step/s 

100%|██████████| 2/2 [01:52<00:00, 56.02s/it]


[32m⠋[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:02:23[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠇[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:36[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:01:49.970406', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 72.84306597709656}
[32m⠼[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:36[0m  0.0 step/s 

 50%|█████     | 1/2 [01:13<01:13, 73.17s/it]

[32m⠏[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:50[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:02:04.137680', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 13.846692085266113}
[32m⠙[0m Working... [32m━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:50[0m  0.0 step/s 

100%|██████████| 2/2 [01:27<00:00, 43.54s/it]


[32m⠦[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:03:51[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠙[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:16[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:03:30.345038', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 85.61653590202332}
[32m⠦[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:16[0m  0.0 step/s 

 50%|█████     | 1/2 [01:25<01:25, 86.00s/it]

[32m⠸[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:31[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:03:45.721874', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 14.99766492843628}
[32m⠦[0m Working... [32m━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:32[0m  0.0 step/s 

100%|██████████| 2/2 [01:41<00:00, 50.54s/it]


[32m⠹[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:05:32[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠦[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:07[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:05:21.085168', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 94.64491438865662}
[32m⠙[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:07[0m  0.0 step/s 

 50%|█████     | 1/2 [01:34<01:34, 94.97s/it]

[32m⠼[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:31[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:05:45.156703', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 23.745384693145752}
[32m⠧[0m Working... [32m━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:31[0m  0.0 step/s 

100%|██████████| 2/2 [01:58<00:00, 59.40s/it]


[32m⠼[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:07:32[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠇[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:09:02[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:07:16.569986', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 90.54138851165771}
[32m⠼[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:09:03[0m  0.0 step/s 

 50%|█████     | 1/2 [01:30<01:30, 90.94s/it]

[32m⠋[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:09:22[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:07:35.968313', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 19.002976655960083}
[32m⠸[0m Working... [32m━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:09:22[0m  0.0 step/s 

100%|██████████| 2/2 [01:50<00:00, 55.01s/it]


[32m⠙[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:09:23[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠧[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:10:35[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:08:49.531944', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 72.549640417099}
[32m⠹[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:10:36[0m  0.0 step/s 

 50%|█████     | 1/2 [01:12<01:12, 72.87s/it]

[32m⠇[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:10:51[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:09:05.782152', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 15.935953617095947}
[32m⠙[0m Working... [32m━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:10:52[0m  0.0 step/s 

100%|██████████| 2/2 [01:28<00:00, 44.44s/it]


[32m⠋[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:10:53[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠙[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:12:19[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:10:33.089324', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 86.2028796672821}
[32m⠦[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:12:19[0m  0.0 step/s 

 50%|█████     | 1/2 [01:26<01:26, 86.53s/it]

[32m⠙[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:12:36[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:10:50.265194', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 16.846456289291382}
[32m⠼[0m Working... [32m━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:12:36[0m  0.0 step/s 

100%|██████████| 2/2 [01:43<00:00, 51.73s/it]


[32m⠴[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:12:37[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠴[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:13:43[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:11:57.404657', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 65.8918297290802}
[32m⠙[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:13:44[0m  0.0 step/s 

 50%|█████     | 1/2 [01:06<01:06, 66.28s/it]

[32m⠧[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:14:14[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:12:27.943025', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 30.152880430221558}
[32m⠋[0m Working... [32m━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:14:14[0m  0.0 step/s 

100%|██████████| 2/2 [01:36<00:00, 48.25s/it]


[32m⠹[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:14:15[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠦[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:15:31[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:13:45.736578', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 76.37919330596924}
[32m⠙[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:15:32[0m  0.0 step/s 

 50%|█████     | 1/2 [01:16<01:16, 76.71s/it]

[32m⠋[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:15:48[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:14:02.275316', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 16.214709758758545}
[32m⠹[0m Working... [32m━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:15:48[0m  0.0 step/s 

100%|██████████| 2/2 [01:32<00:00, 46.50s/it]


[32m⠦[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:15:50[0m  0.0 step/s 

  0%|          | 0/2 [00:00<?, ?it/s]

[32m⠼[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:17:17[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:15:31.638523', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 87.86424899101257}
[32m⠏[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:17:18[0m  0.0 step/s 

 50%|█████     | 1/2 [01:28<01:28, 88.18s/it]

[32m⠧[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:17:36[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:15:50.117226', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 18.163864374160767}
[32m⠏[0m Working... [32m━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:17:36[0m  0.0 step/s 

100%|██████████| 2/2 [01:46<00:00, 53.21s/it]


[32m⠸[0m Working... [32m━━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:17:37[0m  0.0 step/s 

  0%|          | 0/1 [00:00<?, ?it/s]

[32m⠋[0m Working... [32m━━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:18:37[0m  0.0 step/s {'es_tag': 'TIMELOG', 'timestamp': '2022-02-12 16:16:51.220717', 'function': 'memlish.executors.bert.get_embeddings', 'duration': 59.51326775550842}
[32m⠼[0m Working... [32m━━━━━━━━━━━━[0m[32m╸[0m[2m[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:18:37[0m  0.0 step/s 

100%|██████████| 1/1 [00:59<00:00, 59.70s/it]


[32m⠧[0m       DONE [33m━━━━━━━━━━━━━[0m[33m╸[0m[2m[33m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [36m0:18:38[0m  0.0 step/s [K13 steps done in 18 minutes and 37 seconds
