Note: the images were cleaned via LabelStudio annotation.

In [1]:
import sys
# Add /app to the module search path.
# NOTE(review): presumably this is where the project package (`memlish`) lives — confirm.
sys.path.append('/app')

In [2]:
import pandas as pd
from pathlib import Path
import shutil
import memlish
from memlish.io.image import load_image

In [4]:
# Dataset root and the two image directories this notebook works with.
IMGFLIP_DIR = Path('/data/imgflip/')
# Directory the approved images are copied from.
IMAGE_DIR = IMGFLIP_DIR.joinpath('scrap_language_image_pairs_20220209/images')
# Directory the approved images are copied to.
V1_IMAGE_DIR = IMGFLIP_DIR.joinpath('v1/images')

# Create the destination directory together with any missing parents;
# nothing happens when it already exists.
V1_IMAGE_DIR.mkdir(exist_ok=True, parents=True)

## Prepare v1/images from the images cleaned in LabelStudio

In [5]:
# Read the annotation export CSV (hosted on S3) into a DataFrame.
export_df = pd.read_csv('https://s3.eu-west-1.amazonaws.com/memlish.head.public/LabelStudio/memlish/v1_1800_cleaned_templates.csv')

In [6]:
# Select the rows whose 'choice' column is empty and work on an independent copy.
# NOTE(review): presumably a filled-in 'choice' marks a template the annotator
# rejected, so an empty value means "approved" — confirm against the labeling config.
approved_mask = pd.isna(export_df['choice'])
approved_templates_ds = export_df.loc[approved_mask].copy()

# Preview the first two approved rows.
approved_templates_ds.head(2)

Unnamed: 0,image,id,choice,annotator,annotation_id,created_at,updated_at,lead_time
3,/data/upload/1/c28c65bf-Zorg.jpg,1843,,1,1843,2022-02-19T22:24:02.204753Z,2022-02-19T22:24:02.204777Z,8.966
4,/data/upload/1/0a80358e-Zombie-Overly-Attached...,1842,,1,1842,2022-02-19T22:23:52.698633Z,2022-02-19T22:23:52.698654Z,1.014


In [7]:
# (rows, columns) of the approved subset.
approved_templates_ds.shape

(1184, 8)

In [8]:
# LabelStudio prepends a unique id and a hyphen to every uploaded file name
# (e.g. 'c28c65bf-Zorg.jpg'); drop that prefix to recover the original name.
# Only the basename is inspected, so a hyphen anywhere in the upload directory
# cannot corrupt the result. Hyphens inside the original name are preserved
# because only the first one is used as the separator.
labeled_images = (
    approved_templates_ds['image']
    .apply(lambda image_path: Path(image_path).name.partition('-')[2])
    .values
)

In [9]:
# Resolve every approved file name to its full path inside IMAGE_DIR.
approved_images = [IMAGE_DIR.joinpath(file_name) for file_name in labeled_images]

In [10]:
# Spot-check the first two resolved paths.
approved_images[:2]

[PosixPath('/data/imgflip/scrap_language_image_pairs_20220209/images/Zorg.jpg'),
 PosixPath('/data/imgflip/scrap_language_image_pairs_20220209/images/Zombie-Overly-Attached-Girlfriend.jpg')]

In [11]:
# Copy every approved image into the v1 image directory, keeping its file name.
for src_path in approved_images:
    shutil.copyfile(src_path, V1_IMAGE_DIR / src_path.name)

# Count the files now present in the destination.
# `iterdir()` is the standard pathlib API; `Path.ls()` only exists when a
# third-party package has monkey-patched pathlib, which is a hidden dependency.
len(list(V1_IMAGE_DIR.iterdir()))

1184

## Merge approved images with texts csv

In [12]:
# Load the template texts CSV that sits next to the source images.
texts_df = pd.read_csv(IMGFLIP_DIR / 'scrap_language_image_pairs_20220209/template_texts.csv')

In [13]:
# File names currently present in the v1 image directory.
# Uses the standard `Path.iterdir()` rather than the non-pathlib `Path.ls()`,
# which is only available when a third-party package patches it in.
image_names = [image_path.name for image_path in V1_IMAGE_DIR.iterdir()]

In [14]:
# Boolean mask of the text rows whose template image is one of the approved images.
# `isin` is vectorized, so it avoids scanning the whole `image_names` list in
# Python once per row, as the row-wise `apply(lambda x: x in image_names)` did.
approved_text_mask = texts_df['template_img_name'].isin(image_names)

In [15]:
# Keep only the texts of approved templates; copy so later column assignments
# do not touch `texts_df`.
df = texts_df[approved_text_mask].copy()

In [62]:
# Preview the filtered texts.
df.head()

Unnamed: 0,url,text,alt,views,upvotes,comments,meme_home_page,template_name,template_img_name
59264,i.imgflip.com/61wlf5.jpg,ME; THE MOBILE GAME I BOUGHT 2 MINUTES AGO; TH...,Is this not true? | ME; THE MOBILE GAME I BOU...,704,6,0,/i/61wlf5,Distracted-Boyfriend,Distracted-Boyfriend.jpg
59265,i.imgflip.com/61zv6o.jpg,NEWS; GOOD THINGS IN THE WORLD; BAD THINGS IN ...,Distracted Boyfriend | NEWS; GOOD THINGS IN T...,945,6,0,/i/61zv6o,Distracted-Boyfriend,Distracted-Boyfriend.jpg
59266,i.imgflip.com/6256vy.jpg,ALSO ME; ALSO ME; ME,Distracted Boyfriend | ALSO ME; ALSO ME; ME |...,44,3,3,/i/6256vy,Distracted-Boyfriend,Distracted-Boyfriend.jpg
59267,i.imgflip.com/61z0ba.jpg,STUDY; ME; MOBILE,Distracted Boyfriend | STUDY; ME; MOBILE | im...,443,3,0,/i/61z0ba,Distracted-Boyfriend,Distracted-Boyfriend.jpg
59268,i.imgflip.com/624obq.jpg,Fortnite; Fortnite youtubers when the game is ...,Distracted Boyfriend | Fortnite; Fortnite you...,110,2,0,/i/624obq,Distracted-Boyfriend,Distracted-Boyfriend.jpg


## Deduplication

In [17]:
# install bo4ka
!pip install git+https://<token>@github.com/memlish/bo4ka.git --no-cache-dir

Collecting git+https://****@github.com/memlish/bo4ka.git
  Cloning https://****@github.com/memlish/bo4ka.git to /tmp/pip-req-build-3l20gn07
  Running command git clone -q 'https://****@github.com/memlish/bo4ka.git' /tmp/pip-req-build-3l20gn07
Collecting opencv-python
  Downloading opencv_python-4.5.5.62-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (60.4 MB)
[K     |████████████████████████████████| 60.4 MB 4.9 MB/s eta 0:00:01
Collecting matplotlib==3.5.0
  Downloading matplotlib-3.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 28.0 MB/s eta 0:00:01
[?25hCollecting seaborn==0.11.2
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
[K     |████████████████████████████████| 292 kB 73.2 MB/s eta 0:00:01
Collecting voila==0.2.16
  Downloading voila-0.2.16-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 88.3 MB/s eta 0:00:01
[?25hCollecting sklearn
  Downloading sklearn-0.

In [93]:
import bo4ka
from bo4ka.embeddings.generators import WangHelsingEmbeddingGenerator, WangHelsingConfig
from bo4ka.matching.pipelines.p00_wanghelsing_loftr import Pipeline_00_WangHelsing_LoFTR
from bo4ka.nn.loftr import LoFTR

from bo4ka.deduplication.imagededup_utils import create_cluster_assignment_map, get_cluster2nodes

In [19]:
# Embedding generator settings: batches of 16, data loading in the main
# process (no workers), inference on the GPU.
cfg = WangHelsingConfig(
    batch_size=16,
    num_workers=0,
    device='cuda',
)

# Embedding generator rooted at the v1 image directory.
wang_helsing = WangHelsingEmbeddingGenerator(
    V1_IMAGE_DIR,
    cfg=cfg,
)

Dowloading /bo4ka_home/wang_helsing_descriptor_baseline.pth.tar since not in cache..
Done


In [20]:
# File names (not full paths) of every image in the v1 directory.
# Uses the standard `Path.iterdir()` rather than the non-pathlib `Path.ls()`,
# which is only available when a third-party package patches it in.
image_files = [image_path.name for image_path in V1_IMAGE_DIR.iterdir()]

In [21]:
# Compute embeddings for the listed images; the call returns the file names
# together with the embeddings.
# NOTE(review): presumably `embs_filenames[k]` corresponds to `embs[k]` — confirm in bo4ka.
embs_filenames, embs = wang_helsing.get_embeddings(image_files)

Calculate descriptors:   0%|          | 0/2 [00:00<?, ?it/s]

imsize=768:   0%|          | 0/74 [00:00<?, ?it/s]

imsize=128:   0%|          | 0/74 [00:00<?, ?it/s]

In [22]:
# Turn the bare file names returned with the embeddings into paths under V1_IMAGE_DIR.
embs_filenames = [V1_IMAGE_DIR.joinpath(file_name) for file_name in embs_filenames]

# LoFTR matcher running on the GPU.
matcher = LoFTR(device='cuda')

# Matching pipeline that uses the v1 images (and their precomputed embeddings)
# as the template set.
pipeline = Pipeline_00_WangHelsing_LoFTR(
    template_files=embs_filenames,
    template_desc=embs,
    embedding_generator=wang_helsing,
    loftr=matcher,
)

Downloading: "http://cmp.felk.cvut.cz/~mishkdmy/models/loftr_outdoor.ckpt" to /root/.cache/torch/hub/checkpoints/loftr_outdoor.ckpt


  0%|          | 0.00/44.2M [00:00<?, ?B/s]

In [23]:
# Query the pipeline with the template images themselves, i.e. match the set against itself.
# NOTE(review): presumably top_k=50 limits the candidates examined per query — confirm in bo4ka.
matches = pipeline.process_queries(embs_filenames, top_k=50)

Calculate descriptors:   0%|          | 0/2 [00:00<?, ?it/s]

imsize=768:   0%|          | 0/74 [00:00<?, ?it/s]

imsize=128:   0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/1850 [00:00<?, ?it/s]

In [108]:
# Tabulate the match results; later cells read the columns `is_exact_match`,
# `query_i`, `query_file` and `template_file`.
matches_df = pd.DataFrame(matches)

In [109]:
# Map every query image (by file name) to the file names of the templates it
# matches exactly. One entry is produced per `query_i` group; the query file
# name is taken from the first row of the group.
exact_matches = matches_df[matches_df.is_exact_match]
dublicates = {
    Path(group["query_file"].values[0]).name: [
        Path(template_path).name for template_path in group['template_file'].tolist()
    ]
    for _, group in exact_matches.groupby('query_i')
}

In [110]:
# Build the image-name -> cluster lookup from the exact-match mapping.
# NOTE(review): presumably images connected through exact matches share one cluster id — confirm in bo4ka.
node2cluster = create_cluster_assignment_map(dublicates)

In [111]:
# Attach the cluster of each row's template image. The lookup is an indexing
# operation, so a template name missing from `node2cluster` raises instead of
# silently producing a missing value (assuming dict-like semantics — TODO confirm).
df['cluster'] = df['template_img_name'].map(lambda x: node2cluster[x])

In [122]:
# One DataFrame per cluster, with every row relabelled to the cluster's
# canonical template (the template of the first row in the cluster).
deduplicated_dfs = []

# Columns carried over to the deduplicated dataset.
cols2take = ['text', 'alt', 'views', 'upvotes', 'comments', 'template_name', 'template_img_name']

for _, cluster_rows in df.groupby('cluster'):
    canonical_row = cluster_rows.iloc[0]
    # `.assign` builds a new frame instead of writing columns into the group
    # slice yielded by groupby, which avoids chained-assignment
    # (SettingWithCopyWarning) behaviour. Existing columns keep their position.
    deduplicated_dfs.append(
        cluster_rows[cols2take].assign(
            template_name=canonical_row['template_name'],
            template_img_name=canonical_row['template_img_name'],
        )
    )

In [123]:
# Stack the per-cluster frames and replace the inherited index with a fresh 0..n-1 range.
deduplicated_df = pd.concat(deduplicated_dfs).reset_index(drop=True)

In [126]:
# Display the deduplicated dataset (pandas truncates the rendering of large frames).
deduplicated_df

Unnamed: 0,text,alt,views,upvotes,comments,template_name,template_img_name
0,WHERE IS THE CHOCKY MILK; THAT I SAVED LAST NI...,Chainsaw Bear | WHERE IS THE CHOCKY MILK; THA...,72,3,2,Chainsaw-Bear,Chainsaw-Bear.jpg
1,POV: YOU SAID POST OR VOID INSTEAD OF POINT OF...,Run | POV: YOU SAID POST OR VOID INSTEAD OF P...,69,2,0,Chainsaw-Bear,Chainsaw-Bear.jpg
2,PERSON: CLIMBS A TREE TO GET AWAY; BEAR:,Chainsaw Bear | PERSON: CLIMBS A TREE TO GET ...,22461,381,35,Chainsaw-Bear,Chainsaw-Bear.jpg
3,2021; READY AND WAITING,2021 | 2021; READY AND WAITING | image tagged...,6948,164,4,Chainsaw-Bear,Chainsaw-Bear.jpg
4,ELECTRICAL BE LIKE:,true | ELECTRICAL BE LIKE: | image tagged in ...,8029,197,2,Chainsaw-Bear,Chainsaw-Bear.jpg
...,...,...,...,...,...,...,...
972337,DANICA; DAWSON; TYLER,Distracted girlfriend | DANICA; DAWSON; TYLER...,287,0,0,Distracted-girlfriend,1yicsm.jpg
972338,LAS VEGAS GOLDEN KNIGHTS; STANLEY \nCUP; LIGHT...,Distracted girlfriend | LAS VEGAS GOLDEN KNIG...,812,0,0,Distracted-girlfriend,1yicsm.jpg
972339,Any Other Excuse To Get Off Facetime; Gotta Go...,Yeetus Feetus I Wanna Commit Self Deletus | A...,383,0,0,Distracted-girlfriend,1yicsm.jpg
972340,FAITHLESS SINFUL HUMANS IN \nHIS CHURCH; MODER...,Distracted girlfriend | FAITHLESS SINFUL HUMA...,257,0,0,Distracted-girlfriend,1yicsm.jpg


In [129]:
# Persist the deduplicated texts next to the v1 images.
# NOTE(review): with the default arguments the row index is written as an
# unnamed first column (it shows up as 'Unnamed: 0' when read back); consider
# `index=False` if no downstream reader relies on that column.
deduplicated_df.to_csv(IMGFLIP_DIR/'v1/template_texts.csv')

## Delete duplicate images

In [140]:
# File names of the templates that survived deduplication.
deduplicated_images = deduplicated_df['template_img_name'].unique()

# A set gives exact, constant-time membership tests (the numpy array would be
# compared element-wise for every file).
names_to_keep = set(deduplicated_images)

# Safety net for a destructive step: with an empty keep-list every image in
# the directory would be removed.
if not names_to_keep:
    raise ValueError('deduplicated_df lists no images; refusing to delete every file in V1_IMAGE_DIR')

# Everything in the v1 directory that is not a canonical template is a duplicate.
# `iterdir()` is the standard pathlib API; `Path.ls()` only exists when a
# third-party package has monkey-patched pathlib.
imgs_to_delete = [
    image_path for image_path in V1_IMAGE_DIR.iterdir()
    if image_path.name not in names_to_keep
]

for image_path in imgs_to_delete:
    image_path.unlink()