# Flood Fill Indonesia Marketplace Kitchenware with Images

### Preparation

In [212]:
!pip install jellyfish



In [213]:
import jellyfish
import pandas as pd
from difflib import SequenceMatcher
import math

In [214]:
# Load the marketplace listings, then show how many rows have no image.
df = pd.read_csv('indonesia_marketplace_kitchenware.csv')
df['image'].isna().sum()

19065

In [215]:
# Turn the table into a list with one dict per row, keyed by column name.
kitchenwares = df.to_dict('records')

### Flood-fill

The objective here is to give every product that has no image an image borrowed from a product that does have one.

We do this by separating the products with an image from the products without one. For each product without an image, we compute how similar its title is to the title of every product with an image in the same category. The product with the most similar title is taken as the reference, and its image is copied to the product without an image.

We do this for every product without an image, hence the name: flood fill.

In [216]:
# separate data
def split_by_image(records):
  """Partition product records by whether their 'image' field is present.

  Each record is shallow-copied before the working 'similarity' key is added,
  so the input dicts are left untouched and re-running this cell always
  produces the same split.

  Args:
    records: iterable of dicts that each have an 'image' key.

  Returns:
    A (with_image, without_image) tuple of lists of new dicts. Records that
    have an image carry similarity 1.0; records without one carry 0.0.
  """
  with_image = []
  without_image = []
  for record in records:
    if pd.isna(record['image']):
      # 0.0 is the baseline score that a candidate image has to beat.
      without_image.append({**record, 'similarity': 0.0})
    else:
      with_image.append({**record, 'similarity': 1.0})
  return with_image, without_image


kitchenwares_with_image, kitchenwares_without_image = split_by_image(kitchenwares)

print(len(kitchenwares_with_image))
print(len(kitchenwares_without_image))

560
19065


In [217]:
# flood fill
# Group the products that already have an image by category once, so each
# image-less product is compared only against candidates from its own category
# instead of re-scanning the whole list every time.
candidates_by_category = {}
for kitchenware_with_image in kitchenwares_with_image:
  category = kitchenware_with_image['category']
  # Records with a missing (NaN) category are never matched, because NaN does
  # not compare equal to any category; jellyfish only accepts str titles, so a
  # record with a non-text title cannot serve as a candidate either.
  if pd.isna(category) or not isinstance(kitchenware_with_image['title'], str):
    continue
  candidates_by_category.setdefault(category, []).append(kitchenware_with_image)

for kitchenware_without_image in kitchenwares_without_image:
  category = kitchenware_without_image['category']
  title = kitchenware_without_image['title']
  # Nothing to compare when the category or title is missing; the record keeps
  # its empty image and its current similarity.
  if pd.isna(category) or not isinstance(title, str):
    continue
  for kitchenware_with_image in candidates_by_category.get(category, []):
    similarity_score = jellyfish.jaro_similarity(title, kitchenware_with_image['title'])
    # Strict '>' keeps the first candidate among equal scores and never accepts
    # a candidate whose score does not beat the current similarity.
    if similarity_score > kitchenware_without_image['similarity']:
      # Updated in place: the combine step reads these same dicts.
      kitchenware_without_image['image'] = kitchenware_with_image['image']
      kitchenware_without_image['similarity'] = similarity_score

In [218]:
# Merge both groups into one list: the products that originally lacked an
# image come first, followed by the products that already had one.
kitchenwares_combined = [*kitchenwares_without_image, *kitchenwares_with_image]

In [222]:
# Build a table from the merged records, discard the helper 'similarity'
# column, then show how many rows are still missing an image.
new_df = pd.DataFrame.from_records(kitchenwares_combined).drop(columns=['similarity'])
new_df['image'].isna().sum()

0

In [224]:
# Write the resulting table to disk without the DataFrame index column.
output_csv = 'indonesia_marketplace_kitchenware_flood_fill.csv'
new_df.to_csv(output_csv, index=False)