In [None]:
from download import *

import pandas as pd



In [None]:
# Load the two post tables used throughout the notebook: the e621 database
# export and the post list of the existing training set.
e6_dump_data = pd.read_csv("posts-2023-04-08.csv")
training_data = pd.read_csv("posts.csv")

In [None]:
# Keep only the export rows whose md5 also appears in the training data
# (a filter on md5 membership; no columns from training_data are added).
subset_e6_dump_data = e6_dump_data[e6_dump_data["md5"].isin(training_data["md5"])]

In [None]:
# Static e621 file URLs follow the layout
#   https://static1.e621.net/data/<md5[0:2]>/<md5[2:4]>/<md5>.<file_ext>
# The placeholder values below only demonstrate that layout.
md5 = "1234567890"
file_ext = "jpg"
f'https://static1.e621.net/data/{md5[:2]}/{md5[2:4]}/{md5}.{file_ext}'

In [None]:
# Show which columns the filtered export provides.
subset_e6_dump_data.columns

In [None]:
# test retrieving image 
import os


# Fixed-seed sample of 10 posts, used as a small test set for download().
dataset = subset_e6_dump_data.sample(n=10, random_state=1)

def download(dataset:pd.DataFrame, save_path:str, max_attempts:int = 10) -> None:
    """Download, downscale and store the image of every post in ``dataset``.

    For each row the static e621 URL is built from the ``md5`` and
    ``file_ext`` columns. The image is fetched with ``stream_image``, passed
    through ``rescale_image(image, 1024)`` and handed to
    ``save_webp_without_alpha`` with the target ``<save_path>/<md5>.webp`` and
    ``quality=70``. Those three helpers are not defined in this notebook;
    presumably they come from the ``download`` star-import -- confirm.

    Args:
        dataset: frame providing at least the ``md5`` and ``file_ext`` columns.
        save_path: directory the ``.webp`` files are written into.
        max_attempts: number of tries per row before the row is skipped.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        None. A row that still fails after ``max_attempts`` tries is reported
        on stdout and skipped.

    Raises:
        KeyboardInterrupt, SystemExit: propagate immediately, so a running
            download can be stopped. Only ``Exception`` subclasses are retried.
    """
    # Loop-invariant, so it is built once rather than on every attempt.
    user_agent_message = (
        "heya! this is lodestone from furry diffusion server, "
        "i need to rebuild my dataset for training, "
        "please let me know if this bot is pulling data to fast "
        "PS: OOPS sorry! too fast"
    )

    for index, sample in dataset.iterrows():

        print(index)

        for attempt in range(max_attempts):
            try:
                # generate url (kept inside the try so a malformed row is
                # skipped like any other failure instead of aborting the run)
                file_ext = sample["file_ext"]
                md5 = sample["md5"]
                url = f'https://static1.e621.net/data/{md5[0:2]}/{md5[2:4]}/{md5}.{file_ext}'

                # download image as PIL object
                image = stream_image(url, user_agent=user_agent_message, threshold_size = 0)

                image = rescale_image(image, 1024)

                save_webp_without_alpha(
                    image,
                    os.path.join(save_path, f"{md5}.webp"),
                    quality=70
                )
                break
            except Exception as error:
                # Report the cause; a bare `except:` would also trap Ctrl-C.
                print(f"failed downloading {index} ... retrying {attempt}/{max_attempts} ({error!r})")
        else:
            # Reached only when no attempt hit `break`, i.e. every try failed.
            print(f"download attempt exceeded skipping {index}")

# download(dataset, "/home/user/project-fur/e6_dump/1024")

In [None]:
import os
# List the entries already present in the download directory and show how many there are.
list_image = os.listdir("/home/user/project-fur/e6_dump/1024")
len(list_image)

In [None]:
# Strip the ".webp" extension so every entry is the bare md5 of a file on disk.
# regex=False makes the "." literal; under regex matching (the default before
# pandas 2.0) it would match any character.
# dtype="object" keeps the .str accessor usable when the directory is empty.
list_image_pd = pd.Series(list_image, dtype="object").str.replace(".webp", "", regex=False)

In [None]:
# Preview only the first few md5 values; echoing the whole list would flood the cell output.
list_image_pd.head(10).tolist()

In [None]:
# Drop the posts whose md5 already has a file on disk, leaving only the ones still to fetch.
already_downloaded = subset_e6_dump_data["md5"].isin(list_image_pd)
subset_e6_dump_data = subset_e6_dump_data[~already_downloaded]

In [None]:
# Inspect the posts that remain after removing the ones already on disk.
subset_e6_dump_data

In [None]:
from threading import Thread

def multithread_download(df:pd.DataFrame, save_path:str, number_of_workers:int = 10) -> None:
    """Run ``download`` over ``df`` using one thread per chunk.

    ``split_dataframe(df, number_of_workers)`` supplies the chunks; it is not
    defined in this notebook (presumably it comes from the ``download``
    star-import and yields DataFrame pieces -- confirm). Every chunk gets its
    own thread running ``download(chunk, save_path)``. All threads are started
    before any is joined, and the call returns once every thread has finished.

    Args:
        df: posts to download.
        save_path: directory forwarded unchanged to ``download``.
        number_of_workers: chunk count requested from ``split_dataframe``.
    """
    chunks = split_dataframe(df, number_of_workers)

    # one worker thread per chunk
    workers = [Thread(target=download, args=[chunk, save_path]) for chunk in chunks]

    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()
        


In [None]:
import multiprocessing

number_of_workers = 10
save_path = "/home/user/project-fur/e6_dump/1024"
split_df = split_dataframe(subset_e6_dump_data, number_of_workers)

# One argument tuple per worker process: (chunk, save_path, thread count).
args = [(df, save_path, 10) for df in split_df]

# Each process runs multithread_download on its own chunk.
with multiprocessing.Pool(processes=number_of_workers) as pool:
    results = pool.starmap(multithread_download, args)
    print(results)


In [None]:
# Constant: the directory later passed to download() as its save_path.
save_path = "/home/user/project-fur/e6_dump/1024"

In [None]:
from PIL import Image
import PIL
def check_error(filename:str) -> list:
    """Report whether ``filename`` can be opened and processed as an image.

    The file is opened twice: once to call ``verify()`` and once more to call
    ``transpose``. Pillow's documentation says an image must be reopened after
    ``verify()``, hence the second open. Both handles are managed by ``with``
    so they are closed even when a check raises.

    Args:
        filename: path of the image file to check.

    Returns:
        ``[]`` when both checks pass, ``[filename]`` when either raises. A
        list is returned so results from many files can be flattened.
    """
    try:
        with Image.open(filename) as im:
            im.verify()
        with Image.open(filename) as im:
            im.transpose(PIL.Image.FLIP_LEFT_RIGHT)
    except Exception as e:
        print(f"image error {filename}: {e}")
        return [filename]
    return []

In [None]:
import os
# Build the full path of every entry in the download directory.
image_dir = "/home/user/project-fur/e6_dump/1024"
list_image = [os.path.join(image_dir, name) for name in os.listdir(image_dir)]

In [None]:
import multiprocessing

# Run check_error over every file in parallel. Each call returns a list, so
# `results` is a list of lists.
# NOTE(review): the 80 worker processes are hard-coded -- confirm this suits the host.
with multiprocessing.Pool(processes=80) as pool:
    results = pool.map(check_error, list_image)
    print(results)


In [None]:
# Flatten the per-file result lists into one list of broken file paths.
flat_list = [element for sublist in results for element in sublist]

In [None]:
# Reduce each broken path to its file name, then to the text before the first
# "." (the md5 for files named <md5>.webp).
broken_image = [path.split("/")[-1].split(".")[0] for path in flat_list]
broken_image

In [None]:
# Re-download every image that failed validation.
# Rows are taken from the full export rather than from subset_e6_dump_data. An
# earlier cell removes from subset_e6_dump_data every md5 that already had a
# file on disk, so a broken file left by a previous run would never be matched.
remainder_data = e6_dump_data[e6_dump_data["md5"].isin(broken_image)]
download(remainder_data, save_path)

# perform comparison with old dataset

In [None]:
# Reload the database export and the training post list from disk.
e6_dump_data = pd.read_csv("posts-2023-04-08.csv")
training_data = pd.read_csv("posts.csv")

# Restrict both tables to the posts whose md5 appears in each of them.
subset_e6_dump_data = e6_dump_data[e6_dump_data["md5"].isin(training_data["md5"])]
training_data = training_data[training_data["md5"].isin(subset_e6_dump_data["md5"])]

In [None]:
def _dimension_key(posts: pd.DataFrame) -> pd.Series:
    """Return one "<width>x<height>" string per post, ordered by md5.

    The "x" separator keeps the key unambiguous: without it, 12 wide by 345
    high and 123 wide by 45 high would both become "12345" and compare equal.
    The index is reset so two keyed series compare by position.
    """
    ordered = posts.sort_values("md5")
    key = ordered["image_width"].astype(str) + "x" + ordered["image_height"].astype(str)
    return key.reset_index(drop=True)


# Dimension keys of the current export (new) and of the training set (old).
new_data = _dimension_key(subset_e6_dump_data)
old_data = _dimension_key(training_data)

In [None]:
# Count the positions where the two dimension strings match (True) or differ (False).
# NOTE(review): this is a position-wise comparison, so it assumes both series
# have the same length and the same md5 order -- confirm.
(new_data == old_data).value_counts()

In [None]:
# Print the size of each compared series, new first, one per line.
for series in (new_data, old_data):
    print(len(series))