# Setup environment

In [None]:
import pandas as pd
import requests
import os
import zipfile
from tqdm import tqdm
import concurrent.futures

In [61]:
# !rm -r /kaggle/working/images-b3.zip

# Config

In [None]:
# All competition CSVs live in the same Kaggle input directory.
COMPETITION_DIR = "../input/csc-hackathon-2023-lunua-task"

TEST_DATA = f"{COMPETITION_DIR}/test-data.csv"              # test pairs
TRAIN_DATA = f"{COMPETITION_DIR}/train.csv"                 # training pairs
SUBM_DATA = f"{COMPETITION_DIR}/test-submission.csv"        # submission file

# Dataloading Helpers

In [None]:
# Read the training CSV first, then the test CSV, into separate frames.
data, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_DATA, TEST_DATA))

# Total number of cells (rows x columns) in the training frame.
data.size

In [58]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
test_df.size

67980

In [59]:
# Collect the URLs from both image columns and drop duplicates, so each
# distinct image is downloaded only once.
first_column_urls = test_df['image_url1'].tolist()
second_column_urls = test_df['image_url2'].tolist()
unique_urls = set(first_column_urls + second_column_urls)
image_urls = list(unique_urls)

# Number of distinct images to fetch.
len(image_urls)

43710

In [60]:
def download_image(url, output_dir, timeout=30):
    """Download a single image and save it inside ``output_dir``.

    The file is named after the last ``/``-separated segment of ``url``.

    Args:
        url: Address of the image to fetch.
        output_dir: Directory the file is written into; it must already exist.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block its worker thread forever.

    Raises:
        requests.exceptions.RequestException: On connection errors, timeouts,
            or a 4xx/5xx HTTP response.
        OSError: If the output file cannot be written.
    """
    filename = url.split('/')[-1]
    output_path = os.path.join(output_dir, filename)

    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before the file is created so that an HTTP error page is never
        # stored on disk as if it were an image.
        response.raise_for_status()

        with open(output_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # skip empty chunks
                    file.write(chunk)

def scrape_images(urls, num_threads, output_dir="images"):
    """Download every URL in ``urls`` concurrently, showing a progress bar.

    Args:
        urls: Iterable of image URLs; each one is passed to ``download_image``.
        num_threads: Maximum number of worker threads in the pool.
        output_dir: Directory the images are saved into. It is created if
            missing. Defaults to ``"images"`` (relative to the working directory).

    Raises:
        Exception: The first error raised by a download is re-raised here via
            ``future.result()``. Downloads that were already submitted still
            run to completion, because leaving the executor's ``with`` block
            waits for them.
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_dir, exist_ok=True)

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(download_image, url, output_dir) for url in urls]

        # Use tqdm to display the progress as downloads finish (in completion order).
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), unit="image"):
            # result() surfaces any exception raised inside the worker thread.
            future.result()
            
# Size of the download thread pool.
num_threads = 20

# Fetch every unique test image.
scrape_images(urls=image_urls, num_threads=num_threads)

100%|██████████| 43710/43710 [19:22<00:00, 37.59image/s]


In [63]:
# List what actually landed on disk and count it, for comparison with the
# number of URLs requested.
images_path = '/kaggle/working/images'
file_list = os.listdir(images_path)
len(file_list)

43710

In [None]:
directory = '/kaggle/working/images'
file_list = os.listdir(directory)

# Split the file names into at most `num_batches` consecutive chunks.
num_batches = 3
# Ceiling division: with floor division a remainder would spill into an extra
# (fourth) batch. max(1, ...) keeps the range() step non-zero when the
# directory holds fewer files than batches (or none at all).
batch_size = max(1, -(-len(file_list) // num_batches))
batches = [file_list[i:i+batch_size] for i in range(0, len(file_list), batch_size)]

In [None]:
# Number of file names in the first batch.
len(batches[0])

In [64]:
# Pack every downloaded image into a single flat archive.
zip_filename = 'images-test.zip'
# Defined here because `output_dir` only exists as a local inside
# scrape_images; relying on it raised NameError on a fresh kernel.
# This is the directory `file_list` was listed from.
images_dir = '/kaggle/working/images'

with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for filename in tqdm(file_list):
        # arcname stores each entry under its bare file name, without the
        # directory prefix.
        zipf.write(os.path.join(images_dir, filename), arcname=filename)

print("Images zipped successfully!")

100%|██████████| 43710/43710 [01:04<00:00, 679.20it/s] 


Images zipped successfully!
