In [None]:
pip install -U "huggingface_hub[cli]"

In [None]:
# Installing required libraries 
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q datasets

In [1]:
# Getting the required libraries 
import json 
from pathlib import Path
import os
import shutil
import pandas as pd
import random 

### Preprocessing the Dataset for finetuning 

### Images

if below dataset structure is required. Use the code snippet below.

In [2]:
def create_dataset_structure(source_folder, dest_folder, split_percent):
    """
    Create File Structure for the Dataset (huggingface)

    Parameters:
        - source_folder (str): Path to folder containing images.
        - dest_folder (str): Path to the destination folder for the dataset structure.
        - split_percent (str): Percentage of images to be used for training.

    output: Appropriately makes the file structure according to huggingface 
            my_dataset_repository
                            ├── folder/train/image1.png
                            ├── folder/train/image2.png
                            ├── folder/train/image3.png
                            ├── folder/test/image4.png
                            ├── folder/test/metadata.jsonl
                            └── folder/train/metadata.jsonl
    """
    ## creating testing and training folders in the destination folder
    train_folder = os.path.join(dest_folder, 'train')
    test_folder = os.path.join(dest_folder, 'test')

    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok = True)

    # getting the list of images in source folder 
    image_files = [f for f in os.listdir(source_folder) if f.endswith(('.jpg', '.jpeg','.png', '.gif'))]

    # Calculate the number of images for train and test based on split 
    num_train = int(len(image_files)* split_percent)
    num_test =  int(len(image_files) * (1 - split_percent))

    # randomly shuffling images
    random.shuffle(image_files)

    # copying images to train folder 
    for image_file in image_files[:num_train]:
        source_path = os.path.join(source_folder, image_file)
        dest_path = os.path.join(train_folder, image_file)
        shutil.copy(source_path, dest_path)

    # copying images to test folder
    for image_file in image_files[num_train:]:
        source_path = os.path.join(source_folder, image_file)
        dest_path = os.path.join(test_folder, image_file)
        shutil.copy(source_path, dest_path)
    

In [3]:
source_folder = r"C:\Users\smeet\Desktop\data science capstone\images_and_captions\Images"
dest_folder = r"C:\Users\smeet\Desktop\data science capstone\images_and_captions\Image_Split"
split_percent = 0.80
create_dataset_structure(source_folder, dest_folder, split_percent )

In [4]:
#### checking if there are images lost while doing preprocessing
print("Number of images in source folder: ", len(os.listdir(source_folder)))
print("Number of images in destination train folder: ", len(os.path.join(dest_folder, 'train')))
print("Number of images in destination test folder: ", len(os.path.join(dest_folder, 'test')))

7195

Captions

In [5]:
def hugging_face_model_format(source_folder, dest_folder):
    """ 
    input (str) : Path to captions (processed) json file
    source_folder: This is where our processed captions will be stored
    dest_folder: This is where our images (train/test) are saved. 

    output: Creates a output JSON File that is required by for the huggingface input model 
           eg :  [{ "file_name" : "image.name" , "text" : "description of the text"} , ...
    """
    
    ## getting the processed captions 
    with open(source_folder , 'r') as f:
        data = json.load(f)
    
    ## this model requires a  list of captions where each image is in {} format shown above in docstring
    random_captions = []
    for k , (image,caption) in data.items():
        random_captions.append({"file_name" : str(image) , "text": str(caption)})
    
    ## Each image has 5 captions each --> so we randomly drop the duplicate captions 
    df = pd.DataFrame(random_captions)
    df_unique = df.sample(frac=1).drop_duplicates(subset='file_name').reset_index(drop=True)

    random_captions = df_unique.to_dict(orient='records')
    
    # The images folder need a metadata.jsonl file to associate images with captions. [specific hugginface model requirement]
    with open(os.path.join(dest_folder, 'metadata.jsonl'), 'w') as f:
        for item in random_captions:
            f.write(json.dumps(item) + '\n')
    return random_captions

# 
source_folder  = r"C:\Users\smeet\Desktop\data science capstone\data_structure\data_captions_processed.json"
dest_folder = r"C:\Users\smeet\Desktop\data science capstone\images_and_captions\Image_Split"
captions = hugging_face_model_format(source_folder, dest_folder)

physically moved the metadata.jsonl files into the folders 

In [8]:
# checking if still our output has same number of captions.
with open(source_folder, 'r') as f:
    data = json.load(f)

len(data)
len(captions)*5

40455