## Checking available files in dataset

Collecting metadata (mode, size and format) from each image file

In [26]:
import os
from tqdm import tqdm
from PIL import Image
import pandas as pd
from typing import Tuple

In [27]:
def check_files(datapath:str, savepath:str=None)-> Tuple[pd.DataFrame,list]:
    """ Walk the data folder recursively and collect metadata of every JPEG image
    Args:
        datapath (str): path to data folder
        savepath (str, optional): path to csv dataframe to save. Defaults to None,
            in which case nothing is written.
    Returns:
        pd.DataFrame: one row per readable image with the columns
            file_path, img_size, img_mode and img_format
        list: list of erroneous files (paths that could not be opened or read)
    """
    # The accumulators are created once, before the walk, so results from every
    # directory are kept and the names exist even if the walk yields nothing.
    path, mode, size, form, error_ls = [], [], [], [], []
    for root, _, files in os.walk(datapath):
        for file in files:
            filepath = os.path.join(root, file)
            if not check_file_extension(filepath):
                continue
            try:
                # Opening happens inside the try so an unreadable file is
                # recorded instead of aborting the scan; the context manager
                # releases the file handle once the attributes are read.
                with Image.open(filepath) as img:
                    img_mode, img_size, img_format = img.mode, img.size, img.format
            except Exception:
                # Best-effort scan: any failure marks the file as erroneous.
                error_ls.append(filepath)
                continue
            # Append only after all attributes were read so columns stay aligned.
            path.append(filepath)
            mode.append(img_mode)
            size.append(img_size)
            form.append(img_format)

    df = pd.DataFrame.from_dict({'file_path': path, 'img_size': size, 'img_mode':mode, 'img_format':form})

    if savepath:
        df.to_csv(savepath, index=False)

    return df, error_ls

def check_file_extension(file:str)->bool:
    """ Check whether a file name or path has a JPEG extension
    Args:
        file (str): file name or path to check
    Returns:
        Bool: True if the extension is .jpeg or .jpg (compared case-insensitively)
    """
    # Lower-case the extension so variants such as ".JPG" are accepted as well.
    fileext = os.path.splitext(file)[1].lower()
    return fileext in (".jpeg", ".jpg")


In [28]:
# Scan the data folder and write the collected image metadata to a csv file.
datapath = "../data"
savepath = "dataset.csv"

df, error = check_files(datapath=datapath, savepath=savepath)
# Number of files that could not be read
len(error)

0

## Creating metadata columns from the file paths

In [3]:
import pandas as pd

In [22]:
def get_folder(path:str)->str:
    """ Return the fourth '/'-separated component of the path
    Args:
        path (str): file path using '/' as separator
    Returns:
        str: component at index 3 (presumably the folder under the split
            directory, e.g. ../data/<tts>/<folder>/... -- confirm against dataset layout)
    """
    components = path.split('/')
    return components[3]

def get_type(path:str)->str:
    """ Return the second '_'-separated token of the file name
    Args:
        path (str): file path using '/' as separator
    Returns:
        str: token at index 1 of the last path component split on '_'
    """
    filename = path.rsplit('/', 1)[-1]
    tokens = filename.split('_')
    return tokens[1]

def get_tts(path:str)->str:
    """ Return the third '/'-separated component of the path
    Args:
        path (str): file path using '/' as separator
    Returns:
        str: component at index 2 (presumably the train/test split
            directory -- confirm against dataset layout)
    """
    components = path.split('/')
    return components[2]

# Derive one column per path-based attribute; each helper is applied to every
# file path. Column order (tts, folder, type) is kept as it defines the csv layout.
df['tts'] = df['file_path'].apply(get_tts)
df['folder'] = df['file_path'].apply(get_folder)
df['type'] = df['file_path'].apply(get_type)

In [24]:
# Write the dataframe, including the derived columns, to csv without the index.
savepath = "dataset.csv"
df.to_csv(path_or_buf=savepath, index=False)