## Overview

This script will move downloaded images to `prepare_dataset`. The process is briefed as follows:

- Images downloaded from `notebooks/download.ipynb` will be stored in `download_file` folder.
- Under `download_file` folder, images will be grouped according to the polygon they belong to.
- In each polygon, 2 types of images are present: geotiff with band information, and a mask file.
- Geotiffs should be moved and stored in `prepare_dataset/images_directory{group_id}` folder.
- Mask files are stored in `prepare_dataset/mask_directory{group_id}` folder.

## Implementation

In [2]:
import os
import glob
import shutil
import pandas as pd


### Get all subfolders

In [3]:
def fast_scandir(dirname: str) -> list:
    """
    Scan and return all subfolders of a directory.
    """
    subfolders= [f.path for f in os.scandir(dirname) if f.is_dir()]
    for dirname in list(subfolders):
        subfolders.extend(fast_scandir(dirname))
    return subfolders

source_path = "../download_file/"
subfolders_list = fast_scandir(source_path)
print(f"Number of subfolders: {len(subfolders_list)}")

for folder in subfolders_list:
    print(folder)


Number of subfolders: 1327
../download_file/S2B_10UFC_20230828_0_L2A
../download_file/S2A_10UEA_20230813_0_L2A
../download_file/S2A_10UFV_20230813_0_L2A
../download_file/S2B_10UGA_20230815_0_L2A
../download_file/S2A_10UEC_20230816_0_L2A
../download_file/S2A_10UEB_20230816_0_L2A
../download_file/S2B_10UFB_20230828_0_L2A
../download_file/S2B_11ULU_20230828_0_L2A
../download_file/S2A_11ULT_20230820_0_L2A
../download_file/S2B_11ULR_20230815_0_L2A
../download_file/S2B_10UGB_20230828_0_L2A
../download_file/S2B_10UGC_20230828_0_L2A
../download_file/S2B_10UGD_20230828_0_L2A
../download_file/S2B_10UEC_20230828_0_L2A
../download_file/S2A_10UFA_20230813_0_L2A
../download_file/S2B_10UFV_20230815_0_L2A
../download_file/S2B_10UEA_20230815_0_L2A
../download_file/S2B_10UFC_20230828_0_L2A/20230828
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/6_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/13_11
../download_file/S2B_10UFC

### Get folders with restrictions - Regex

In [4]:
def get_subfolders_with_keyword(keyword: str, subfolders_list: list = subfolders_list) -> list:
    subfolders_with_keyword_list = []

    for folder in subfolders_list:
        if keyword in folder:
            subfolders_with_keyword_list.append(folder)

    return subfolders_with_keyword_list

subfolders_with_keyword_list = get_subfolders_with_keyword("tiles/") # note that we need the / to get folders
for folder in subfolders_with_keyword_list:
    print(folder)


../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/6_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/13_11
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/1_9
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/2_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_5
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/10_8
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/2_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/13_10
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/6_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_3
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_4
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/10_9
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/3_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_10
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/7_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/7_12
../dow

### Get folders with restrictions - DataFrame

#### 1. Turn paths into columns

In [7]:
raw_df = pd.DataFrame(subfolders_list, columns=["path_name"])
raw_df.head()


Unnamed: 0,path_name
0,../download_file/S2B_10UFC_20230828_0_L2A
1,../download_file/S2A_10UEA_20230813_0_L2A
2,../download_file/S2A_10UFV_20230813_0_L2A
3,../download_file/S2B_10UGA_20230815_0_L2A
4,../download_file/S2A_10UEC_20230816_0_L2A


In [8]:
test_df = pd.DataFrame([x.rsplit('/') for x in raw_df['path_name']])
test_df


Unnamed: 0,0,1,2,3,4,5
0,..,download_file,S2B_10UFC_20230828_0_L2A,,,
1,..,download_file,S2A_10UEA_20230813_0_L2A,,,
2,..,download_file,S2A_10UFV_20230813_0_L2A,,,
3,..,download_file,S2B_10UGA_20230815_0_L2A,,,
4,..,download_file,S2A_10UEC_20230816_0_L2A,,,
...,...,...,...,...,...,...
1322,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,4_7
1323,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,5_12
1324,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,1_13
1325,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,1_12


#### 2. Filter out rows that have `tile` as part of the path names

In [10]:
df_subfolder_under_tiles = test_df[((test_df[4] == "tiles") & (test_df[5].notnull()))].reset_index(drop=True)
df_subfolder_under_tiles


Unnamed: 0,0,1,2,3,4,5
0,..,download_file,S2B_10UFC_20230828_0_L2A,20230828,tiles,6_12
1,..,download_file,S2B_10UFC_20230828_0_L2A,20230828,tiles,13_11
2,..,download_file,S2B_10UFC_20230828_0_L2A,20230828,tiles,1_9
3,..,download_file,S2B_10UFC_20230828_0_L2A,20230828,tiles,2_13
4,..,download_file,S2B_10UFC_20230828_0_L2A,20230828,tiles,12_5
...,...,...,...,...,...,...
1271,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,4_7
1272,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,5_12
1273,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,1_13
1274,..,download_file,S2B_10UEA_20230815_0_L2A,20230815,tiles,1_12


#### 3. Reconcatenate to pathnames and indicate date

In [11]:
df_selected_paths_date = df_subfolder_under_tiles
df_selected_paths_date["file_path"] = df_selected_paths_date[[0, 1, 2, 3, 4, 5]].apply(lambda x: "/".join(x), axis=1)
df_selected_paths_date["date"] = df_selected_paths_date[3].apply(lambda x: pd.to_datetime(x, format="%Y%m%d"))
df_selected_paths_date.drop([0, 1, 2, 3, 4, 5], axis=1, inplace=True)
df_selected_paths_date


Unnamed: 0,file_path,date
0,../download_file/S2B_10UFC_20230828_0_L2A/2023...,2023-08-28
1,../download_file/S2B_10UFC_20230828_0_L2A/2023...,2023-08-28
2,../download_file/S2B_10UFC_20230828_0_L2A/2023...,2023-08-28
3,../download_file/S2B_10UFC_20230828_0_L2A/2023...,2023-08-28
4,../download_file/S2B_10UFC_20230828_0_L2A/2023...,2023-08-28
...,...,...
1271,../download_file/S2B_10UEA_20230815_0_L2A/2023...,2023-08-15
1272,../download_file/S2B_10UEA_20230815_0_L2A/2023...,2023-08-15
1273,../download_file/S2B_10UEA_20230815_0_L2A/2023...,2023-08-15
1274,../download_file/S2B_10UEA_20230815_0_L2A/2023...,2023-08-15


#### Save and copy selected file paths to a list

In [12]:
selected_path_list = df_selected_paths_date["file_path"].values.tolist()

for path in selected_path_list:
    print(path)


../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/6_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/13_11
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/1_9
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/2_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_5
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/10_8
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/2_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/13_10
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/6_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_3
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_4
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/10_9
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/3_12
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/12_10
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/7_13
../download_file/S2B_10UFC_20230828_0_L2A/20230828/tiles/7_12
../dow

### Given a list of directories, get a list of files inside, and move to the selected directory

For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [13]:
def get_list_of_files_in_directory(directory_name: str, keyword: str = ".tif") -> list:
    return [f"{directory_name}/{f}" for f in os.listdir(directory_name) if f.endswith(keyword)]

test_get_list_of_files_in_directory = get_list_of_files_in_directory(subfolders_with_keyword_list[0])
print(test_get_list_of_files_in_directory)


[]


For now, I will use the shorter list - `subfolders_with_keyword_list`.

In [14]:
def move_file(source_path: str, label: str, id: int) -> None:
    """
    label: either "image" or "mask"
    the moved file will be named as `date_original-name`
    """
    file_name = source_path.split("/")[-1]
    file_date = source_path.split("/")[-4]
    destination_folder = f"../prepare_dataset/{label}_directory{id}"
    destination_path = f"../prepare_dataset/{label}_directory{id}/{file_date}_{file_name}"

    if not os.path.isdir(destination_folder):
        os.makedirs(os.path.dirname(destination_path))

    if os.path.isfile(destination_path):
        print("File exists.")
        return

    shutil.copy(source_path, destination_path)
    print(f"File copied to destination: {destination_path}.")

test_move_file = move_file(test_get_list_of_files_in_directory[1], "image", 1)


IndexError: list index out of range

#### Batch move files

There are two things we need to do:

1. Construct a set so that geotif under the same folder gets the same `id` when `move_file()` gets executed.
2. `mask.tif` should get the label `mask`, others get `image`.

In [15]:
def batch_move_files(source_path_list: list) -> None:
    path_dict = {}

    for i in range(len(source_path_list)):
        current_path = source_path_list[i]
        current_folder = current_path.rsplit("/", 1)[0] # split on the last occurrence

        if current_folder not in path_dict:
            path_dict[current_folder] = len(path_dict)

        current_id = path_dict[current_folder]

        if "mask" in current_path:
            move_file(current_path, "mask", current_id)
        else:
            print(f"image: {current_path}")
            move_file(current_path, "image", current_id)

batch_move_files(test_get_list_of_files_in_directory)
