## Converting annotations to pandas for PyTorch dataloader

### This notebook reads two folders (XML annotations and RGB images) and builds a DataFrame of bounding-box annotations in which every box is given the label 1 (tree). Annotations that do not have a corresponding RGB image are dropped. The resulting DataFrame is saved as a CSV file for training.

### Imports

In [1]:
from deepforest import utilities
import os 
import pandas as pd

### Convert all xml files within annotation folder to one dataframe

In [2]:
# Path to the folder containing the XML annotation files
folder_path = 'C:\\Users\\zhou.m\\Documents\\2023_Fall\\NeonTree\\weecology\\annotations'

# One DataFrame per XML file is collected here
dfs_list = []

# Convert every XML file in the folder to a DataFrame.
# os.listdir returns names in arbitrary order, so the listing is sorted to
# keep the order of the concatenated rows (and therefore the order of the
# boxes inside each per-image list below) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if filename.endswith('.xml'):
        xml_file_path = os.path.join(folder_path, filename)
        df = utilities.xml_to_annotations(xml_file_path)
        dfs_list.append(df)

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; raise the same exception type with a message that names the folder.
if not dfs_list:
    raise ValueError(f"No .xml annotation files found in {folder_path}")

# Concatenate all per-file DataFrames into one final DataFrame
final_df = pd.concat(dfs_list, ignore_index=True)

# Convert all labels to 1 (single "tree" class)
final_df['label'] = 1


def _as_list(column):
    """Return the values of one grouped column as a plain Python list."""
    return column.tolist()


# Collapse to one row per image: every box coordinate column and the label
# column become a list holding that image's values in row order.
final_df = final_df.groupby('image_path').agg(
    {name: _as_list for name in ('xmin', 'ymin', 'xmax', 'ymax', 'label')})

# Print the final DataFrame
print(final_df)

                                                                                        xmin  \
image_path                                                                                     
10.tif                                     [76, 116, 79, 13, 33, 284, 198, 197, 250, 207,...   
12.tif                                     [209, 75, 89, 70, 22, 76, 83, 65, 150, 128, 11...   
15.tif                                     [44, 71, 392, 82, 118, 331, 428, 383, 14, 377,...   
2018_BART_4_322000_4882000_image_crop.tif  [595, 781, 977, 1103, 1125, 1024, 687, 51, 45,...   
2018_HARV_5_733000_4698000_image_crop.tif  [1, 539, 504, 546, 575, 424, 413, 475, 587, 37...   
...                                                                                      ...   
WREF_070_2019.tif                          [259, 360, 379, 217, 231, 246, 49, 361, 223, 2...   
WREF_072_2019.tif                          [343, 284, 338, 331, 215, 276, 191, 1, 105, 31...   
WREF_075_2019.tif                       

### Compare annotation image path list with existing RGB imagery

In [3]:
# Folder that holds the RGB imagery to compare the annotations against
image_dir = 'C:\\Users\\zhou.m\\Documents\\2023_Fall\\NeonTree\\weecology\\evaluation\\RGB'

# Names of all entries currently present in the image folder
image_files = os.listdir(image_dir)

# The DataFrame's index carries the image names of the annotations
df_image_names = final_df.index

# Boolean mask: True where an annotated image name is also found on disk
has_rgb_image = df_image_names.isin(image_files)

# Keep only the annotation rows whose image file exists
filtered_df = final_df.loc[has_rgb_image]

filtered_df

Unnamed: 0_level_0,xmin,ymin,xmax,ymax,label
image_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018_SJER_3_252000_4104000_image_628.tif,"[29, 362, 325, 334, 174, 230, 265, 153]","[1, 206, 2, 60, 6, 56, 167, 63]","[94, 400, 378, 390, 211, 254, 333, 209]","[31, 299, 62, 122, 50, 84, 232, 126]","[1, 1, 1, 1, 1, 1, 1, 1]"
2018_SJER_3_252000_4106000_image_234.tif,[92],[158],[174],[233],[1]
2018_SJER_3_252000_4106000_image_326.tif,"[138, 20, 138]","[265, 54, 27]","[178, 66, 247]","[297, 105, 149]","[1, 1, 1]"
2018_SJER_3_252000_4106000_image_66.tif,"[67, 99]","[372, 298]","[102, 206]","[400, 399]","[1, 1]"
2018_SJER_3_252000_4107000_image_372.tif,"[298, 211]","[309, 142]","[400, 324]","[400, 254]","[1, 1]"
...,...,...,...,...,...
WREF_070_2019.tif,"[259, 360, 379, 217, 231, 246, 49, 361, 223, 2...","[50, 79, 73, 56, 49, 50, 58, 12, 9, 9, 51, 104...","[271, 378, 400, 231, 242, 258, 69, 394, 260, 2...","[69, 105, 101, 70, 63, 64, 81, 43, 43, 35, 82,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
WREF_072_2019.tif,"[343, 284, 338, 331, 215, 276, 191, 1, 105, 31...","[142, 1, 81, 3, 300, 185, 362, 243, 64, 342, 3...","[400, 331, 400, 400, 302, 386, 261, 67, 158, 3...","[193, 51, 143, 77, 377, 299, 400, 348, 114, 40...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
WREF_075_2019.tif,"[314, 199, 314, 251, 336, 374, 301, 40, 286, 1...","[267, 271, 150, 177, 1, 98, 1, 65, 165, 197, 1...","[341, 235, 358, 299, 364, 400, 332, 71, 311, 2...","[297, 313, 203, 238, 30, 129, 25, 103, 198, 25...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
WREF_083_2019.tif,"[373, 130, 1, 294, 369, 2, 20, 183, 339, 206, ...","[231, 369, 366, 344, 38, 189, 90, 1, 331, 154,...","[400, 189, 17, 341, 400, 50, 78, 258, 400, 275...","[258, 400, 399, 396, 104, 234, 165, 32, 400, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [6]:
# Write the filtered annotations to a CSV file in the current working
# directory; the DataFrame index is written as the first column.
output_csv = 'annotations_filtered.csv'
filtered_df.to_csv(output_csv)