In [1]:
import xml.etree.ElementTree
import os
import pandas as pd
import collections

In [2]:
data_paths = [
    "./data/ImageCLEF2013PlantTaskTrainPackage-PART-1/train/",
    "./data/ImageCLEF2013PlantTaskTrainPackage-PART-2/train/"
]

files = [file for data_path in data_paths for file in os.scandir(data_path)]

In [9]:
def getIdFromFilename(file_name):
    return int(file_name.split(".")[0])

def getFileTypeFromFilename(file_name):
    if(file_name.endswith("xml")):
        return "xml"
    elif file_name.endswith("jpg"):
        return "jpg"
    else:
        return "unknown"
    
data = pd.DataFrame([(getIdFromFilename(file.name), file.path) for file in files], columns = ["id", "path"])

data["f_type"] = data.path.map(getFileTypeFromFilename)

data = data.pivot("id", columns = ["f_type"], values = ["path"])

data.columns = data.columns.droplevel(0)
data.columns.name = ""

data

Unnamed: 0_level_0,jpg,xml
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
1,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
2,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
3,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
4,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
...,...,...
36301,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
36305,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
36306,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...
36307,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,./data/ImageCLEF2013PlantTaskTrainPackage-PART...


In [10]:
PlantImageData = collections.namedtuple(
    "PlantImageData",
    "content species"
)

def parseXMLData(xml_path):
    tree = xml.etree.ElementTree.parse(xml_path)
    
    return PlantImageData(
        tree.find("./Content").text,
        tree.find("./Taxon/Species").text,
    )

In [11]:
data[["content", "species"]] = pd.DataFrame(data.xml.map(parseXMLData).tolist())
data = data.dropna().drop("xml", axis = 1).copy()

In [12]:
data

Unnamed: 0_level_0,jpg,content,species
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Corylus avellana L.
1,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Ruscus aculeatus L.
2,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Phillyrea angustifolia L.
3,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Rhamnus alaternus L.
4,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Hedera helix L.
...,...,...,...
20976,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Olea europaea L.
20977,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Stem,Liquidambar styraciflua L.
20978,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Albizia julibrissin Durazz.
20982,./data/ImageCLEF2013PlantTaskTrainPackage-PART...,Leaf,Cotinus coggygria Scop.


In [13]:
data.content.value_counts()

Leaf      7703
Flower    2027
Entire     815
Fruit      799
Stem       790
Name: content, dtype: int64

In [14]:
data.species.value_counts()

Quercus ilex L.                                          264
Ulmus minor Mill.                                        240
Viburnum tinus L.                                        223
Hedera helix L.                                          202
Cercis siliquastrum L.                                   200
                                                        ... 
Matthiola sinuata (L.) R.Br.                               7
Plantago media L.                                          7
Blechnum spicant (L.) Sm.                                  6
Gentiana pneumonanthe L.                                   6
Parthenocissus tricuspidata (Siebold & Zucc.) Planch.      6
Name: species, Length: 245, dtype: int64

In [15]:
data.to_json("data/data.json")