<img src="../../img/evaluation_data_preparation.png" alt="Evaluation data preparation" style="width: 100%; border-radius: 20px;"/>

In [1]:
%%HTML
<style>
    /* Sets the --vscode-font-family CSS custom property on <body> to "Itim";
       presumably this restyles the notebook font when viewed in VS Code (TODO confirm). */
    body {
        --vscode-font-family: "Itim"
    }
</style>

In [2]:
import sys
sys.path.append('../')
sys.path.append('../../')

import pandas as pd

from utils.data_preparation import *

## Validation data

### Swiss data

In [3]:
# --- Swiss validation data: input locations ---------------------------------
path_validata_ch = '../../../01_Data/datasets/validata_ornitho_ch_2023.csv'
# Date pattern used by the Swiss export (ch: '%d.%m.%Y'; de: '%m/%d/%Y')
date_format = '%d.%m.%Y'

# Auxiliary files handed to the preparation helpers
path_translator_names = '../../../01_Data/translators/translation_species_names_de_vs_ch.csv'
path_eea_grids = '../../../01_Data/shp_files/grids/eea_europe_grids_50km/inspire_compatible_grid_50km.shp'

# --- Read, standardize and grid-assign ---------------------------------------
# get_delimiter / standardize_data / assign_eea_grids come from utils.data_preparation
validata_ch = pd.read_csv(
    path_validata_ch,
    delimiter=get_delimiter(path_validata_ch),
    low_memory=False,
)
validata_ch = standardize_data(
    validata_ch,
    date_format=date_format,
    path_translator_species_names=path_translator_names,
)
validata_ch = assign_eea_grids(validata_ch, path_eea_grids)

# Store the two numeric columns as floats
for numeric_col in ('total_count', 'altitude'):
    validata_ch[numeric_col] = validata_ch[numeric_col].astype(float)

In [4]:
# Order the Swiss records by id_validata and renumber the row index from 0
validata_ch = validata_ch.sort_values('id_validata', ignore_index=True)

In [5]:
# Read the Swiss evaluation file that also carries the ground-truth labels
path_validata_with_ground_truth = '../../../01_Data/datasets/evaluation_data_with_gt_ch.csv'
validata_with_ground_truth = pd.read_csv(
    path_validata_with_ground_truth,
    delimiter=get_delimiter(path_validata_with_ground_truth),
    low_memory=False,
)

# Lower-case the column headers, then order rows by id_validata with a fresh 0..n-1 index
validata_with_ground_truth.columns = validata_with_ground_truth.columns.str.lower()
validata_with_ground_truth = validata_with_ground_truth.sort_values('id_validata', ignore_index=True)

# Keep only the id and the change type.
# Not used: 'date_original', 'coord_lat_original', 'coord_lon_original', 'altitude_original'
ground_truth_ch = validata_with_ground_truth[['id_validata', 'change_type']]


In [6]:
# Attach the ground-truth change type to each Swiss record.
# Left join: records without a ground-truth row keep NaN in 'change_type'.
validata_ch = validata_ch.merge(ground_truth_ch, how='left', on='id_validata')

In [7]:
# Remove the columns that are not carried into the master validation set
validata_ch = validata_ch.drop(
    ['id_sighting', 'timing', 'precision', 'id_observer'],
    axis='columns',
)

In [8]:
# Rename 'change_type' to 'edit_type', then derive the helper columns:
#   edited  -> 1 where an edit_type is present, 0 otherwise
#   country -> constant 'ch' marker for the Swiss subset
validata_ch = (
    validata_ch
    .rename(columns={'change_type': 'edit_type'})
    .assign(
        edited=lambda frame: frame['edit_type'].notna().astype(int),
        country='ch',
    )
)

### German data

In [9]:
# --- German validation data: input locations (nothing is loaded in this cell) ---
path_validata = '../../../01_Data/datasets/validata_ornitho_de_2023.csv'

# Date pattern used by the German export (ch: '%d.%m.%Y'; de: '%m/%d/%Y')
date_format = '%m/%d/%Y'

# Auxiliary files handed to the preparation helpers
path_eea_grids = '../../../01_Data/shp_files/grids/eea_europe_grids_50km/inspire_compatible_grid_50km.shp'
path_translator_ids = '../../../01_Data/translators/translation_species_id_de_vs_ornitho.csv'
path_translator_names = '../../../01_Data/translators/translation_species_names_de_vs_ch.csv'

In [10]:
# Read the German validation CSV, standardize it and assign EEA grids
# (get_delimiter / standardize_data / assign_eea_grids come from utils.data_preparation)
validata_de = pd.read_csv(
    path_validata,
    delimiter=get_delimiter(path_validata),
    low_memory=False,
)
validata_de = standardize_data(
    validata_de,
    path_translator_species_names=path_translator_names,
    path_translator_species_ids=path_translator_ids,
    date_format=date_format,
)
validata_de = assign_eea_grids(validata_de, path_eea_grids)

# Store the two numeric columns as floats
for numeric_col in ('total_count', 'altitude'):
    validata_de[numeric_col] = validata_de[numeric_col].astype(float)

In [11]:
# Read the German ground-truth file (the list of manipulated records)
validata_de_gt_path = '../../../01_Data/datasets/Validata_DE_manipulierte_DS.txt'
validata_de_gt = pd.read_csv(
    validata_de_gt_path,
    delimiter=get_delimiter(validata_de_gt_path),
    low_memory=False,
)
validata_de_gt.columns = validata_de_gt.columns.str.lower()

# The ids are read as text; swap ',' for '.' so they parse as floats,
# then truncate to int so they can serve as the merge key.
validata_de_gt['id_validata_de'] = (
    validata_de_gt['id_validata_de']
    .str.replace(',', '.')
    .astype(float)
    .astype(int)
)

In [12]:
# Attach the ground-truth columns to each German record.
# Left join: records without a ground-truth row get NaN in the added columns.
validata_de = validata_de.merge(validata_de_gt, how='left', on='id_validata_de')

In [13]:
# Remove the columns that are not carried into the master validation set
validata_de = validata_de.drop(['timing', 'precision'], axis='columns')

In [14]:
# Rename the German id column to 'id_validata' (the name the Swiss frame uses),
# then derive the helper columns:
#   edited  -> 1 where an edit_type is present, 0 otherwise
#   country -> constant 'de' marker for the German subset
validata_de = (
    validata_de
    .rename(columns={'id_validata_de': 'id_validata'})
    .assign(
        edited=lambda frame: frame['edit_type'].notna().astype(int),
        country='de',
    )
)

In [15]:
# Translate the German edit-type labels into the labels used in the master set.
# Series.map with a dict yields NaN for any value that is not a key.
edit_types = {
    'Habitat': 'habitat (coord)',
    'Verbreitung': 'distribution (coord)',
    'Phänologie': 'date',
    'Anzahl': 'count',
}
validata_de['edit_type'] = validata_de['edit_type'].map(edit_types)

In [17]:
# One sighting has NaN as coord_lon; keep only the rows where coord_lon is present
has_longitude = validata_de['coord_lon'].notna()
validata_de = validata_de[has_longitude]

### Merge Swiss and German validation data

In [19]:
# Stack the German and Swiss validation records into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps its
# own row labels, so the combined frame would contain duplicate index labels
# (and those duplicates would be written out as the CSV's index column).
master_validata = pd.concat([validata_de, validata_ch], ignore_index=True)

In [20]:
# Write the combined validation set to the current working directory;
# the row index is included as the first column (pandas default, made explicit here).
master_validata.to_csv('master_validata.csv', index=True)

## Training data

In [21]:
# Read the training data; the first CSV column is consumed as the index and
# then discarded in favour of a fresh 0..n-1 index.
path_train = '../../../01_Data/datasets/land_use_on_coord.csv'
train_data = pd.read_csv(path_train, index_col=0, low_memory=False)
train_data = train_data.reset_index(drop=True)

In [23]:
# Remove the columns that are not carried into the master training set
train_data = train_data.drop(
    ['timing', 'precision', 'id_observer', 'Code_18'],
    axis='columns',
)

In [25]:
# Cast the id and count columns to int.
# Missing total_count values are replaced by 1 first, since NaN cannot be cast to int.
train_data['id_species'] = train_data['id_species'].astype(int)
train_data['total_count'] = train_data['total_count'].fillna(1).astype(int)

In [29]:
# Write the prepared training set to the current working directory;
# the row index is included as the first column (pandas default, made explicit here).
train_data.to_csv('master_train.csv', index=True)