# Validata preparation

In [4]:
import pandas as pd
import geopandas as gpd

## Swiss data

In [2]:
# File locations used by this notebook. The three CSV files share one data
# folder; the EEA reference-grid shapefile is read from a separate location.
data_dir = '/Users/marinasiebold/Library/Mobile Documents/com~apple~CloudDocs/Studium/Bird_Research/01_Data'

# Inputs: the ornitho.ch records and the master bird data table.
validata_path = data_dir + '/validata_ornitho.ch_2023.csv'
master_path = data_dir + '/master_bird_data_selected_species.csv'
eea_shapefile_path = '/Users/marinasiebold/Downloads/eea_v_3035_50_km_eea-ref-grid-europe_p_2018_v01_r00 2/inspire_compatible_grid_50km.shp'

# Output: where the prepared table is written at the end of this section.
target_path = data_dir + '/validata_prepared.csv'

In [5]:
# The ornitho.ch export is semicolon-separated.
ch_data = pd.read_csv(validata_path, sep=';')
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
master_data = pd.read_csv(master_path, low_memory=False)

In [47]:
# Align feature names: lower-case every column label, then rename the
# Swiss-specific atlas code column to the generic 'atlas_code'.
ch_data.columns = ch_data.columns.str.lower()
ch_data = ch_data.rename(columns={'atlas_code_ch': 'atlas_code'})

In [48]:
# Align dtypes: cast the id and count columns to pandas' nullable integer
# dtype ('Int64'), which can hold missing values alongside integers.
for int_column in ('id_species', 'total_count', 'id_observer'):
    ch_data[int_column] = ch_data[int_column].astype('Int64')

In [49]:
# Align date format
def change_dateformat(date):
    """Reorder a 'day.month.year' date string into 'year-month-day'.

    Parameters
    ----------
    date : str or any
        A date whose components are separated by dots, e.g. '05.01.2023'.

    Returns
    -------
    str or any
        The first three dot-separated components joined as
        'year-month-day'. The components are copied verbatim: no
        zero-padding and no calendar validation is applied.
        Values that cannot be converted are returned unchanged:
        non-string values (e.g. NaN for a missing date) and strings with
        fewer than three dot-separated parts (e.g. a date that has already
        been converted because the cell was run twice).
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(date, str):
        return date

    parts = date.split('.')
    # Not in day.month.year form (possibly already converted): leave as is
    # instead of failing with an IndexError.
    if len(parts) < 3:
        return date

    day, month, year = parts[:3]
    return '{}-{}-{}'.format(year, month, day)

# Rewrite every entry of the 'date' column with change_dateformat.
ch_data['date'] = ch_data['date'].apply(change_dateformat)

In [50]:
# Align precisions: translate the German precision labels to English codes.
precisions = {'Exakte Lokalisierung': 'precise',
              'Kilometerquadrat': 'square',
              'Ort': 'place'}
# Series.map() yields NaN for every value that is not a key of the dict.
# Falling back to the existing value keeps unexpected labels visible instead
# of silently discarding them, and makes this cell safe to run more than once
# (a second run would otherwise turn the already translated column into NaN).
ch_data.precision = ch_data.precision.map(precisions).fillna(ch_data.precision)

In [51]:
# Align bird names: look up each record's species id in the master data and
# take the species name stored there. If an id occurs several times in the
# master data, the last occurrence wins.
species_map = {
    species_id: species_name
    for species_id, species_name in zip(master_data['id_species'],
                                        master_data['name_species'])
}
# Where the lookup produces no name, the record keeps its current name.
ch_data['name_species'] = (
    ch_data['id_species']
    .map(species_map)
    .fillna(ch_data['name_species'])
)

In [52]:
# Assign eea grids: tag every observation with the EEA reference grid cell
# that contains it.

# Load the grid polygons and reproject them to EPSG:4326 (lon/lat) so they
# share a coordinate system with the observation coordinates.
eea_grid = gpd.read_file(eea_shapefile_path).to_crs('EPSG:4326')

# Restrict the grid to the area of interest: first the cells picked by the
# lon/lat window, then every cell that intersects the union of those cells.
germany_switzerland_bbox = eea_grid.cx[5.210942:15.669926, 45.614516:55.379499]
eea_grid_filtered = eea_grid[
    eea_grid.intersects(germany_switzerland_bbox.unary_union)
].reset_index(drop=True)

# Build point geometries from the observation coordinates.
geometry = gpd.points_from_xy(ch_data['coord_lon'], ch_data['coord_lat'])
gdf = gpd.GeoDataFrame(ch_data, geometry=geometry, crs='EPSG:4326')

# Left join keeps every observation; points that lie within no cell get
# missing grid attributes. Afterwards expose the cell code as 'eea_grid_id'
# and drop the join helper, the geometry and the unused grid attributes.
ch_data = (
    gpd.sjoin(gdf, eea_grid_filtered, how='left', predicate='within')
    .rename(columns={'cellcode': 'eea_grid_id'})
    .drop(columns=['index_right', 'geometry', 'noforigin', 'eoforigin', 'gid'])
)

In [56]:
# Persist the prepared table. index=True is the pandas default, spelled out
# here because it means the row index is written as the first CSV column.
ch_data.to_csv(target_path, index=True)

## German data