# Generate Statistics and TIFFs for Site Location Analysis

In [1]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from data_extract_utils import sort_key

In [2]:
# Notebook-wide settings: the code alphabet, the group-code prefix, the
# sample file locations, and the random seed.

# Upper-case letters A..Z, as a Python list and as a NumPy array; the array
# form is what the random group codes are drawn from.
alphabet = [chr(code_point) for code_point in range(ord('A'), ord('Z') + 1)]
np_alphabet = np.asarray(alphabet)

# Every generated group code starts with this prefix
group_prefix = "EU_"

# Samples are read from the "_v4" file and written to the unversioned file
input_samples = os.path.join(common.DATASETS_DIR, 'europe_samples_365days_v4.csv')
output_samples = os.path.join(common.DATASETS_DIR, 'europe_samples_365days.csv')

# Seed NumPy's global random generator so the random group codes drawn
# later are the same on every run.
seed = 9876
np.random.seed(seed)

In [3]:
# Load the samples, using the first CSV column as the index
all_samples = pd.read_csv(input_samples, index_col=0)

# Build the site table: the distinct (Site, Longitude, Latitude, Czone3)
# combinations found in the samples, indexed by site.
all_sites = (
    all_samples[['Site', 'Longitude', 'Latitude', 'Czone3']]
    .drop_duplicates()
    .set_index('Site')
)

In [4]:
# Group1: bin the sites into lat_factor x lon_factor degree cells and label
# every occupied cell with a random code.
lat_factor = lon_factor = 1
sites = all_sites.copy()
# Snap both coordinates down to a multiple of the cell size
sites['Latitude'] = np.floor(all_sites['Latitude'] / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites['Longitude'] / lon_factor) * lon_factor
# One row per occupied cell
groups = sites.groupby(['Latitude', 'Longitude'], as_index=False).size()

# Draw two random letters per cell and prepend the prefix. The letters are
# drawn independently, so two cells could receive the same code; the assert
# stops the notebook if that happens with the current seed.
np_codes = np.random.choice(np_alphabet, size=(len(groups), 2))
codes = [group_prefix + "".join(letters) for letters in np_codes]
assert len(set(codes)) == len(codes)
groups['Group1'] = codes

# Attach each site's cell code (the merge uses the columns the two frames
# share) and copy it onto the site table.
sites = sites.reset_index().merge(groups, how='left').set_index('Site')
all_sites = all_sites.join(sites['Group1'])

In [5]:
# Group2: bin the sites into lat_factor x lon_factor degree cells, split each
# cell further by Czone3, and label every occupied (cell, Czone3) combination
# with a random code.
lat_factor = lon_factor = 2
sites = all_sites.copy()
# NOTE(review): latitude is rounded up (ceil) while longitude is rounded down
# (floor), unlike the Group1 cell which floors both - confirm this is intended.
sites['Latitude'] = np.ceil(all_sites['Latitude'] / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites['Longitude'] / lon_factor) * lon_factor
# One row per occupied (cell, Czone3) combination
groups = sites.groupby(['Latitude', 'Longitude', 'Czone3'], as_index=False).size()

# Draw two random letters per group and prepend the prefix. The letters are
# drawn independently, so two groups could receive the same code; the assert
# stops the notebook if that happens with the current seed.
np_codes = np.random.choice(np_alphabet, size=(len(groups), 2))
codes = [group_prefix + "".join(letters) for letters in np_codes]
assert len(set(codes)) == len(codes)
groups['Group2'] = codes

# Attach each site's group code (the merge uses the columns the two frames
# share) and copy it onto the site table.
sites = sites.reset_index().merge(groups, how='left').set_index('Site')
all_sites = all_sites.join(sites['Group2'])

In [6]:
# Compare the Group1 codes stored in the input samples with the newly
# generated ones. First take one row per site from the samples.
old_sites = (
    all_samples[['Latitude', 'Longitude', 'Czone3', 'Site', 'Group1']]
    .drop_duplicates(subset='Site')
    .set_index('Site')
)
# Number of sites per old Group1 code, smallest first. This is not the last
# expression of the cell, so its result is not rendered.
old_sites.groupby('Group1').size().sort_values()
# One example site for each distinct (old code, new code) pair; after the
# merge, Group1_x is the code from the samples and Group1_y the new code.
(
    old_sites
    .merge(all_sites[['Group1']], left_index=True, right_index=True)
    .sort_values('Group1_x')
    .drop_duplicates(subset=['Group1_x', 'Group1_y'])
)

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group1_x,Group1_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C2_56,42.70208,-7.1581,Csb,BL,EU_VD
C9_8,44.43125,3.94733,Cfb,BP,EU_QP
C2_3,38.20625,-2.30925,BSk,CY,EU_WF
C9_19,43.21875,6.36654,Csb,EI,EU_VI
C9_35,43.83542,7.34463,Csa,EM,EU_ZU
C9_31,41.65208,9.32656,Csa,GO,EU_LM
C9_29,43.88542,4.4255,Csa,HN,EU_UN
C2_13,41.31042,-1.43947,Cfb,HO,EU_GA
C9_7,42.44375,8.85065,Csa,HQ,EU_GM
C2_52,43.50208,-7.51649,Cfb,IA,EU_WI


In [7]:
# Compare the Group2 codes stored in the input samples with the newly
# generated ones. First take one row per site from the samples.
old_sites = (
    all_samples[['Latitude', 'Longitude', 'Czone3', 'Site', 'Group2']]
    .drop_duplicates(subset='Site')
    .set_index('Site')
)
# Number of sites per old Group2 code, smallest first. This is not the last
# expression of the cell, so its result is not rendered.
old_sites.groupby('Group2').size().sort_values()
# One example site for each distinct (old code, new code) pair; after the
# merge, Group2_x is the code from the samples and Group2_y the new code.
(
    old_sites
    .merge(all_sites[['Group2']], left_index=True, right_index=True)
    .sort_values('Group2_x')
    .drop_duplicates(subset=['Group2_x', 'Group2_y'])
)

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group2_x,Group2_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C9_23,43.23958,5.80248,Csa,AS,EU_AY
C2_44,40.50625,-3.00032,BSk,BJ,EU_UV
C2_50,37.73125,-3.5008,Csa,BO,EU_VA
C2_13,41.31042,-1.43947,Cfb,DO,EU_YZ
C2_52,43.50208,-7.51649,Cfb,EC,EU_YP
C2_19,40.77292,-3.71108,Csa,EU,EU_DO
C2_69,39.34792,-4.4802,BSk,FI,EU_LK
C15_1,41.34375,1.05726,Csb,FK,EU_IR
C2_35,40.86875,-2.22326,Csb,FT,EU_FZ
C9_22,44.34375,4.10456,Csb,HZ,EU_FY


In [8]:
# Replace the Group1/Group2 columns that came with the input samples by the
# newly generated per-site codes (looked up through each sample's Site), and
# keep only the columns listed below, in this order.
all_samples = (
    all_samples
    .drop(columns=['Group1', 'Group2'])
    .join(all_sites[['Group1', 'Group2']], on='Site')
)[[
    'Latitude', 'Longitude', 'Sampling date', 'Sampling year', 'LC Category', 'Land Cover', 'LFMC value', 'Site',
    'Czone1', 'Czone2', 'Czone3',
    'Group1', 'Group2',
    'Day_sin', 'Day_cos',
    'Long_sin', 'Long_cos', 'Lat_norm', 'Elevation', 'Slope', 'Aspect_sin', 'Aspect_cos',
]]

# Order the rows by sample ID using the project's sort_key helper, then
# restore ID as the index. (Assumes the index read from the CSV is named
# 'ID' - verify against the input file.)
all_samples = (
    all_samples
    .reset_index()
    .sort_values('ID', key=lambda ids: ids.apply(sort_key))
    .set_index('ID')
)
all_samples

Unnamed: 0_level_0,Latitude,Longitude,Sampling date,Sampling year,Land Cover,LFMC value,Site,Czone1,Czone2,Czone3,...,Group2,Day_sin,Day_cos,Long_sin,Long_cos,Lat_norm,Elevation,Slope,Aspect_sin,Aspect_cos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C2_1_1,38.92292,-1.71649,10/04/2006,2006,"Mosaic natural vegetation (tree, shrub, herbac...",74.33500,C2_1,B,BS,BSk,...,EU_ZQ,-0.99111,0.13301,-0.02995,0.99955,0.71624,0.15258,0.16544,0.94004,0.34108
C2_1_3,38.92292,-1.71649,25/07/2006,2006,"Mosaic natural vegetation (tree, shrub, herbac...",87.42500,C2_1,B,BS,BSk,...,EU_ZQ,0.37771,0.92592,-0.02995,0.99955,0.71624,0.15258,0.16544,0.94004,0.34108
C2_2_1,38.30625,-2.15313,11/04/2006,2006,"Tree cover, needleleaved, evergreen, closed to...",86.10800,C2_2,B,BS,BSk,...,EU_AA,-0.98868,0.15006,-0.03757,0.99929,0.71281,0.16236,0.15632,0.22862,0.97352
C2_2_10,38.30625,-2.15313,25/07/2006,2006,"Tree cover, needleleaved, evergreen, closed to...",60.51667,C2_2,B,BS,BSk,...,EU_AA,0.37771,0.92592,-0.03757,0.99929,0.71281,0.16236,0.15632,0.22862,0.97352
C2_3_1,38.20625,-2.30925,11/04/2006,2006,Mosaic tree and shrub (>50%) / herbaceous cove...,110.14500,C2_3,B,BS,BSk,...,EU_AA,-0.98868,0.15006,-0.04029,0.99919,0.71226,0.19286,0.06576,-0.99408,0.10867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C15_1_13,41.34375,1.05726,27/06/2017,2017,"Tree cover, broadleaved, deciduous, closed to ...",106.92565,C15_1,C,Cs,Csb,...,EU_IR,-0.09454,0.99552,0.01845,0.99983,0.72969,0.12533,0.20084,-0.57996,0.81464
C15_1_19,41.34375,1.05726,26/07/2017,2017,"Tree cover, broadleaved, deciduous, closed to ...",84.77083,C15_1,C,Cs,Csb,...,EU_IR,0.39359,0.91929,0.01845,0.99983,0.72969,0.12533,0.20084,-0.57996,0.81464
C15_1_25,41.34375,1.05726,9/08/2017,2017,"Tree cover, broadleaved, deciduous, closed to ...",79.33530,C15_1,C,Cs,Csb,...,EU_IR,0.60162,0.79878,0.01845,0.99983,0.72969,0.12533,0.20084,-0.57996,0.81464
C15_1_31,41.34375,1.05726,5/09/2017,2017,"Tree cover, broadleaved, deciduous, closed to ...",69.21358,C15_1,C,Cs,Csb,...,EU_IR,0.89584,0.44438,0.01845,0.99983,0.72969,0.12533,0.20084,-0.57996,0.81464


In [9]:
# Write the regrouped, re-sorted samples (index included) to the output file
all_samples.to_csv(path_or_buf=output_samples)