# Generate Statistics and TIFFs for Site Location Analysis

In [1]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from data_extract_utils import sort_key

In [2]:
# Upper-case letters A-Z from which the random group codes are drawn
alphabet = [chr(code_point) for code_point in range(ord('A'), ord('Z') + 1)]
np_alphabet = np.asarray(alphabet)

# Prefix prepended to every generated group code
group_prefix = "US_"

# Samples files: the CSV that is read and the CSV that is written, both under common.DATASETS_DIR
input_samples, output_samples = (
    os.path.join(common.DATASETS_DIR, file_name)
    for file_name in ('samples_365days_v4.csv', 'samples_365days.csv')
)

# Seed NumPy's global random generator so the np.random draws below are repeatable
seed = 8642
np.random.seed(seed)

In [3]:
# Read the samples file; its first column becomes the index
all_samples = pd.read_csv(input_samples, index_col=0)

# Distinct (Site, Longitude, Latitude, Czone3) combinations, indexed by Site
all_sites = (
    all_samples
    .loc[:, ['Site', 'Longitude', 'Latitude', 'Czone3']]
    .drop_duplicates()
    .set_index('Site')
)

In [4]:
# Group1: bin the sites into lat_factor x lon_factor degree cells and give every
# occupied cell a random code made of group_prefix followed by 4 letters.
sites = all_sites.copy()
lat_factor = 1
lon_factor = 1
# Replace each coordinate by the lower edge of the cell it falls in
sites['Latitude'] = np.floor(all_sites.Latitude / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites.Longitude / lon_factor) * lon_factor
group_keys = ['Latitude', 'Longitude']
# One row per occupied cell, with a 'size' column holding the number of sites in it
groups = sites.groupby(group_keys, as_index=False).size()

# Draw 4 random letters per group from NumPy's global random generator
np_codes = np.random.choice(np_alphabet, [len(groups), 4])
codes = [group_prefix + "".join(letters) for letters in np_codes]
# The letters are drawn independently, so two groups could receive the same code
assert len(codes) == len(set(codes)), "duplicate group codes were generated"
groups['Group1'] = codes
# Merge on the explicit cell keys (rather than on whatever columns the two frames
# happen to share) and check that each key matches at most one group row.
sites = (
    sites.reset_index()
    .merge(groups, how='left', on=group_keys, validate='many_to_one')
    .set_index('Site')
)
all_sites = all_sites.join(sites.Group1)

In [5]:
# Group2: bin the sites into lat_factor x lon_factor degree cells, split each cell
# by Czone3, and give every occupied (cell, Czone3) combination a random code made
# of group_prefix followed by 4 letters.
sites = all_sites.copy()
lat_factor = 2
lon_factor = 2
# NOTE(review): latitude is rounded up (np.ceil) here while longitude is rounded
# down (np.floor); the Group1 cell rounds both down. Confirm this is intentional.
sites['Latitude'] = np.ceil(all_sites.Latitude / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites.Longitude / lon_factor) * lon_factor
group_keys = ['Latitude', 'Longitude', 'Czone3']
# One row per occupied combination, with a 'size' column holding the number of sites in it
groups = sites.groupby(group_keys, as_index=False).size()

# Draw 4 random letters per group from NumPy's global random generator
np_codes = np.random.choice(np_alphabet, [len(groups), 4])
codes = [group_prefix + "".join(letters) for letters in np_codes]
# The letters are drawn independently, so two groups could receive the same code
assert len(codes) == len(set(codes)), "duplicate group codes were generated"
groups['Group2'] = codes
# Merge on the explicit group keys (rather than on whatever columns the two frames
# happen to share) and check that each key matches at most one group row.
sites = (
    sites.reset_index()
    .merge(groups, how='left', on=group_keys, validate='many_to_one')
    .set_index('Site')
)
all_sites = all_sites.join(sites.Group2)

In [6]:
# Compare the Group1 codes already present in the samples file with the new ones.
# First row per Site from the samples, indexed by Site:
old_sites = (
    all_samples
    .loc[:, ['Latitude', 'Longitude', 'Czone3', 'Site', 'Group1']]
    .drop_duplicates(subset='Site')
    .set_index('Site')
)
# Number of sites per existing Group1 code, smallest first.
# This is not the last expression of the cell, so its value is not displayed.
old_sites.groupby('Group1').size().sort_values()
# Put the existing code (Group1_x) next to the new code (Group1_y) for each site
# and keep one row per distinct (existing, new) pair.
(
    old_sites
    .merge(all_sites[['Group1']], left_index=True, right_index=True)
    .sort_values(by='Group1_x')
    .drop_duplicates(subset=['Group1_x', 'Group1_y'])
)

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group1_x,Group1_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C6_401,31.50625,-99.66036,BSh,ABK,US_PIPI
C6_288,35.19792,-82.61021,Cfa,AET,US_MHWH
C6_654,43.84792,-107.92191,BSk,ALH,US_OZSI
C6_721,32.35208,-100.08819,BSk,ANG,US_AXKQ
C6_691,42.70208,-106.37363,BSk,AOC,US_FJES
...,...,...,...,...,...
C6_166,29.86042,-98.50924,Cfa,ZVO,US_CEFN
C6_137,35.17708,-106.37961,Dfb,ZWE,US_OTEN
C6_154,32.75208,-110.05246,BSk,ZWN,US_HPMR
C6_542,37.57292,-112.79910,Dsc,ZYY,US_HEPH


In [7]:
# Compare the Group2 codes already present in the samples file with the new ones.
# First row per Site from the samples, indexed by Site:
old_sites = (
    all_samples
    .loc[:, ['Latitude', 'Longitude', 'Czone3', 'Site', 'Group2']]
    .drop_duplicates(subset='Site')
    .set_index('Site')
)
# Number of sites per existing Group2 code, smallest first.
# This is not the last expression of the cell, so its value is not displayed.
old_sites.groupby('Group2').size().sort_values()
# Put the existing code (Group2_x) next to the new code (Group2_y) for each site
# and keep one row per distinct (existing, new) pair.
(
    old_sites
    .merge(all_sites[['Group2']], left_index=True, right_index=True)
    .sort_values(by='Group2_x')
    .drop_duplicates(subset=['Group2_x', 'Group2_y'])
)

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group2_x,Group2_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C6_245,39.52292,-105.72907,Dfc,AAM,US_KCCI
C6_580,36.00625,-121.44557,Csb,ACI,US_AFJT
C6_499,33.35208,-105.67709,Cfb,AGQ,US_GRRD
C6_483,36.34375,-115.64848,Dsb,AGU,US_DTKX
C6_482,32.31875,-106.58420,BWk,AVV,US_YSWK
...,...,...,...,...,...
C6_773,41.71875,-121.71145,BSk,ZGB,US_VTYJ
C6_112,36.73542,-105.49946,Dfc,ZIX,US_POWA
C6_194,34.28958,-117.36201,Csa,ZJL,US_EFQQ
C6_752,31.31042,-94.82439,Cfa,ZKA,US_OVKP


In [8]:
# Swap the existing Group1/Group2 columns for the newly generated ones (looked up
# by Site) and keep the columns below, in this order.
all_samples = (
    all_samples
    .drop(columns=['Group1', 'Group2'])
    .join(all_sites[['Group1', 'Group2']], on='Site')
    .loc[:, [
        'Latitude', 'Longitude', 'Sampling date', 'Sampling year', 'Land Cover', 'LFMC value', 'Site',
        'Czone1', 'Czone2', 'Czone3',
        'Group1', 'Group2',
        'Day_sin', 'Day_cos',
        'Long_sin', 'Long_cos', 'Lat_norm', 'Elevation', 'Slope', 'Aspect_sin', 'Aspect_cos',
    ]]
)
# Order the rows by ID, using sort_key on each ID to produce the sort value,
# then restore ID as the index.
all_samples = (
    all_samples
    .reset_index()
    .sort_values(by='ID', key=lambda ids: ids.apply(sort_key))
    .set_index('ID')
)
all_samples

Unnamed: 0_level_0,Latitude,Longitude,Sampling date,Sampling year,Land Cover,LFMC value,Site,Czone1,Czone2,Czone3,...,Group2,Day_sin,Day_cos,Long_sin,Long_cos,Lat_norm,Elevation,Slope,Aspect_sin,Aspect_cos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C4_1_1,40.21458,-112.21868,20/06/2005,2005,Shrubland,156.76300,C4_1,B,BS,BSk,...,US_FQFH,-0.21352,0.97694,-0.92575,-0.37814,0.72341,0.26207,0.01972,-0.03023,0.99954
C4_1_2,40.21458,-112.21868,5/07/2005,2005,Shrubland,128.27700,C4_1,B,BS,BSk,...,US_FQFH,0.04302,0.99907,-0.92575,-0.37814,0.72341,0.26207,0.01972,-0.03023,0.99954
C4_1_3,40.21458,-112.21868,21/07/2005,2005,Shrubland,92.48200,C4_1,B,BS,BSk,...,US_FQFH,0.31311,0.94972,-0.92575,-0.37814,0.72341,0.26207,0.01972,-0.03023,0.99954
C4_1_4,40.21458,-112.21868,8/08/2005,2005,Shrubland,82.09300,C4_1,B,BS,BSk,...,US_FQFH,0.58779,0.80902,-0.92575,-0.37814,0.72341,0.26207,0.01972,-0.03023,0.99954
C4_1_5,40.21458,-112.21868,23/08/2005,2005,Shrubland,78.95300,C4_1,B,BS,BSk,...,US_FQFH,0.77488,0.63210,-0.92575,-0.37814,0.72341,0.26207,0.01972,-0.03023,0.99954
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C13_4_14,46.89791,-113.43535,28/08/2012,2012,"Tree cover, needleleaved, evergreen, closed (>...",102.44207,C13_4,B,BS,BSk,...,US_XAFG,0.83593,0.54884,-0.91751,-0.39771,0.76054,0.21077,0.07241,0.53863,0.84254
C13_4_15,46.89791,-113.43535,4/09/2012,2012,"Tree cover, needleleaved, evergreen, closed (>...",88.76436,C13_4,B,BS,BSk,...,US_XAFG,0.89584,0.44438,-0.91751,-0.39771,0.76054,0.21077,0.07241,0.53863,0.84254
C13_4_16,46.89791,-113.43535,11/09/2012,2012,"Tree cover, needleleaved, evergreen, closed (>...",88.79382,C13_4,B,BS,BSk,...,US_XAFG,0.94276,0.33347,-0.91751,-0.39771,0.76054,0.21077,0.07241,0.53863,0.84254
C13_4_17,46.89791,-113.43535,18/09/2012,2012,"Tree cover, needleleaved, evergreen, closed (>...",81.72345,C13_4,B,BS,BSk,...,US_XAFG,0.97601,0.21772,-0.91751,-0.39771,0.76054,0.21077,0.07241,0.53863,0.84254


In [10]:
# Write the regrouped samples (index included) to the output_samples CSV path
all_samples.to_csv(output_samples)