# Generate Statistics and TIFFs for Site Location Analysis

In [1]:
import os
import json
import numpy as np
import pandas as pd

import initialise
import common
from data_extract_utils import sort_key

In [2]:
alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
np_alphabet = np.array(alphabet)

# Group columns prefix
group_prefix = "AU_"

# Samples files
input_samples = os.path.join(common.DATASETS_DIR, 'australia_samples_365days_v3.csv')
output_samples = os.path.join(common.DATASETS_DIR, 'australia_samples_365days.csv')

#random seed
seed = 1234
np.random.seed(seed)

In [3]:
all_samples = pd.read_csv(input_samples, index_col=0)
all_sites = all_samples[['Site', 'Longitude', 'Latitude', 'Czone3']].drop_duplicates().set_index('Site')

In [4]:
sites = all_sites.copy()
lat_factor = 1
lon_factor = 1
sites['Latitude'] = np.floor(all_sites.Latitude / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites.Longitude / lon_factor) * lon_factor
groups = sites.groupby(['Latitude', 'Longitude'], as_index=False).size()

np_codes = np.random.choice(np_alphabet, [len(groups), 2])
codes = ["".join([group_prefix] + list(np_codes[i])) for i in range(len(np_codes))]
assert (len(codes) == len(set(codes)))
groups['Group1'] = codes
sites = sites.reset_index().merge(groups, how='left').set_index('Site')
all_sites = all_sites.join(sites.Group1)

In [5]:
sites = all_sites.copy()
lat_factor = 2
lon_factor = 2
sites['Latitude'] = np.ceil(all_sites.Latitude / lat_factor) * lat_factor
sites['Longitude'] = np.floor(all_sites.Longitude / lon_factor) * lon_factor
groups = sites.groupby(['Latitude', 'Longitude', 'Czone3'], as_index=False).size()

np_codes = np.random.choice(np_alphabet, [len(groups), 2])
codes = ["".join([group_prefix] + list(np_codes[i])) for i in range(len(np_codes))]
assert (len(codes) == len(set(codes)))
groups['Group2'] = codes
sites = sites.reset_index().merge(groups, how='left').set_index('Site')
all_sites = all_sites.join(sites.Group2)

In [6]:
old_sites = all_samples[['Latitude', 'Longitude', 'Czone3', 'Site', 'Group1']].drop_duplicates('Site').set_index('Site')
old_sites.groupby('Group1').size().sort_values()
old_sites.merge(all_sites[['Group1']], left_index=True, right_index=True).sort_values('Group1_x').drop_duplicates(['Group1_x', 'Group1_y'])

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group1_x,Group1_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C11_11,-37.47708,145.23275,Cfb,CC,AU_YZ
C10_7,-42.84792,147.48628,Cfb,CT,AU_PT
C10_17,-17.06458,125.26265,BSh,EF,AU_QD
C10_4,-38.22708,145.56676,Cfb,FS,AU_GV
C18_3,-35.60625,148.8631,Cfb,KF,AU_QJ
C10_5,-33.68125,117.61153,Csb,KX,AU_DH
C11_6,-36.26875,146.64838,Csb,LZ,AU_LM
C10_11,-35.26875,150.40931,Cfb,MG,AU_SQ
C10_14,-35.23958,141.22344,BSk,MI,AU_QF
C18_1,-35.27708,149.05183,Cfb,NY,AU_WP


In [7]:
old_sites = all_samples[['Latitude', 'Longitude', 'Czone3', 'Site', 'Group2']].drop_duplicates('Site').set_index('Site')
old_sites.groupby('Group2').size().sort_values()
old_sites.merge(all_sites[['Group2']], left_index=True, right_index=True).sort_values('Group2_x').drop_duplicates(['Group2_x', 'Group2_y'])

Unnamed: 0_level_0,Latitude,Longitude,Czone3,Group2_x,Group2_y
Site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C10_21,-35.20625,149.02369,Cfb,AK,AU_AM
C11_17,-33.74375,150.38789,Cfb,CR,AU_NQ
C10_14,-35.23958,141.22344,BSk,EJ,AU_NU
C10_13,-17.03125,125.11829,Aw,FD,AU_YM
C11_13,-36.13958,147.33528,Csb,FF,AU_HK
C11_5,-36.13125,146.61286,Csa,JI,AU_OT
C10_4,-38.22708,145.56676,Cfb,KE,AU_LT
C10_19,-34.21875,116.38647,Csb,KG,AU_RX
C10_9,-26.16458,121.56263,BWh,KI,AU_JI
C11_2,-33.65625,150.61524,Cfa,OB,AU_ZU


In [8]:
all_samples = all_samples.drop(columns=['Group1', 'Group2']).join(all_sites[['Group1', 'Group2']], on='Site')[
    ['Latitude', 'Longitude', 'Sampling date', 'Sampling year', 'LC Category', 'Land Cover', 'LFMC value', 'Site',
     'Czone1', 'Czone2', 'Czone3',
     'Group1', 'Group2',
     'Day_sin', 'Day_cos',
     'Long_sin', 'Long_cos', 'Lat_norm', 'Elevation', 'Slope', 'Aspect_sin', 'Aspect_cos']]
all_samples = all_samples.reset_index().sort_values('ID', key=lambda x: x.apply(sort_key)).set_index('ID')
all_samples

Unnamed: 0_level_0,Latitude,Longitude,Sampling date,Sampling year,LC Category,Land Cover,LFMC value,Site,Czone1,Czone2,...,Group2,Day_sin,Day_cos,Long_sin,Long_cos,Lat_norm,Elevation,Slope,Aspect_sin,Aspect_cos
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C10_1_1,-37.63542,144.22103,20/10/2008,2008,Grassland,Grassland,260.57000,C10_1,C,Cf,...,AU_LR,0.94560,-0.32534,0.58466,-0.81128,0.29091,0.08333,0.02654,-0.54559,0.83805
C10_1_2,-37.63542,144.22103,10/11/2008,2008,Grassland,Grassland,162.34000,C10_1,C,Cf,...,AU_LR,0.76941,-0.63875,0.58466,-0.81128,0.29091,0.08333,0.02654,-0.54559,0.83805
C10_1_3,-37.63542,144.22103,1/12/2008,2008,Grassland,Grassland,132.66000,C10_1,C,Cf,...,AU_LR,0.49378,-0.86959,0.58466,-0.81128,0.29091,0.08333,0.02654,-0.54559,0.83805
C10_1_4,-37.63542,144.22103,19/01/2009,2009,Grassland,Grassland,95.81000,C10_1,C,Cf,...,AU_LR,-0.30492,-0.95238,0.58466,-0.81128,0.29091,0.08333,0.02654,-0.54559,0.83805
C10_2_1,-35.40625,149.80151,5/01/2006,2006,Agriculture,Mosaic cropland (>50%) / natural vegetation (t...,63.00000,C10_2,C,Cf,...,AU_AM,-0.06880,-0.99763,0.50300,-0.86429,0.30330,0.11424,0.02920,-0.80444,0.59404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C18_3_22,-35.60625,148.86310,23/12/2015,2015,Forest,"Tree cover, broadleaved, evergreen, closed to ...",163.75463,C18_3,C,Cf,...,AU_AM,0.15431,-0.98802,0.51708,-0.85593,0.30219,0.21129,0.16830,-0.98912,-0.14709
C18_3_25,-35.60625,148.86310,18/01/2016,2016,Forest,"Tree cover, broadleaved, evergreen, closed to ...",126.33867,C18_3,C,Cf,...,AU_AM,-0.28848,-0.95749,0.51708,-0.85593,0.30219,0.21129,0.16830,-0.98912,-0.14709
C18_3_28,-35.60625,148.86310,16/02/2016,2016,Forest,"Tree cover, broadleaved, evergreen, closed to ...",136.38340,C18_3,C,Cf,...,AU_AM,-0.71166,-0.70253,0.51708,-0.85593,0.30219,0.21129,0.16830,-0.98912,-0.14709
C18_3_31,-35.60625,148.86310,2/09/2016,2016,Forest,"Tree cover, broadleaved, evergreen, closed to ...",145.09527,C18_3,C,Cf,...,AU_AM,0.88001,0.47495,0.51708,-0.85593,0.30219,0.21129,0.16830,-0.98912,-0.14709


In [9]:
all_samples.to_csv(output_samples)