# **Lecospy Data Munging**

## Notation:
Throughout this notebook, variables starting with <code>print(img_)</code> are UAV-based information (data, filepaths, etc) and variables starting with grd_ are related to data collected from the ground.

Also, some other naming conventions for variables with data transformations:
* `robust` in a variable name refers to data treated by center according to the median and scaling by teh inter-quartile range (a la sklearns RobustScaler)
* `minmax` (and its ilk) are min-max scaled data, i.e. scaled to the interval [0,1] by subtracting the minimum and dividing by the range.
* `standard(ized)` refers to data treated with with the z-score transform by centring using the mean and scaling y the standard deviation (like sklearns StandardScaler)
* `corrected` means that a linear transformation has been applied to account for differences in sensor calibration.
* `raw` refers to having no transformations applied
* `clipped` means that outliers have been clipped to the upper and lower fence values based on the Inter-Quartile Range method. 
* `imputed` means that outliers have been removed and imputed
* `dropped` means that dataframe rows containing outliers have been removed

Example: `img_robust_indices` refers to vegetation indices from the UAV images treated with the robust scaler. 

### Setting Working Directory

In [2]:
"""
Sets working directory as "../lecoscopy/"
"""
import os
os.chdir('../')
print(os.getcwd())

import spyndex as spy
from scipy.interpolate import CubicSpline 
import numpy as np
from scipy.stats.mstats import winsorize

/Users/kalyankhatiwada/lecospy


### Defining Data Locations

In [3]:
import pandas as pd
grd_speclib = pd.read_csv("Data/C_001_SC3_Cleaned_SpectralLib.csv")
grd_speclib.dropna(subset = ["Functional_group1"], inplace=True)
print(len(grd_speclib))
grd_speclib.head()




1343


  grd_speclib = pd.read_csv("Data/C_001_SC3_Cleaned_SpectralLib.csv")


Unnamed: 0.1,Unnamed: 0,ScanID,Area,Code_name,Species_name,Functional_group1,Functional_group2,Species_name_Freq,Functional_group1_Freq,Functional_group2_Freq,...,Radiometric.Calibration,Units,Latitude,Longitude,Altitude,GPS.Time,Satellites,Calibrated.Reference.Correction.File,Channels,ScanNum
0,1,aleoch_Murph_061,Murphy,aleoch,Alectoria ochroleuca,Lichen,LightTerrestrialMacrolichen,6.0,453.0,118.0,...,,,,,,,,,,
1,2,aleoch_Murph_063,Murphy,aleoch,Alectoria ochroleuca,Lichen,LightTerrestrialMacrolichen,6.0,453.0,118.0,...,,,,,,,,,,
2,3,aleoch_Murph_064,Murphy,aleoch,Alectoria ochroleuca,Lichen,LightTerrestrialMacrolichen,6.0,453.0,118.0,...,,,,,,,,,,
3,4,aleoch_Murph_065,Murphy,aleoch,Alectoria ochroleuca,Lichen,LightTerrestrialMacrolichen,6.0,453.0,118.0,...,,,,,,,,,,
4,5,aleoch_Murph_066,Murphy,aleoch,Alectoria ochroleuca,Lichen,LightTerrestrialMacrolichen,6.0,453.0,118.0,...,,,,,,,,,,


In [4]:
img_speclib = pd.read_csv("Data/PFT_Image_SpectralLib_Clean.csv", header=0, low_memory=False)
img_speclib.head()


Unnamed: 0.1,Unnamed: 0,UID,ScanNum,sample_name,PFT,FncGrp1,X398,X399,X400,X401,...,X990,X991,X992,X993,X994,X995,X996,X997,X998,X999
0,1,BisonGulchPFTsBetula1,1,spec_1,Betula,TreeBroadleaf,0.05243,0.045161,0.039098,0.034829,...,0.563683,0.571786,0.56324,0.54851,0.538068,0.540019,0.556112,0.587042,0.633502,0.696187
1,2,BisonGulchPFTsBetula1,1,spec_2,Betula,TreeBroadleaf,0.032806,0.032797,0.03279,0.032783,...,0.465257,0.465524,0.465757,0.46596,0.466138,0.466296,0.46644,0.466572,0.466699,0.466825
2,3,BisonGulchPFTsBetula1,1,spec_3,Betula,TreeBroadleaf,0.024152,0.024453,0.024753,0.025051,...,0.471305,0.470406,0.469606,0.468903,0.468295,0.467775,0.46733,0.466943,0.4666,0.466283
3,4,BisonGulchPFTsBetula1,1,spec_4,Betula,TreeBroadleaf,0.030132,0.03042,0.030709,0.030979,...,0.428292,0.431782,0.438075,0.447661,0.461028,0.478373,0.499107,0.52251,0.547862,0.574442
4,5,BisonGulchPFTsBetula1,1,spec_5,Betula,TreeBroadleaf,0.027987,0.028189,0.028389,0.028585,...,0.434414,0.435332,0.436237,0.437123,0.437982,0.43881,0.439612,0.440393,0.441159,0.441917


In [5]:
grd_bands = grd_speclib.drop(columns=['Unnamed: 0',
        'ScanID',
        'Area',
        'Code_name',
        'Species_name',
        'Functional_group1',
        'Functional_group2',
        'Species_name_Freq',
        'Functional_group1_Freq',
        'Functional_group2_Freq',
        'Genus',
        'Version',
        'File.Name',
        'Instrument',
        'Detectors',
        'Measurement',
        'Date',
        'Time',
        'Battery.Voltage',
        'Averages',
        'Integration1',
        'Integration2',
        'Integration3',
        'Dark.Mode',
        'Foreoptic',
        'Radiometric.Calibration',
        'Units',
        'Latitude',
        'Longitude',
        'Altitude',
        'GPS.Time',
        'Satellites',
        'Calibrated.Reference.Correction.File',
        'Channels',
        'ScanNum'])

img_bands = img_speclib.drop(columns=[
        "Unnamed: 0",
    	"UID",
        "ScanNum",
    	"sample_name",
    	"PFT",
    	"FncGrp1"])
img_bands.head()

Unnamed: 0,X398,X399,X400,X401,X402,X403,X404,X405,X406,X407,...,X990,X991,X992,X993,X994,X995,X996,X997,X998,X999
0,0.05243,0.045161,0.039098,0.034829,0.032859,0.032877,0.034097,0.035726,0.037113,0.038184,...,0.563683,0.571786,0.56324,0.54851,0.538068,0.540019,0.556112,0.587042,0.633502,0.696187
1,0.032806,0.032797,0.03279,0.032783,0.032776,0.032769,0.032762,0.032755,0.032747,0.03274,...,0.465257,0.465524,0.465757,0.46596,0.466138,0.466296,0.46644,0.466572,0.466699,0.466825
2,0.024152,0.024453,0.024753,0.025051,0.025347,0.02564,0.02593,0.026219,0.026505,0.02679,...,0.471305,0.470406,0.469606,0.468903,0.468295,0.467775,0.46733,0.466943,0.4666,0.466283
3,0.030132,0.03042,0.030709,0.030979,0.031215,0.031402,0.031534,0.031601,0.031596,0.031526,...,0.428292,0.431782,0.438075,0.447661,0.461028,0.478373,0.499107,0.52251,0.547862,0.574442
4,0.027987,0.028189,0.028389,0.028585,0.028777,0.028963,0.029143,0.029315,0.02948,0.029636,...,0.434414,0.435332,0.436237,0.437123,0.437982,0.43881,0.439612,0.440393,0.441159,0.441917


### Getting vegetation indices

In [6]:
indices_to_calculate = ["NDVI","GNDVI", "AVI", "BNDVI", "CVI", "DVI", "DVIplus", "ExGR", "FCVI", "GARI", "GBNDVI", "GOSAVI", "GRNDVI", "GRVI", "GSAVI", "IPVI", "MGRVI", "MNLI", "MRBVI", "MSAVI", "MTVI1", "MTVI2", "NDVI705", "NDREI", "NDDI", "NDGI", "ND705", "MTCI", "MSR705", "MSR", "MCARIOSAVI705", "MCARIOSAVI", "MCARI705", "MCARI1" ,"MCARI2", "MCARI", "IRECI", "IKAW", "GM1", "GM2", "GLI", "GEMI", "GCC", "ExR", "ExG", "ExGR", "CIG", "CIRE", "BCC", "MGRVI", "MNLI", "MRBVI", "MSAVI", "MSR", "MSR705", "MTCI", "MTVI1", "MTVI2", "ND705", "NDDI", "NDGI", "NDREI", "NDVI705", "NDYI", "NGRDI", "NIRv", "NLI", "NormG", "NormNIR", "NormR", "OSAVI", "PSRI", "RCC", "RDVI", "REDSI", "RENDVI", "RGBVI", "RGRI", "RI", "RVI", "S2REP", "SARVI", "SAVI", "SI", "SIPI", "SR", "SR2", "SR555" , "SR705", "TCARI", "TCARIOSAVI", "TCARIOSAVI705", "TCI", "TDVI", "TGI", "TRRVI", "TVI", "TriVI", "VARI", "VARI700", "VI700", "VIG", "mND705", "mSR705"]
grd_indices = spy.computeIndex(
    index = indices_to_calculate,
    params = {
        "N": grd_bands["890"],
        "R": grd_bands["685"],
        "A" : grd_bands["550"],
        "G": grd_bands["540"],
        "G" : grd_bands["500"],
        "L" : 0.5,
        "RE1": grd_bands["705"],
        "RE2" : grd_bands["750"],
        "RE3": grd_bands["758"],
        "B" : grd_bands["480"],
        "lambdaN" : grd_bands["900"],
        "lambdaR" : grd_bands["650"],
        "lambdaG" : grd_bands["560"]
    }
)
img_indices = spy.computeIndex(
    index = indices_to_calculate,
    params = {
        "N": img_bands["X890"],
        "R": img_bands["X685"],
        "A" : img_bands["X550"],
        "G": img_bands["X540"],
        "G" : img_bands["X500"],
        "L" : 0.5,
        "RE1": img_bands["X705"],
        "RE2" : img_bands["X750"],
        "RE3": img_bands["X758"],
        "B" : img_bands["X480"],
        "lambdaN" : img_bands["X900"],
        "lambdaR" : img_bands["X650"],
        "lambdaG" : img_bands["X560"]
    }
)
grd_indices.head()

Unnamed: 0,NDVI,GNDVI,AVI,BNDVI,CVI,DVI,DVIplus,ExGR,FCVI,GARI,...,TGI,TRRVI,TVI,TriVI,VARI,VARI700,VI700,VIG,mND705,mSR705
0,0.239575,0.394829,,0.407701,3.258889,9.9765,-2.399951,-13.6817,13.179367,0.492114,...,-141.9245,0.14463,0.859985,413.186,-0.286623,-0.009905,0.092524,-0.171474,0.128388,0.269826
1,0.269978,0.355078,,0.360375,2.537779,8.7445,-1.586315,-7.49678,10.139967,0.667107,...,-64.1015,0.165153,0.877484,443.306,-0.170349,0.065008,0.100022,-0.094123,0.170055,0.239739
2,0.198892,0.294267,,0.304904,2.247387,10.5446,-2.739754,-13.7841,13.282567,0.540784,...,-112.72,0.147609,0.835998,476.404,-0.180567,0.041694,0.092201,-0.101304,0.133925,0.207331
3,0.203858,0.279593,,0.290261,2.086426,11.452,-2.835785,-12.92146,13.814267,0.594159,...,-90.1515,0.151501,0.838962,554.124,-0.145834,0.059025,0.091566,-0.080313,0.145703,0.197409
4,0.205305,0.3087,,0.322026,2.362926,9.8383,-2.494809,-12.83943,12.5106,0.526088,...,-105.816,0.151956,0.839824,438.858,-0.194293,0.032037,0.092289,-0.110391,0.141724,0.221882


In [7]:
grd_indices.to_csv("Data/training/grd_indices.csv")
img_indices.to_csv("Data/training/img_indices.csv")

In [7]:
band_columns = [str(x) for x in range(350, 999)] # list comprehension
bands = [x for x in range(350, 999)] # x values
target_bands = [x for x in range(400, 1000, 5)] # 5nm bands
resampled_grd_bands = []

for _row in grd_speclib[band_columns].itertuples():
    row_arr = np.array(_row[1::]).flatten()
    cs = CubicSpline(bands,row_arr)
    resampled_grd_bands.append(cs(target_bands).tolist())

grd_resampled_bands = pd.DataFrame(resampled_grd_bands, columns=target_bands)
grd_resampled_bands.head()

Unnamed: 0,400,405,410,415,420,425,430,435,440,445,...,950,955,960,965,970,975,980,985,990,995
0,4.9184,4.5396,4.3898,4.1822,4.1154,4.422,4.8195,5.3133,5.937,6.8407,...,26.6371,26.7633,26.7391,26.8663,27.0029,27.1386,27.2932,27.4793,27.6272,27.737
1,4.9209,4.6387,4.5401,4.5383,4.4149,4.6881,5.187,5.5872,6.0946,6.8358,...,21.4864,21.5658,21.6711,21.763,21.9079,21.9533,22.0993,22.2341,22.3594,22.4753
2,6.0054,5.5654,5.3434,5.1645,4.9476,5.3684,6.0969,6.9702,8.1729,9.8383,...,31.8833,31.9068,32.0151,32.0373,32.1752,32.1059,32.2483,32.3389,32.4257,32.5085
3,6.4293,5.8117,5.7051,5.4147,5.1401,5.6483,6.5792,7.6253,9.132,11.0642,...,34.1997,34.1551,34.2741,34.1455,34.2434,34.2481,34.2911,34.317,34.3678,34.4435
4,5.2112,4.7927,4.6113,4.4532,4.352,4.7139,5.2538,6.0476,7.0947,8.5596,...,29.2012,29.2736,29.2549,29.3068,29.3666,29.4499,29.6288,29.7972,29.9326,30.0348


In [11]:
resampled_img_bands = []
img_speclib1 = img_bands
#img_speclib1.columns = range(398, 1000)
img_speclib1.head()

Unnamed: 0,398,399,400,401,402,403,404,405,406,407,...,990,991,992,993,994,995,996,997,998,999
0,0.05243,0.045161,0.039098,0.034829,0.032859,0.032877,0.034097,0.035726,0.037113,0.038184,...,0.563683,0.571786,0.56324,0.54851,0.538068,0.540019,0.556112,0.587042,0.633502,0.696187
1,0.032806,0.032797,0.03279,0.032783,0.032776,0.032769,0.032762,0.032755,0.032747,0.03274,...,0.465257,0.465524,0.465757,0.46596,0.466138,0.466296,0.46644,0.466572,0.466699,0.466825
2,0.024152,0.024453,0.024753,0.025051,0.025347,0.02564,0.02593,0.026219,0.026505,0.02679,...,0.471305,0.470406,0.469606,0.468903,0.468295,0.467775,0.46733,0.466943,0.4666,0.466283
3,0.030132,0.03042,0.030709,0.030979,0.031215,0.031402,0.031534,0.031601,0.031596,0.031526,...,0.428292,0.431782,0.438075,0.447661,0.461028,0.478373,0.499107,0.52251,0.547862,0.574442
4,0.027987,0.028189,0.028389,0.028585,0.028777,0.028963,0.029143,0.029315,0.02948,0.029636,...,0.434414,0.435332,0.436237,0.437123,0.437982,0.43881,0.439612,0.440393,0.441159,0.441917


In [15]:
band_columns = [x for x in range(400, 999)] # list comprehension
bands = [x for x in range(400, 999)] # x values
target_bands = [x for x in range(400, 1000, 5)] # 5nm bands

for _row in img_speclib1[band_columns].itertuples():
    row_arr = np.array(_row[1::]).flatten()
    cs = CubicSpline(bands,row_arr)
    resampled_img_bands.append(cs(target_bands).tolist())

img_resampled_bands = pd.DataFrame(resampled_img_bands, columns=target_bands)
img_resampled_bands.head()

Unnamed: 0,400,405,410,415,420,425,430,435,440,445,...,950,955,960,965,970,975,980,985,990,995
0,0.039098,0.035726,0.040092,0.036459,0.034616,0.039273,0.035749,0.035302,0.036827,0.036782,...,0.50099,0.361338,0.462064,0.406727,0.344254,0.550585,0.479384,0.361945,0.563683,0.540019
1,0.03279,0.032755,0.032721,0.032778,0.033127,0.033809,0.034728,0.03573,0.036671,0.037499,...,0.465692,0.459482,0.454654,0.452372,0.453671,0.457685,0.46121,0.46348,0.465257,0.466296
2,0.024753,0.026219,0.027641,0.029077,0.030583,0.032145,0.033686,0.035142,0.036462,0.037619,...,0.505757,0.504622,0.502134,0.499312,0.495986,0.491211,0.484559,0.477221,0.471305,0.467775
3,0.030709,0.031601,0.030981,0.029494,0.029388,0.031033,0.032903,0.034412,0.035367,0.036003,...,0.555945,0.499566,0.440558,0.428957,0.462049,0.494247,0.476952,0.441084,0.428292,0.478373
4,0.028389,0.029315,0.030057,0.030637,0.031129,0.031581,0.032029,0.032464,0.032864,0.033217,...,0.454485,0.442733,0.433837,0.428265,0.425781,0.425558,0.426925,0.430018,0.434414,0.43881


In [16]:
grd_raw_with_na = pd.concat([grd_resampled_bands, grd_indices], axis = 1)
grd_raw_with_na.head()

img_raw_with_na = pd.concat([img_resampled_bands, img_indices], axis = 1)
img_raw_with_na.head()

Unnamed: 0,400,405,410,415,420,425,430,435,440,445,...,TGI,TRRVI,TVI,TriVI,VARI,VARI700,VI700,VIG,mND705,mSR705
0,0.039098,0.035726,0.040092,0.036459,0.034616,0.039273,0.035749,0.035302,0.036827,0.036782,...,-0.147727,0.435419,1.143895,28.27932,-0.192996,0.52732,0.49418,-0.1161,0.573968,0.638023
1,0.03279,0.032755,0.032721,0.032778,0.033127,0.033809,0.034728,0.03573,0.036671,0.037499,...,-0.336272,0.421652,1.125897,26.540309,-0.278294,0.405152,0.430285,-0.180129,0.555922,0.630714
2,0.024753,0.026219,0.027641,0.029077,0.030583,0.032145,0.033686,0.035142,0.036462,0.037619,...,-0.561041,0.411871,1.112667,25.661079,-0.336439,0.338056,0.396572,-0.22645,0.533042,0.630924
3,0.030709,0.031601,0.030981,0.029494,0.029388,0.031033,0.032903,0.034412,0.035367,0.036003,...,-0.286673,0.438558,1.144335,28.331222,-0.227617,0.506787,0.479625,-0.136571,0.605303,0.64496
4,0.028389,0.029315,0.030057,0.030637,0.031129,0.031581,0.032029,0.032464,0.032864,0.033217,...,-0.320774,0.419143,1.132595,27.967358,-0.307786,0.37259,0.424347,-0.208547,0.557785,0.657176


In [18]:
grd_raw_with_na.replace([np.inf, -np.inf], np.nan, inplace=True)
grd_raw = grd_raw_with_na.fillna(method="ffill")
grd_raw.head()

img_raw_with_na.replace([np.inf, -np.inf], np.nan, inplace=True)
img_raw = img_raw_with_na.fillna(method="ffill")
img_raw.head()

Unnamed: 0,400,405,410,415,420,425,430,435,440,445,...,TGI,TRRVI,TVI,TriVI,VARI,VARI700,VI700,VIG,mND705,mSR705
0,0.039098,0.035726,0.040092,0.036459,0.034616,0.039273,0.035749,0.035302,0.036827,0.036782,...,-0.147727,0.435419,1.143895,28.27932,-0.192996,0.52732,0.49418,-0.1161,0.573968,0.638023
1,0.03279,0.032755,0.032721,0.032778,0.033127,0.033809,0.034728,0.03573,0.036671,0.037499,...,-0.336272,0.421652,1.125897,26.540309,-0.278294,0.405152,0.430285,-0.180129,0.555922,0.630714
2,0.024753,0.026219,0.027641,0.029077,0.030583,0.032145,0.033686,0.035142,0.036462,0.037619,...,-0.561041,0.411871,1.112667,25.661079,-0.336439,0.338056,0.396572,-0.22645,0.533042,0.630924
3,0.030709,0.031601,0.030981,0.029494,0.029388,0.031033,0.032903,0.034412,0.035367,0.036003,...,-0.286673,0.438558,1.144335,28.331222,-0.227617,0.506787,0.479625,-0.136571,0.605303,0.64496
4,0.028389,0.029315,0.030057,0.030637,0.031129,0.031581,0.032029,0.032464,0.032864,0.033217,...,-0.320774,0.419143,1.132595,27.967358,-0.307786,0.37259,0.424347,-0.208547,0.557785,0.657176
