# Information Points Extractor
## Responsibilities of this file
 

*  This file is responsible for extracting the raster values at the species occurrence points and saving them as NumPy arrays

## Who is Running?

In [None]:
# Make the Google Drive contents reachable from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Project folder inside the mounted Drive; the data paths below are derived from it
project_root = "/content/drive/MyDrive/TFC_MatheusSasso"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Variable Parameters

In [None]:
# Collect for all species (True) or only for the species selected by taxonKey (False)
collect_all = False #@param {type:"boolean"}

# General parameters used in this notebook
# taxonKey is the key used to look up the species name in species_taxon_id_dict
taxonKey=2874484 #@param {type:"integer"}

## Fixed Parameters

In [None]:
# Input locations, all derived from the project root
occurrences_root = f"{project_root}/Data/GBIF_Ocurrences"
base_csv_files_path = f"{project_root}/Data/Standarized_Brazil_Data/TXT_Aux_Files"


# Arguments required to instantiate Raster_Information_Collector
coorection_limit = 10
raster_information_collector_output_dir = f"{project_root}/Data/Rasters_As_Numpy_Arrays"

## Package Downloads

In [None]:
! pip install geopandas --quiet
! pip install rasterio --quiet

## Imports

In [None]:
import os
import gc
import rasterio
import numpy as np
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
import matplotlib.pyplot as plt
from matplotlib import pyplot
from typing import List,Tuple
from sklearn.utils import Bunch

import geopandas as gpd
from rasterio.plot import show
from rasterio.windows import Window
from osgeo import gdal

## Getting Species Name

In [None]:
!wget https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/species_taxon_id_dict.py
from species_taxon_id_dict import *
species_name =  species_taxon_id_dict[taxonKey]
species_name

--2020-12-23 18:39:42--  https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/species_taxon_id_dict.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2147 (2.1K) [text/plain]
Saving to: ‘species_taxon_id_dict.py.2’


2020-12-23 18:39:42 (37.6 MB/s) - ‘species_taxon_id_dict.py.2’ saved [2147/2147]



'Cajanus cajam'

## Retrieving Auxiliary Classes

In [None]:
!wget https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/utils.py
!wget https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/raster_utils.py

--2020-12-23 18:39:42--  https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 799 [text/plain]
Saving to: ‘utils.py.2’


2020-12-23 18:39:42 (49.9 MB/s) - ‘utils.py.2’ saved [799/799]

--2020-12-23 18:39:42--  https://raw.githubusercontent.com/climate-and-health-datasci-Unicamp/permapy/main/utils/raster_utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4622 (4.5K) [text/plain]
Saving to: ‘raster_utils.py.2’


2020-12-23 18:39:43 (48.2 MB/s) - 

In [None]:
# Helper classes coming from the utils.py / raster_utils.py files downloaded in the previous cell
from utils import Utils
from raster_utils import Raster_Utils

# Helper instances that are later handed to Raster_Information_Collector
raster_utils = Raster_Utils()
utils_methods = Utils()

## Creating Information Collector Output Folder

In [None]:
# Prepare the folder that will receive the extracted arrays
# NOTE(review): presumably creates the directory when it is missing; confirm in utils.py
utils_methods.create_folder_structure(raster_information_collector_output_dir)

## Raster_Information_Collector


In [None]:
# from typing import List,Tuple
# import gc
# from sklearn.utils import Bunch
# import geopandas as gpd
# import rasterio
# import numpy as np
# from rasterio.plot import show
# from rasterio.windows import Window
# from osgeo import gdal
# from matplotlib import pyplot

class Raster_Information_Collector:
  """
  This class is responsible for extracting data from rasters at GBIF occurrence locations

  Attributes
  ----------
  output_dir : str
      Directory where the extracted coverages are saved
  raster_utils : object
      Raster helper object; it must expose ``no_data_val`` and ``get_raster_infos(path)``
  utils_methods : object
      General helper object; it must expose ``save_nparray_to_folder(array, folder, name)``
  coorection_limit : int
      Limit of iterations to correct no information points (stored on the
      instance, but not read by any method of this class)
  """

  def __init__(self, output_dir:str,raster_utils,utils_methods,coorection_limit:int=10):
    """
    Parameters
    ----------
    output_dir : str
        Directory where the extracted coverages are saved
    raster_utils : object
        Raster helper object (``no_data_val`` and ``get_raster_infos`` are used)
    utils_methods : object
        General helper object (``save_nparray_to_folder`` is used)
    coorection_limit : int
        Limit of iterations to correct no information points
    """

    self.output_dir = output_dir
    self.raster_utils = raster_utils
    self.coorection_limit = coorection_limit
    self.utils_methods = utils_methods


  def _fill_peristent_no_data_values_with_median_value(self,raster_occurrences_array):
    """ Replace the no-data entries of the array with the median of its valid entries.

    The array is modified in place and also returned. When the array holds no
    no-data entry, or nothing but no-data entries (so no median exists), it is
    returned unchanged.

    Parameters
    ----------
    raster_occurrences_array : np.ndarray
        Raster values sampled at the occurrence points

    Returns
    -------
    np.ndarray
        The same array object, with its no-data entries filled
    """

    no_data_mask = raster_occurrences_array == self.raster_utils.no_data_val
    if not no_data_mask.any() or no_data_mask.all():
      return raster_occurrences_array

    # Index with the boolean mask itself: wrapping the mask in a list makes
    # NumPy treat it as an extra dimension and raises IndexError.
    median_value = np.median(raster_occurrences_array[~no_data_mask])
    raster_occurrences_array[no_data_mask] = median_value

    return raster_occurrences_array

  def save_coverges_to_numpy(self,specie_dir:str,species_name:str,root_raster_files_list:List[str]):
    """ Extract the value of every raster at every occurrence point and save the result.

    The saved array has one row per occurrence point and one column per raster,
    in the order of ``root_raster_files_list``.

    Parameters
    ----------
    specie_dir : str
        Path of the occurrences file read with geopandas; it must provide the
        'LATITUDE' and 'LONGITUDE' columns
    species_name : str
        Name handed to ``utils_methods.save_nparray_to_folder`` for the saved array
    root_raster_files_list : List[str]
        Paths of the rasters to sample

    Raises
    ------
    ValueError
        If ``root_raster_files_list`` is empty
    """

    raster_files = list(root_raster_files_list)
    if not raster_files:
      raise ValueError("root_raster_files_list is empty: there is no raster to extract values from")

    data = gpd.read_file(specie_dir)
    latitudes = np.asarray(data['LATITUDE'])
    longitudes = np.asarray(data['LONGITUDE'])

    all_env_values_list = []
    for fp in raster_files:
      # As each raster file can have a different resolution, the grid
      # positions of the points are recalculated for every raster.
      raster_array,_,xgrid,ygrid,_,_ = self.raster_utils.get_raster_infos(fp)
      ix = np.searchsorted(xgrid,longitudes)
      iy = np.searchsorted(ygrid,latitudes)
      # NOTE(review): the negative row index assumes the raster rows run in the
      # opposite direction of ygrid; confirm against Raster_Utils.get_raster_infos
      raster_occurrences_array = raster_array[-iy, ix].T

      # The full raster is no longer needed; release it before reading the next one
      del raster_array
      gc.collect()

      # Points that landed on no-data cells receive the median of the valid points
      all_env_values_list.append(
          self._fill_peristent_no_data_values_with_median_value(raster_occurrences_array))

    coverage = np.stack(all_env_values_list).T
    self.utils_methods.save_nparray_to_folder(coverage,self.output_dir,species_name)

    del coverage, all_env_values_list
    gc.collect()
   

## List rasters locations


In [None]:
# Each text file holds one entry per line. The `with` blocks close the file
# handles as soon as the content has been read.
with open(f'{base_csv_files_path}/list_raster_files.txt', 'r') as raster_files_txt:
  list_raster_files = raster_files_txt.read().splitlines()

with open(f'{base_csv_files_path}/list_names_raster.txt', 'r') as raster_names_txt:
  list_names_raster = raster_names_txt.read().splitlines()

## Performing Collection

Creating collector instance

In [None]:
# Collector used by the extraction cells below
raster_collector = Raster_Information_Collector(
    output_dir=raster_information_collector_output_dir,
    raster_utils=raster_utils,
    utils_methods=utils_methods,
    coorection_limit=coorection_limit,
)

Performing collections for each plant

In [None]:
# Single-species mode: extract only for the species selected through taxonKey
if not collect_all:
  # The occurrences are stored as <occurrences_root>/<species>/<species>.shp
  specie_shp_path = os.path.join(occurrences_root, species_name, f"{species_name}.shp")
  raster_collector.save_coverges_to_numpy(
      specie_dir=specie_shp_path,
      species_name=species_name,
      root_raster_files_list=list_raster_files,
  )

Checking the response format

In [None]:
if not collect_all:
  # Reload the array that was just saved for the selected species
  saved_numpy_array = utils_methods.retrieve_data_from_np_array(raster_information_collector_output_dir+'/'+species_name + '.npy')
  # A bare expression nested inside an `if` is never echoed by the notebook,
  # so the shape and the content have to be displayed explicitly.
  display(saved_numpy_array.shape, saved_numpy_array)

## Executing pipeline step for all studied species

In [None]:
# All-species mode: repeat the extraction for every species of the dictionary
if collect_all:
  # A dedicated loop variable keeps the notebook-level `species_name`
  # (the species selected through taxonKey) from being overwritten.
  for current_species_name in species_taxon_id_dict.values():
    # The occurrences are stored as <occurrences_root>/<species>/<species>.shp
    current_shp_path = os.path.join(occurrences_root,current_species_name,current_species_name+".shp")
    raster_collector.save_coverges_to_numpy(specie_dir=current_shp_path,
                                            species_name=current_species_name,
                                            root_raster_files_list=list_raster_files)

Reading raster bio1_annual_mean_temperature.tif




Reading raster bio2_mean_diurnal_range.tif
Reading raster bio3_isothermality.tif
Reading raster bio4_temperature_seasonality.tif
Reading raster bio5_max_temperature_of_warmest_month.tif
Reading raster bio6_min_temperature_of_coldest_month.tif
Reading raster bio7_temperature_annual_range.tif
Reading raster bio8_mean_temperature_of_wettest_quarter.tif
Reading raster bio9_mean_temperature_of_driest_quarter.tif
Reading raster bio10_mean_temperature_of_warmest_quarter.tif
Reading raster bio11_mean_temperature_of_coldest_quarter.tif
Reading raster bio12_annual_precipitation.tif
Reading raster bio13_precipitation_of_wettest_month.tif
Reading raster bio14_precipitation_of_driest_month.tif
Reading raster bio15_precipitation_seasonality.tif
Reading raster bio16_precipitation_of_wettest_quarter.tif
Reading raster bio17_precipitation_of_driest_quarter.tif
Reading raster bio18_precipitation_of_warmest_quarter.tif
Reading raster bio19_precipitation_of_coldest_quarter.tif
Reading raster elev1_strm_wo