In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [36]:
from shapely.geometry import Point
import matplotlib.pyplot as plt
from rasterio.mask import mask
from rasterio.plot import show
from rasterio.plot import plotting_extent
from natsort import natsorted
import gdal
import pandas as pd
import numpy as np
import geopandas
import rasterio
import pycrs

file_dir=r'C:/Users/Mark.Rademaker/PycharmProjects/InternshipNaturalis/trait-geo-diverse-dl/concept proof'

In [3]:
#access file with list of taxa names
taxa=pd.read_csv(file_dir+"/data/spec_filtered/taxa.txt",header=None)
taxa.columns=["taxon"]

species_occ_dict={}

for i in taxa["taxon"]:
    taxon_data = pd.read_csv(file_dir+"/data/spec_filtered/%s.csv"%i)
    #add species dataframe to dict
    species_occ_dict["%s"%i] = taxon_data  
    #check whether all species have been included and inspect dictionary
if len(species_occ_dict.keys())==len(taxa["taxon"]):
    print("All species dataframes now in dictionary")
else:
    print("Error: not all species dataframe included")

All species dataframes now in dictionary


#### Part 1
- First read in occurrence data
- Create a copy that we can use in the original state later
- Create a buffer around each occurrence point, merge it into a single polygon
- Clip the environmental raster based on this extend

In [None]:
for key in species_occ_dict:    
    #load occurrence data and set initial projection
    data=species_occ_dict[key]
    print(data.columns)
    spec = key


    data['coordinates'] = list(zip(data["decimal_longitude"], data["decimal_latitude"]))
    data['coordinates'] = data["coordinates"].apply(Point)
    data["present/pseudo_absent"]=1
    geo_data=geopandas.GeoDataFrame(data, geometry='coordinates',crs={'init' :'epsg:4326'})

    #change projection to azimuthal equidistant to calculate 1000km buffer around point
    geo_data = geo_data.to_crs({'init': 'esri:54032'}) 
    buffer=geo_data.buffer(1000*1000)
    buffer=buffer.to_crs(epsg=4326)

    #create single large polygon from individual buffers
    union_buffer=buffer.unary_union

    #first clip the raster based on this extend 
    raster=rasterio.open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')
    #specify output tif:
    out_tif = file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec

    #clip the raster:
    out_img, out_transform = mask(dataset=raster, shapes=[union_buffer],crop=True)
   
    # Copy the metadata
    out_meta = raster.meta.copy()

    # Parse EPSG code
    epsg_code = int(raster.crs.data['init'][5:])
    out_meta.update({"driver": "GTiff",
                     "height": out_img.shape[1],
                     "width": out_img.shape[2],
                     "transform": out_transform,
                     "crs": pycrs.parse.from_epsg_code(epsg_code).to_proj4()})

    with rasterio.open(out_tif, "w", **out_meta) as dest:
            dest.write(out_img)

Inspect whether clip was correct

In [None]:
#Inspect the first band of the clipped raster for all species
for key in species_occ_dict:
    ##### Extract occurrence point to plot on the raster (see if correct area was clipped)
    data=species_occ_dict[key]
    print(len(data.index))
    spec = key
    data['coordinates'] = list(zip(data["decimal_longitude"], data["decimal_latitude"]))
    data['coordinates'] = data["coordinates"].apply(Point)
    geo_data=geopandas.GeoDataFrame(data, geometry='coordinates',crs={'init' :'epsg:4326'})
    ####open the clipped raster
    clipped = rasterio.open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)
    array = clipped.read(1)
    array_data = clipped.read(1,masked=True)
    array_meta = clipped.profile
   
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(array_data,cmap="gist_earth",interpolation="none",vmin=0,
    # Here you must set the spatial extent or else the data will not line up with your geopandas layer
    extent=plotting_extent(clipped),)
    spec_plots_points=geo_data["coordinates"]
    spec_plots_points.plot(ax=ax,
                       marker='o',
                       markersize=20,
                       color='red')
    ax.set_title("%s \n Raster clip and occurrence points"%spec,
             fontsize=20)
    plt.show()
#Works!

#### Part 2
- now that we have the clipped raster we can use it to try and make a random selection of pseudo absence points
- we first open the raster
- then we separate those cells that actually contain pixel values (excluding the sea)
- we calculate the longitude and latitude of the centre point of these cells <br>
  (the environmental variable values do not vary within each cell so it doesn't matter if each points is in the centre)
- we make a random selection of 1000 positions (in line with Hendrix & Vos)
- we add the longitude and latitude values of these to to the dataset and export it  

In [None]:
for key in species_occ_dict:    
    #lon_lat presence points
    presence_data = species_occ_dict[key]
    presence_data["present/pseudo_absent"]=1
    spec = key
    long=presence_data["decimal_longitude"]
    lati=presence_data["decimal_latitude"]
    long=pd.Series.tolist(long)
    lati=pd.Series.tolist(lati)
   
    
    
    #read raster
    src=rasterio.open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)
    array=src.read_masks(1)
    
    #set raster cell mask values of presence locations to 1
    for i in range(0,len(presence_data)):
        row,col=src.index(long[i],lati[i])
        array[row,col]=1
    
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(array,cmap="gray")
    ax.set_title("%s"%spec,
             fontsize=20)
    plt.show()
    print(len(presence_data), "number of presences")
    
    (y_index, x_index) = np.nonzero(array > 1)

    #sample random locations from raster excluding sea and presence cells
    r = gdal.Open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)
    (upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = r.GetGeoTransform()
    x_coords = x_index * x_size + upper_left_x + (x_size / 2) #add half the cell size
    y_coords = y_index * y_size + upper_left_y + (y_size / 2) #to centre the point


    lon_lat_array=np.stack((x_coords,y_coords)).T

    random_sample_size=1000
    random_sample_lon_lats=lon_lat_array[np.random.choice(lon_lat_array.shape[0], 1000, replace=False), :] ##
    print(len(random_sample_lon_lats), "number of pseudo absences")

    #Add random points to dataset
    lon=[]
    lat=[]
    psa=[0]*random_sample_size
    taxon=["%s"%spec]*random_sample_size
    gbif=["no_id"]*random_sample_size

    for item in random_sample_lon_lats:
        longitude=item[0]
        latitude=item[1]
        lon.append(longitude)
        lat.append(latitude)

    ###Dataset including 10.000 pseudo-absence points for capriolus capriolus
    new_data=pd.DataFrame({"gbif_id": gbif,"taxon_name":taxon,"decimal_longitude": lon, "decimal_latitude":lat, "present/pseudo_absent": psa})
    data=pd.concat([presence_data,new_data],ignore_index=True)
    data=data[['taxon_name','gbif_id','decimal_longitude','decimal_latitude','present/pseudo_absent']]
    data["row_n"]=np.arange(len(data))
     
    long=data["decimal_longitude"]
    lati=data["decimal_latitude"]
    long=pd.Series.tolist(long)
    lati=pd.Series.tolist(lati)
    
    print(len(data),"lenght data with pseudo absences pre-filtering")
    
    #read raster
    src=rasterio.open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)
    array=src.read_masks(1)
    
    ##remove potential presence locations in the sea##
    for i in range(1,42):
        array=src.read_masks(i)
        for i in range(0,len(data)):
            row,col=src.index(long[i],lati[i])
            if array[row,col] ==0:
                data=data[data.row_n != i]     
    print(len(data), "length data with pseudo absences post-filtering")
    
    
    data=data.reset_index(drop=True)
    data.to_csv(file_dir + "/data/spec_occ/%s_occ_dataframe.csv"%spec)


In [None]:
data=pd.read_csv(file_dir+'/data/spec_occ_env/Vicagna_vicugna_env_dataframe.csv')
data

#### Part 3 
- finally we can extract the environmental variable values underneath the occurrence and pseudo-absence points
- we need to scale these environmental values for later training by taking their mean and std_dev
- below is a code snippet, but because it requires a long time to run in jupyter the process is best split (see extract_env_variables1-4.py files)

In [None]:
raster=rasterio.open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')
array = raster.read()

with open(file_dir+'/data/GIS/env_bio_mean_std.txt','w+') as file:
    file.write("band"+"\t"+"mean"+"\t"+"std_dev"+"\n")
    file.close()


min_max=0

for band in array:
    minb=np.min(band)
    if minb < min_max:
        min_max=minb


for i in range(1,42):
    print(i)
    profile.update(count=1)
    band=raster.read(i)
    band[band == -9999] = min_max
    band_masked = np.ma.masked_array(band, mask=(band == min_max))
    
    mean=band_masked.mean()
    std_dev=np.std(band_masked)
    with open(file_dir+'/data/GIS/env_bio_mean_std.txt','a') as file:
        file.write(str(i)+"\t"+str(mean)+"\t"+str(std_dev)+"\n")


In [4]:
##Subset the dataframe into four parts
#access file with list of taxa names
taxa=pd.read_csv(file_dir+"/data/spec_filtered/taxa.txt",header=None)
taxa.columns=["taxon"]

In [28]:
import gdal

for i in taxa["taxon"]:
    data = pd.read_csv(file_dir+"/data/spec_occ/%s_occ_dataframe.csv"%i)
    
    spec = data["taxon_name"][0]
    spec = spec.replace(" ","_")
    
    print("processing species ", spec)
    

    #get all col and row values for species locations 
    len_pd=np.arange(len(data))
    long=data["decimal_longitude"]
    lati=data["decimal_latitude"]
    ppa=data["present/pseudo_absent"]

    lon=long.values
    lat=lati.values

    row=[]
    col=[]

    src=rasterio.open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)

    for i in len_pd:
        row_n, col_n = src.index(lon[i], lat[i])# spatial --> image coordinates
        row.append(row_n)
        col.append(col_n)

    ##opening raster as 3d numpy array
    inRas=gdal.Open(file_dir+'/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec)
    myarray=inRas.ReadAsArray()
    #print(myarray.shape)
    #print(type(myarray))

    #collect file with mean and std_dev for each band
    mean_std=pd.read_csv(file_dir+'/data/GIS/env_bio_mean_std.txt',sep="\t")
    mean_std=mean_std.to_numpy()


    ########################################################
    #extract the values for all bands and prepare input data
    ########################################################
    X=[]
    species =["%s"%spec]*int(len(row))

    for j in range(0,41):
        band=myarray[j]
        x=[]

        for i in range(0,len(row)):
            value= band[row[i],col[i]]
            if value <-1000:
                value=np.nan
                x.append(value)
            else:
                value = ((value - mean_std.item((j,1))) / mean_std.item((j,2)))#scale values
                x.append(value)
        X.append(x)


    #set as numpy 2d array
    X =np.array([np.array(xi) for xi in X])

    #transform into dataframe and include row and column values
    df=pd.DataFrame(X)
    df=df.T

    df["present/pseudo_absent"]=ppa
    df["decimal_latitude"]=lati
    df["decimal_longitude"]=long
    df["taxon_name"]=species
    df["present/pseudo_absent"]=ppa
    df["row_n"]=row
    
    df=df.dropna(axis=0, how='any')
    input_data=df
    ##save input dataframe
    input_data.to_csv(file_dir +"/data/spec_occ_env/%s_env_dataframe.csv"%spec)

processing species  Aepyceros_melampus
processing species  Alcelaphus_buselaphus
processing species  Alcelaphus_caama
processing species  Alces_alces
processing species  Alces_americanus
processing species  Ammotragus_lervia
processing species  Antidorcas_marsupialis
processing species  Antilocapra_americana
processing species  Antilope_cervicapra
processing species  Axis_axis
processing species  Axis_porcinus
processing species  Bison_bison
processing species  Bison_bonasus
processing species  Blastocerus_dichotomus
processing species  Bos_frontalis_gaurus
processing species  Bos_grunniens_mutus
processing species  Bos_javanicus
processing species  Bos_taurus_primigenius
processing species  Boselaphus_tragocamelus
processing species  Bubalus_bubalis_arnee
processing species  Budorcas_taxicolor
processing species  Camelus_bactrianus
processing species  Camelus_dromedarius
processing species  Capra_hircus_aegagrus
processing species  Capra_ibex
processing species  Capra_nubiana
processi

experimental code, check whether keeping this is necessary later, first create dataframe with all locations in world, <br>
then extract environmental values at all these cells and store into array for later DNN prediction

In [None]:
 ###Dataset of world map including all locations with data-values (to later predict presence-pseudoabsence on)
    src=rasterio.open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')
    array=src.read_masks(1)
    
    r = gdal.Open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')
    (y_index, x_index) = np.nonzero(array > 0)
    (upper_left_x, x_size, x_rotation, upper_left_y, y_rotation, y_size) = r.GetGeoTransform()
    x_coords = x_index * x_size + upper_left_x + (x_size / 2) #add half the cell size
    y_coords = y_index * y_size + upper_left_y + (y_size / 2) #to centre the point

    lon_lat_array=np.stack((x_coords,y_coords)).T

    lon=[]
    lat=[]

    for item in lon_lat_array:
        longitude=item[0]
        latitude=item[1]
        lon.append(longitude)
        lat.append(latitude)

    taxon=["%s"%spec]*len(lon)

    data_to_pred=pd.DataFrame({"decimal_longitude":lon,"decimal_latitude":lat})
    print(len(data_to_pred), "number of points to predict")
    data_to_pred.to_csv(file_dir + "/data/GIS/world_locations_to_predict.csv")


In [34]:
import gdal

##opening raster as 3d numpy array
inRas=gdal.Open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')
myarray=inRas.ReadAsArray()
print(myarray.shape)
print(type(myarray))

#get all col and row values for all cells on land 
df=pd.read_csv(file_dir+'/data/GIS/world_locations_to_predict.csv')
len_pd=np.arange(len(df))
print(len_pd)
lon=df["decimal_longitude"]
lat=df["decimal_latitude"]
lon=lon.values
lat=lat.values

row=[]
col=[]

src=rasterio.open(file_dir+'/data/GIS/env_stacked/ENVIREM_BIOCLIM_stacked.tif')

for i in len_pd:
    row_n, col_n = src.index(lon[i], lat[i])# spatial --> image coordinates
    row.append(row_n)
    col.append(col_n)

#collect file with mean and std_dev for each band
mean_std=pd.read_csv(file_dir+'/data/GIS/env_bio_mean_std.txt',sep="\t")
mean_std=mean_std.to_numpy()


########################################################
#extract the values for all bands and prepare input data
########################################################
X=[]

for j in range(0,41):
    print(j)
    band=myarray[j]
    x=[]

    for i in range(0,len(row)):
        value= band[row[i],col[i]]
        if value <-1000:
            value=np.nan
            x.append(value)
        else:
            value = ((value - mean_std.item((j,1))) / mean_std.item((j,2)))#scale values
            x.append(value)
    X.append(x)

#include row and column values
X.append(row)
X.append(col)
#set as numpy 2d array
X =np.array([np.array(xi) for xi in X])

df=pd.DataFrame(X)

df=df.T
df=df.dropna(axis=0, how='any')
input_X=df.loc[:,0:40]


row=df[41]
col=df[42]

row_col=pd.DataFrame({"row":row,"col":col})

#convert dataframe back to numpy array
input_X=input_X.values
#convert rows and col indices back to array
row=row.values
col=col.values

#save
prediction_array=np.save(file_dir+'/data/GIS/world_prediction_array.npy',input_X)
prediction_pandas=row_col.to_csv(file_dir+'/data/GIS/world_prediction_row_col.csv')

(41, 1800, 4320)
<class 'numpy.ndarray'>
[      0       1       2 ... 2179454 2179455 2179456]
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
