#### Creating the species dataframes with environmental variables
To create the dataframes, the raster pixels corresponding to the longitude and latitude of every presence and pseudo-absence point in the clipped raster must be located and their values extracted. This can take a very long time if run as a single for-loop over all species, so the work is split into four separate loops that can be run simultaneously.

In [13]:
import pandas as pd
import rasterio

# Root directory of the local project checkout; the csv paths in the cells below
# are built by appending sub-paths to it. Adjust this when running on another machine.
file_dir=r'C:/Users/M-RAM/PycharmProjects/InternshipNaturalis/github_trait_geo_diverse_dl/trait-geo-diverse-dl'

First we make a dictionary of the species occurrence data

In [None]:
# Read the taxa list (a header-less text file) into a one-column dataframe.
taxa_file = file_dir + "/data/spec_filtered/taxa.txt"
taxa = pd.read_csv(taxa_file, header=None)
taxa.columns = ["taxon"]

In [None]:
# Map every taxon name to the dataframe read from its filtered-occurrence csv.
species_occ_dict = {
    "%s" % taxon_name: pd.read_csv(file_dir + "/data/spec_filtered/%s.csv" % taxon_name)
    for taxon_name in taxa["taxon"]
}

# Sanity check: there should be exactly one dictionary entry per listed taxon.
n_loaded = len(species_occ_dict)
if n_loaded != len(taxa["taxon"]):
    print("Error: not all species dataframe included")
else:
    print("All species dataframes now in dictionary")

Now we create, for each species, a dataframe containing both the presence points and the generated pseudo-absence points, labelled 1 and 0 respectively

In [None]:
#Create dataframes consisting of both the occurrence and pseudo-absence points
# NOTE(review): Point, geopandas and generate_random_locations are not imported or
# defined in the cells visible here -- confirm they are in scope before running.
# NOTE(review): the result is stored back into species_occ_dict, so re-running this
# cell without rebuilding the dictionary would treat pseudo-absences as presences.
for key in species_occ_dict:
    #load occurrence data and set initial projection
    data=species_occ_dict[key]
    #copy taken before any column is added; overwritten below with the pseudo-absence points
    data2=data.copy()
    spec = key
    data['coordinates'] = list(zip(data["decimal_longitude"], data["decimal_latitude"]))
    data['coordinates'] = data["coordinates"].apply(Point)
    data["present/pseudo_absent"]=1
    geo_data=geopandas.GeoDataFrame(data, geometry='coordinates',crs={'init' :'epsg:4326'})

    #change projection to azimuthal equidistant to calculate 1000km buffer around point
    geo_data = geo_data.to_crs({'init': 'esri:54032'})
    buffer=geo_data.buffer(1000*1000)
    buffer=buffer.to_crs(epsg=4326)

    #create single large polygon from individual buffers
    union_buffer=buffer.unary_union

    #generate random points in polygon
    #(len(data) is passed to the helper -- presumably the number of points requested;
    # the column assignments below require exactly one point per presence row)
    random_points=list(generate_random_locations(len(data),union_buffer))

    #split the points into plain latitude / longitude lists
    lat=[point.y for point in random_points]
    lon=[point.x for point in random_points]

    #overwrite the copied rows with the pseudo-absence locations and label them 0
    data2["coordinates"]=random_points
    data2["decimal_longitude"]=lon
    data2["decimal_latitude"]=lat
    data2["present/pseudo_absent"]=0
    data = pd.concat([data, data2], ignore_index=True)
    species_occ_dict[key]=data
    #write into data/species_occ, the directory the raster-extraction loops read from
    data.to_csv(file_dir + "/data/species_occ/%s_occ_dataframe.csv"%spec)
    print("species %s processed"%spec)

Now we extract the raster values for each presence/pseudo-absence observation

In [17]:
##Subset the dataframe into four parts
#access file with list of taxa names
taxa=pd.read_csv(file_dir+"/data/spec_filtered/taxa.txt",header=None)
taxa.columns=["taxon"]
# Consecutive, non-overlapping row slices; each one feeds one of the four loops below.
taxa1=taxa[0:35]
taxa2=taxa[35:80]
taxa3=taxa[80:120]
# Open-ended so that taxa beyond row 153 are not silently skipped if the list grows.
taxa4=taxa[120:]

In [None]:
#Loop1
# For every species in taxa1: sample raster bands 1-41 at each presence /
# pseudo-absence point and save the enriched dataframe as one csv per species.
for taxon in taxa1["taxon"]:
    data = pd.read_csv(file_dir+"/data/species_occ/%s_occ_dataframe.csv"%taxon)
    spec = data["taxon_name"][0]
    spec = spec.replace(" ","_")

    # extract longitude and latitude and store them in normal lists (as opposed to pandas Series)
    lon = data["decimal_longitude"].tolist()
    lat = data["decimal_latitude"].tolist()

    # NOTE(review): this raster root differs from file_dir -- confirm that is intentional.
    # The context manager closes the raster file even if sampling raises.
    with rasterio.open(r'C:/Users/M-RAM/PycharmProjects/InternshipNaturalis/trait-geo-diverse-dl/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec) as src:
        print("processing species %s" % spec)

        # spatial --> image coordinates; identical for every band, so computed once per species
        pixels = [src.index(x, y) for x, y in zip(lon, lat)]
        rows = [row for row, _ in pixels]
        cols = [col for _, col in pixels]

        # go through bands iteratively
        for band in range(1, 42):
            print("processing band %s" % band)
            array = src.read(band)
            # One indexed lookup per band. This replaces the per-row chained assignment
            # data[band_name][j] = value, which pandas does not guarantee to write
            # through to the dataframe.
            data["band %s" % band] = array[rows, cols]
    data.to_csv(file_dir + "/data/species_occ_env/%s_env_dataframe.csv" % spec)


In [None]:
#Loop2
# For every species in taxa2: sample raster bands 1-41 at each presence /
# pseudo-absence point and save the enriched dataframe as one csv per species.
for taxon in taxa2["taxon"]:
    data = pd.read_csv(file_dir+"/data/species_occ/%s_occ_dataframe.csv"%taxon)
    spec = data["taxon_name"][0]
    spec = spec.replace(" ","_")

    # extract longitude and latitude and store them in normal lists (as opposed to pandas Series)
    lon = data["decimal_longitude"].tolist()
    lat = data["decimal_latitude"].tolist()

    # NOTE(review): this raster root differs from file_dir -- confirm that is intentional.
    # The context manager closes the raster file even if sampling raises.
    with rasterio.open(r'C:/Users/M-RAM/PycharmProjects/InternshipNaturalis/trait-geo-diverse-dl/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec) as src:
        print("processing species %s" % spec)

        # spatial --> image coordinates; identical for every band, so computed once per species
        pixels = [src.index(x, y) for x, y in zip(lon, lat)]
        rows = [row for row, _ in pixels]
        cols = [col for _, col in pixels]

        # go through bands iteratively
        for band in range(1, 42):
            print("processing band %s" % band)
            array = src.read(band)
            # One indexed lookup per band. This replaces the per-row chained assignment
            # data[band_name][j] = value, which pandas does not guarantee to write
            # through to the dataframe.
            data["band %s" % band] = array[rows, cols]
    data.to_csv(file_dir + "/data/species_occ_env/%s_env_dataframe.csv" % spec)


In [None]:
#Loop3
# For every species in taxa3: sample raster bands 1-41 at each presence /
# pseudo-absence point and save the enriched dataframe as one csv per species.
for taxon in taxa3["taxon"]:
    data = pd.read_csv(file_dir+"/data/species_occ/%s_occ_dataframe.csv"%taxon)
    spec = data["taxon_name"][0]
    spec = spec.replace(" ","_")

    # extract longitude and latitude and store them in normal lists (as opposed to pandas Series)
    lon = data["decimal_longitude"].tolist()
    lat = data["decimal_latitude"].tolist()

    # NOTE(review): this raster root differs from file_dir -- confirm that is intentional.
    # The context manager closes the raster file even if sampling raises.
    with rasterio.open(r'C:/Users/M-RAM/PycharmProjects/InternshipNaturalis/trait-geo-diverse-dl/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec) as src:
        print("processing species %s" % spec)

        # spatial --> image coordinates; identical for every band, so computed once per species
        pixels = [src.index(x, y) for x, y in zip(lon, lat)]
        rows = [row for row, _ in pixels]
        cols = [col for _, col in pixels]

        # go through bands iteratively
        for band in range(1, 42):
            print("processing band %s" % band)
            array = src.read(band)
            # One indexed lookup per band. This replaces the per-row chained assignment
            # data[band_name][j] = value, which pandas does not guarantee to write
            # through to the dataframe.
            data["band %s" % band] = array[rows, cols]
    data.to_csv(file_dir + "/data/species_occ_env/%s_env_dataframe.csv" % spec)


In [None]:
#Loop4
# For every species in taxa4: sample raster bands 1-41 at each presence /
# pseudo-absence point and save the enriched dataframe as one csv per species.
for taxon in taxa4["taxon"]:
    data = pd.read_csv(file_dir+"/data/species_occ/%s_occ_dataframe.csv"%taxon)
    spec = data["taxon_name"][0]
    spec = spec.replace(" ","_")

    # extract longitude and latitude and store them in normal lists (as opposed to pandas Series)
    lon = data["decimal_longitude"].tolist()
    lat = data["decimal_latitude"].tolist()

    # NOTE(review): this raster root differs from file_dir -- confirm that is intentional.
    # The context manager closes the raster file even if sampling raises.
    with rasterio.open(r'C:/Users/M-RAM/PycharmProjects/InternshipNaturalis/trait-geo-diverse-dl/data/GIS/spec_stacked_raster_clip/%s_raster_clip.tif'%spec) as src:
        print("processing species %s" % spec)

        # spatial --> image coordinates; identical for every band, so computed once per species
        pixels = [src.index(x, y) for x, y in zip(lon, lat)]
        rows = [row for row, _ in pixels]
        cols = [col for _, col in pixels]

        # go through bands iteratively
        for band in range(1, 42):
            print("processing band %s" % band)
            array = src.read(band)
            # One indexed lookup per band. This replaces the per-row chained assignment
            # data[band_name][j] = value, which pandas does not guarantee to write
            # through to the dataframe.
            data["band %s" % band] = array[rows, cols]
    data.to_csv(file_dir + "/data/species_occ_env/%s_env_dataframe.csv" % spec)


Check the resulting dataframe

In [14]:
# Inspect one of the dataframes produced by the extraction loops.
# The loops write to data/species_occ_env, so read from that same directory
# (NOTE(review): confirm the directory name on disk; this cell previously read data/spec_occ_env).
df=pd.read_csv(file_dir+'/data/species_occ_env/Boselaphus_tragocamelus_env_dataframe.csv')
# show only the first rows rather than dumping the whole frame
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,gbif_id,taxon_name,decimal_latitude,decimal_longitude,coordinates,present/pseudo_absent,band 1,band 2,...,band 32,band 33,band 34,band 35,band 36,band 37,band 38,band 39,band 40,band 41
0,0,0,1571064828,Boselaphus tragocamelus,28.513069,77.291772,POINT (77.29177199999999 28.513069),1,1.227790e+00,1.390000e+02,...,233.0,30.0,130.0,39.0,6689.0,406.0,74.0,332.0,300.0,227.0
1,1,1,1802615996,Boselaphus tragocamelus,27.165736,77.527256,POINT (77.52725600000001 27.165736),1,1.443463e+00,1.320000e+02,...,238.0,19.0,135.0,39.0,6646.0,417.0,74.0,343.0,297.0,239.0
2,2,2,1571064878,Boselaphus tragocamelus,28.572998,77.127537,POINT (77.127537 28.572998),1,6.283186e+00,1.390000e+02,...,224.0,26.0,131.0,39.0,6722.0,404.0,71.0,333.0,299.0,166.0
3,3,3,1322993675,Boselaphus tragocamelus,28.545820,77.170346,POINT (77.170346 28.54582),1,9.379119e-01,1.390000e+02,...,218.0,25.0,130.0,39.0,6734.0,405.0,73.0,332.0,299.0,226.0
4,4,4,1802616013,Boselaphus tragocamelus,26.075089,76.349026,POINT (76.34902599999999 26.075089),1,5.167471e+00,1.370000e+02,...,65.0,18.0,138.0,41.0,6045.0,419.0,83.0,336.0,288.0,245.0
5,5,5,665794823,Boselaphus tragocamelus,26.766670,79.033333,POINT (79.033333 26.76667),1,3.461976e+00,1.370000e+02,...,314.0,34.0,138.0,40.0,6494.0,423.0,78.0,345.0,296.0,241.0
6,6,6,920758332,Boselaphus tragocamelus,23.663864,81.013351,POINT (81.013351 23.663864),1,7.486848e-01,1.500000e+02,...,123.0,54.0,131.0,39.0,5455.0,410.0,80.0,330.0,265.0,288.0
7,7,7,1677281472,Boselaphus tragocamelus,26.192186,76.865422,POINT (76.865422 26.192186),1,1.637615e+00,1.260000e+02,...,314.0,22.0,139.0,41.0,6187.0,414.0,75.0,339.0,284.0,238.0
8,8,8,1571063290,Boselaphus tragocamelus,28.474005,77.317034,POINT (77.31703399999999 28.474005),1,1.802171e+00,1.390000e+02,...,217.0,27.0,130.0,39.0,6677.0,405.0,73.0,332.0,299.0,227.0
9,9,9,665879064,Boselaphus tragocamelus,27.183331,78.016670,POINT (78.01666999999999 27.183331),1,1.226831e+00,1.390000e+02,...,271.0,29.0,133.0,38.0,6657.0,417.0,73.0,344.0,297.0,238.0
