#### Start by importing packages and setting the working directory

In [None]:
#get occurrence data for set of species
from shapely.affinity import scale
from shapely.geometry import Point
from shapely.ops import transform
from shapely.geometry import Polygon
import pandas as pd
from pygbif.species import name_backbone
from pygbif import occurrences as occ
import geopandas as gpd

from IPython.core.interactiveshell import InteractiveShell
# IPython display setting: with "all", the value of every expression statement
# in a cell is shown, not only that of the last one.
InteractiveShell.ast_node_interactivity = "all"

# Root directory of the local project checkout; every data path in the cells
# below is built by appending a relative path to this string.
# NOTE: machine-specific absolute path - point it at your own checkout before running.
file_dir=r'C:/Users/Mark.Rademaker/PycharmProjects/InternshipNaturalis/venv/github_trait_geo_diverse_dl/trait-geo-diverse-dl'

In [None]:
#Open shapefile
# Path of the IUCN terrestrial mammal range shapefile, built from file_dir
# (adjust file_dir to your own checkout rather than editing this line).
dist = file_dir+"/data/IUCN_mammal_ranges/TERRESTRIAL_MAMMALS.shp"

# Read file using gpd.read_file(). The filtering cell further down selects rows
# of the result by its "binomial" column and merges their "geometry" column.
dist_shp = gpd.read_file(dist)

#### Get backbone taxonomy code for each species

In [None]:
#Create the dataframes to be concatenated and filtered
occ_all_species = pd.read_csv(file_dir+"/data/SQL_raw_gbif/occurrences_all_species.csv")
# Keep only labels that contain a space; na=False treats a missing label as
# "no match" instead of producing a NaN mask that cannot be used for indexing.
df = occ_all_species[occ_all_species['label'].str.contains(" ", na=False)]

print("oc_all_species n.rows: ",len(occ_all_species.index))
# Report the size of the filtered frame (not of the unfiltered one again).
print("df n.rows: ",len(df.index))

#Get unique label names
labels=df["label"].tolist()
unique_labels=df["label"].unique()

# names[i] and back_key[i] always describe unique_labels[i]. The next cell
# combines the three sequences column-wise, so they must stay in the same order.
names = []
back_key =[]
remaining_labels=[]

#Get backbone associated species names and taxon keys
for item in unique_labels:
    # One backbone request per label; the result is reused for both fields.
    match = name_backbone(item)
    if "species" in match:
        names.append(match['species'])
    else:
        # No species-level name in the backbone result: fall back to the
        # label itself, and remember which labels needed the fallback.
        remaining_labels.append(item)
        names.append(item)
    # Raises KeyError when the backbone result carries no usageKey at all,
    # exactly as the previous implementation did.
    back_key.append(match['usageKey'])

# Number of labels resolved to a backbone species name, then the total.
print(len(names) - len(remaining_labels))
print(len(names))

In [None]:
#Put into DataFrame
# One row per unique label, with its backbone taxon key and species name.
backbone_columns = ["label", "back_key", "species"]
backbone_values = [unique_labels, back_key, names]
df = pd.DataFrame(
    dict(zip(backbone_columns, backbone_values)),
    columns=backbone_columns,
)

In [None]:
#Concatenate with occurrence data, dataframe, drop na's
# Left join: every occurrence row is kept and gains back_key/species where
# its label is present in the lookup table.
df2 = occ_all_species.merge(df, how="left", on="label")
print("df2 n.rows:", len(df2.index))

# Discard rows lacking a species name or either coordinate.
required_columns = ['species', 'decimal_latitude', 'decimal_longitude']
df2 = df2.dropna(subset=required_columns)
print("df2 without na's n.rows:", len(df2.index))

# Taxon keys become integers once the rows without a match are gone.
df2["back_key"] = df2["back_key"].astype(int)


In [None]:
#list of species
# Unique species names, sorted in place.
species = df2["species"].unique()
species.sort()

#save separate dataframe for each species as csv file
for spec in species:
    data = df2[df2['species'] == spec]
    n_records = len(data)
    # Species with fewer than 10 records get no raw csv file.
    if n_records < 10:
        continue
    # File names use underscores instead of spaces.
    spec = spec.replace(" ", "_")
    print("%s"%spec, n_records)
    data.to_csv(file_dir+'/data/SQL_raw_gbif/%s_raw_data.csv'%spec)

Filter data based on lon-lat precision (at least 2 decimals), unique lon-lat locations, IUCN species range and year (1900 onwards)

In [None]:
#create txt file with name of species included after filtering
def _decimal_places(coord_text):
    """Return the number of digits after the decimal point in a coordinate string.

    Strings in scientific notation (containing "e") are counted as having two
    decimals; this is the rule that was already applied to latitude and is now
    applied to longitude as well. Strings without a "." count as zero decimals.
    """
    if "e" in coord_text:
        return 2
    return len(coord_text.partition(".")[2])


# The context manager closes the species list even if a species fails midway.
with open(file_dir+'/data/SQL_filtered_gbif/taxa_list.txt',"w") as taxa_list:
    for spec in species:
        data=df2.loc[df2['species'] == spec]

        # Raw csv files are only written for species with at least 10 records,
        # so anything smaller has nothing to read back.
        if len(data.index) < 10:
            continue

        spec = spec.replace(" ","_")
        print("processing species %s"%spec)
        #re-read the raw per-species csv written by the export cell
        data=pd.read_csv(file_dir+'/data/SQL_raw_gbif/%s_raw_data.csv'%spec)
        print("length data", len(data.index))

        #check number of decimals longitude and latitude
        data["dec_lat"]=data["decimal_latitude"].astype(str).map(_decimal_places)
        data["dec_lon"]=data["decimal_longitude"].astype(str).map(_decimal_places)

        #only include records with min. 2 decimals in both coordinates;
        #copy so the column assignments below act on an independent frame
        data=data[(data["dec_lat"] >= 2) & (data["dec_lon"] >= 2)].copy()
        print("length only including lon-lat 2 decimals",len(data.index))

        ##turn lat/lon into set of points; 'lonlat' keeps the plain tuple,
        ##which is what the de-duplication below compares
        data['coordinates'] = list(zip(data["decimal_longitude"], data["decimal_latitude"]))
        data['lonlat'] = list(zip(data["decimal_longitude"], data["decimal_latitude"]))
        data['coordinates'] = data["coordinates"].apply(Point)

        #only keep records with unique lon-lat
        data = data.drop_duplicates('lonlat')
        print("length unique lon-lat",len(data.index))

        ###########################################
        ###########################################
        ##Access the relevant shapefiles for the species
        speci=spec.replace("_"," ")
        dist_shp_spec = dist_shp[dist_shp["binomial"] == speci]
        poly_spec = dist_shp_spec[["geometry"]]
        ##merge the polygons
        iucn_poly_spec= poly_spec.unary_union
        Q3 = iucn_poly_spec.simplify(0.3)

        #simplification can yield an invalid geometry; buffer(0) rebuilds it
        if not Q3.is_valid:
            Q3 = Q3.buffer(0)

        #explicit bool dtype keeps the mask usable when no records are left
        in_range = pd.Series(
            [point.within(Q3) for point in data["coordinates"]],
            index=data.index,
            dtype=bool,
        )

        #keep records that are in species range
        data["in_dist_polygon"]=in_range
        data2=data[in_range].copy()
        print("length in species dist polygon",len(data2.index))

        ########################################################
        ########################################################
        #set date column to datetime format and extract year and month
        data2['event_date'] = pd.to_datetime(data2['event_date'])
        data2['year'] = data2['event_date'].dt.year
        data2['month']= data2['event_date'].dt.month

        #only include observations from 1900 onwards
        data3=data2[data2.year >= 1900]
        print("length observationas >1900", len(data3.index))
        #save to csv, and list the species, only if enough records survive
        if len(data3.index)>=10:
            data3.to_csv(file_dir+'/data/SQL_filtered_gbif/%s_filtered_data.csv'%spec)
            taxa_list.write(spec+"\n")