Start by importing all packages needed for this notebook.

In [None]:
from shapely.affinity import scale
from shapely.geometry import Point, Polygon, MultiPolygon, MultiPoint
import pandas as pd
from pygbif.species import name_backbone
import geopandas as gpd
import fiona
from numpy import shape
from pathlib import Path
# Root data directory; later cells build file paths by string-concatenating onto it.
file_dir = '/Users/maddie/Projects/CPSC_597/data'

The first part of pre-processing is to create a dictionary of all the unique species from the main CSV and separate them into dataframes.

In [2]:
#Create the dataframes to be concatenated and filtered
occ_all_species = pd.read_csv(file_dir+"/data_raw/gbif_data_raw/occurrences_all_species.csv", low_memory= False)

# Keep only rows whose label contains a space. na=False treats a missing label
# as "no match" so the boolean mask never contains NaN (which would raise).
df = occ_all_species[occ_all_species['label'].str.contains(" ", na=False)]

#Get unique label names
unique_labels=df["label"].unique()

names = []
back_key =[]
remaining_labels=[]

#Get backbone associated species names and taxon keys
# names/back_key are filled in the same order as unique_labels, because the
# DataFrame below pairs the three sequences positionally. Appending unmatched
# labels in a second pass would shift every following row onto the wrong key.
for item in unique_labels:
    match = name_backbone(item)  # one lookup per label instead of up to three
    if "species" in match:
        names.append(match['species'])
        back_key.append(match['speciesKey'])
    else:
        # No species-level match: keep the label itself as the name.
        # NOTE(review): relies on the response carrying a 'taxonKey' entry --
        # confirm against the pygbif version in use.
        remaining_labels.append(item)
        back_key.append(match['taxonKey'])
        names.append(item)

#Put into DataFrame
df=pd.DataFrame({"label": unique_labels,"back_key": back_key,"species": names},columns=["label","back_key","species"])

Dropping all rows from the dataframes that do not have an entry in either the latitude column, the longitude column, or both.

In [3]:
# Left-join the backbone lookup onto every occurrence row by label, then keep
# only rows that have a species name and both coordinates.
df2 = pd.merge(occ_all_species, df, how="left", on="label").dropna(
    subset=['species_x', 'decimalLatitude', 'decimalLongitude']
)

print("df2 without na's n.rows:", len(df2.index))

# Store the backbone key as an integer column.
df2 = df2.assign(back_key=df2["back_key"].astype(int))

df2 without na's n.rows: 250264


Printing out all species that made it through the first check, and the number of entries per species. 

In [4]:
# Sorted array of the distinct species names present in df2
species = df2["species_x"].unique()
species.sort()

# Write one raw CSV per species, skipping species with fewer than 10 records
for spec in species:
    data = df2[df2['species_x'] == spec]
    n_records = len(data.index)
    if n_records < 10:
        continue
    spec = spec.replace(" ", "_")
    print(spec, n_records)
    data.to_csv(file_dir+'/data_raw/gbif_data_raw/%s_gbif_raw.csv'%spec)


Citharichthys_sordidus 31634
Engraulis_mordax 24710
Paralichthys_californicus 5340
Scomber_japonicus 20855
Thunnus_alalunga 35567
Xiphias_gladius 132158


In this step, all of the species are written into a list, and that list is used to select which dataframe to process. All entries are then filtered based on the following two criteria:

1. All latitude and longitude entries that have fewer than two decimal places are dropped.
2. All repeat entries are dropped. 

From there, the final list of each species and the remaining entries that match the criteria listed above are printed.

In [5]:

def _count_decimals(text):
    """Return the number of digits after the decimal point in a coordinate string.

    Values rendered in scientific notation (containing "e") are counted as
    having 2 decimals, so they pass the 2-decimal filter below; this is the rule
    the latitude branch already applied and it is now used for longitude too.
    Strings without a "." are counted as 0 decimals instead of raising.
    """
    if "e" in text:
        return 2
    if "." not in text:
        return 0
    return len(text.split(".")[1])


#create txt file with name of species included after filtering
# The context manager closes the file even if a species fails part-way through.
with open(file_dir+'/data_raw/gbif_data_raw/taxa_list.txt',"w") as taxa_list:

    #Filter occurrences per species
    for spec in species:

        data=df2.loc[df2['species_x'] == spec] #select subset of species

        # need at least 10 observations (same threshold as the raw export)
        if len(data.index) < 10:
            continue

        spec = spec.replace(" ","_")
        print("processing species %s"%spec)

        data=pd.read_csv(file_dir+'/data_raw/gbif_data_raw/%s_gbif_raw.csv'%spec, low_memory= False) #load in data

        ###################################################
        # check number of decimals longitude and latitude #
        ###################################################
        data["dec_lat"]=data["decimalLatitude"].astype(str).map(_count_decimals)
        data["dec_lon"]=data["decimalLongitude"].astype(str).map(_count_decimals)

        # keep only records with at least 2 decimals in both coordinates;
        # .copy() so the column assignments below act on an independent frame
        data=data[(data["dec_lat"] >= 2) & (data["dec_lon"] >= 2)].copy()
        print("length only including lon-lat 2 decimals",len(data.index))

        # (lon, lat) tuples are kept for de-duplication; Points for geometry
        lonlat = list(zip(data["decimalLongitude"], data["decimalLatitude"]))
        data['coordinates'] = [Point(pair) for pair in lonlat]
        data['lonlat'] = lonlat

        #########################################
        # only keep records with unique lon-lat #
        #########################################
        data = data.drop_duplicates('lonlat')

        # still need at least 10 observations after filtering
        if len(data.index)>=10:
            #save to csv
            data.to_csv(file_dir+'/modified_data/gbif_filtered/%s_filtered_data.csv'%spec)
            taxa_list.write(spec+"\n")
            print(len(data))

processing species Citharichthys_sordidus
length only including lon-lat 2 decimals 31346
7674
processing species Engraulis_mordax
length only including lon-lat 2 decimals 23730
4841
processing species Paralichthys_californicus
length only including lon-lat 2 decimals 5184
3199
processing species Scomber_japonicus
length only including lon-lat 2 decimals 19433
6693
processing species Thunnus_alalunga
length only including lon-lat 2 decimals 24982
14918
processing species Xiphias_gladius
length only including lon-lat 2 decimals 80352
50913
