In [1]:
from shapely.affinity import scale
from shapely.geometry import Point
import pandas as pd
from pygbif.species import name_backbone
import geopandas as gpd


file_dir=('/Users/maddie/Projects/CPSC_597/data')

In [2]:
#Create the dataframes to be concatenated and filtered
occ_all_species = pd.read_csv(file_dir+"/data_raw/gbif_data_raw/occurrences_all_species.csv", low_memory= False)
df = occ_all_species[occ_all_species['label'].str.contains(" ")]

#Get unique label names
unique_labels=df["label"].unique()

names = []
back_key =[]
remaining_labels=[]

#Get backbone associated species names and taxon keys
for item in unique_labels:
    if "species" in name_backbone(item):
        i = name_backbone(item)['species']
        j = name_backbone(item)['speciesKey']
        names.append(i)
        back_key.append(j)
    else:
        remaining_labels.append(item)
        
for item in remaining_labels:
    value=name_backbone(item)['taxonKey']
    back_key.append(value)
    names.append(item)
    
#Put into DataFrame
df=pd.DataFrame({"label": unique_labels,"back_key": back_key,"species": names},columns=["label","back_key","species"])

AttributeError: Can only use .str accessor with string values!

In [None]:
#Concatenate with occurrence data, dataframe, drop na's 
df2=pd.merge(occ_all_species,df,how="left",on="label")

df2 = df2[pd.notnull(df2['species_x'])]
df2 = df2[pd.notnull(df2['decimalLatitude'])]
df2 = df2[pd.notnull(df2['decimalLongitude'])]

print("df2 without na's n.rows:", len(df2.index))

df2["back_key"]=df2["back_key"].astype(int)

df2 without na's n.rows: 207274


In [None]:
#list of species
species = df2["species_x"].unique()
species.sort()

#save separate dataframe for each species as csv file 
for spec in species:
    data=df2.loc[df2['species_x'] == spec]
    if len(data.index)>= 10:
        spec=spec.replace(" ","_")
        print("%s"%spec, len(data.index))
        data.to_csv(file_dir+'/data_raw/gbif_data_raw/%s_gbif_raw.csv'%spec)


Citharichthys_sordidus 27238
Engraulis_mordax 1980
Paralichthys_californicus 416
Scomber_japonicus 10875
Thunnus_alalunga 35146
Xiphias_gladius 131619


In [None]:
#Open shapefile containing Pacific Ocean range
dist = (file_dir+'/data_raw/world-ocean/ne_50m_ocean.shp')
dist_shp = gpd.read_file(dist)

#create txt file with name of species included after filtering
taxa_list=open(file_dir+'/data_raw/gbif_data_raw/taxa_list.txt',"w")


#Filter occurrences per species
for spec in species:
    
    data=df2.loc[df2['species_x'] == spec] #select subset of species
    
    # check >10 observations
    if len(data.index)>= 10: 

        spec = spec.replace(" ","_")
        print("processing species %s"%spec)

        data=pd.read_csv(file_dir+'/data_raw/gbif_data_raw/%s_gbif_raw.csv'%spec, low_memory= False) #load in data
        
        ###################################################
        # check number of decimals longitude and latitude #
        ###################################################
        str_lat=(pd.Series.tolist(data["decimalLatitude"].astype(str)))
        str_lon=(pd.Series.tolist(data["decimalLongitude"].astype(str)))
        dec_lat=[]
        dec_lon=[]

        for i in range(len(str_lat)):
    
            if "e" in str_lat[i]:
                str_lat[i]="0.00"
                decla = str_lat[i].split(".")[1]
                dec_lat.append(int(len(decla)))
            else:
                decla = str_lat[i].split(".")[1]
                dec_lat.append(int(len(decla)))
                
        for i in range(len(str_lon)):
            declo=str_lon[i].split(".")[1]
            dec_lon.append(int(len(declo)))
    
        data["dec_lat"]=dec_lat
        data["dec_lon"]=dec_lon

        # filter only include those with min. 2 points
        data=data[data["dec_lat"] >= 2]
        data=data[data["dec_lon"] >= 2]
        print("length only including lon-lat 2 decimals",len(data.index))

        data['coordinates'] = list(zip(data["decimalLongitude"], data["decimalLatitude"]))
        data['lonlat'] = list(zip(data["decimalLongitude"], data["decimalLatitude"]))
        data['coordinates'] = data["coordinates"].apply(Point)

        
        #########################################
        # only keep records with unique lon-lat #
        #########################################
        
        data = data.drop_duplicates('lonlat')
        print("length unique lon-lat",len(data.index))

       
        ###############################################
        # only keep records falling in IUNC range #
        ###############################################
        
        #speci=spec.replace("_"," ")
        #dist_shp_spec = dist_shp[dist_shp["LAT"]== "%s"%speci]
        #poly_spec = dist_shp_spec[["LAT"]]
        
         #merge the polygons
        #iucn_poly_spec= poly_spec.
        #Q3 = iucn_poly_spec
        #Q3 #inspect polygon

        #if Q3.is_valid== False:
        #    Q3 = Q3.buffer(0)

        #condition_list=[]

#        for point in data["coordinates"]:
#            output= point.within(poly_spec)
#            condition_list.append(output)

        #keep records that are in species range
#        data["in_dist_polygon"]=condition_list
#        data2=data[data.in_dist_polygon == True]
 #       print("length in species dist polygon",len(data2.index))

        #############################
        # Only keep records > 1900  #
        #############################
        
    #    data['event_date'] = pd.to_datetime(data['event_date']) # set date column to datetime format to extract year
    #    data['year'] = data['event_date'].dt.year
    #    data['month']= data['event_date'].dt.month

        #set date column to datetime format and extract year
    #    data['event_date'] = pd.to_datetime(data['event_date'])
    #    data['year'] = data['event_date'].dt.year
    #    data['month']= data['event_date'].dt.month

        #only include observations >1900
    #    data3=data[data.year >= 1900]
    #    print("length observationas >1900", len(data3.index))
        
        
        # check >10 observations
        if len(data.index)>=10:
            #save to csv
            data.to_csv(file_dir+'/modified_data/gbif_filtered/%s_filtered_data.csv'%spec)
            taxa_list.write(spec+"\n")
            
#close text file
taxa_list.close()
# next species!

processing species Citharichthys_sordidus
length only including lon-lat 2 decimals 27192
length unique lon-lat 6042
processing species Engraulis_mordax
length only including lon-lat 2 decimals 1958
length unique lon-lat 1588
processing species Paralichthys_californicus
length only including lon-lat 2 decimals 415
length unique lon-lat 403
processing species Scomber_japonicus
length only including lon-lat 2 decimals 9777
length unique lon-lat 2783
processing species Thunnus_alalunga
length only including lon-lat 2 decimals 24629
length unique lon-lat 14749
processing species Xiphias_gladius
length only including lon-lat 2 decimals 79872
length unique lon-lat 50678
