In [1]:
!pip install folium



In [2]:
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.functions import col,isnan,when,count, radians, asin, sin, sqrt, cos,min,year,avg
import pandas as pd
import numpy as np
import folium
from folium.features import DivIcon

In [3]:
# Create (or reuse) the Spark session named 'ML', keep a handle on its
# SparkContext, and limit Spark logging to WARN and above.
spark = (
    SparkSession.builder
    .appName('ML')
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel('WARN')

In [4]:


# Schema for the cleaned eBird CSV: every column is read as a string except
# obsDt, which is parsed as a date.
ebird_schema = types.StructType([
    types.StructField(
        field_name,
        types.DateType() if field_name == "obsDt" else types.StringType(),
    )
    for field_name in (
        "speciesCode",
        "comName",
        "sciName",
        "locId",
        "locName",
        "obsDt",
        "howMany",
        "lat",
        "lng",
        "obsValid",
        "obsReviewed",
        "locationPrivate",
        "subId",
    )
])

# Load the eBird observations from cloud storage using the explicit schema.
ebird = (
    spark.read
    .format('csv')
    .schema(ebird_schema)
    .load("gs://big-data-1-project-storage/cleaned-data/ebird_nonull.csv")
)


In [5]:
# Schema of the joined dataset, listed as (column name, Spark type) pairs in
# file order; each pair becomes one StructField.
schema = types.StructType([
    types.StructField(field_name, field_type())
    for field_name, field_type in (
        ("speciesCode", types.StringType),
        ("comName", types.StringType),
        ("sciName", types.StringType),
        ("locId", types.StringType),
        ("locName", types.StringType),
        ("obsDt", types.DateType),
        ("howMany", types.StringType),
        ("lat", types.FloatType),
        ("lng", types.FloatType),
        ("ebird_id", types.StringType),
        ('station_id', types.StringType),
        ('date', types.StringType),
        ('PRCP', types.FloatType),
        ('SNOW', types.FloatType),
        ('SNWD', types.FloatType),
        ('TMIN', types.FloatType),
        ('TMAX', types.FloatType),
        ("Latitude", types.FloatType),
        ("Longitude", types.FloatType),
        ("Elevation", types.FloatType),
        ("State", types.StringType),
        ("date_final", types.DateType),
        ("min_dist", types.FloatType),
    )
])

# Read the joined CSV data and keep only rows that have a latitude.
inputs = "gs://big-data-1-project-storage/cleaned-data/joined-data-final"
joined = (
    spark.read
    .format('csv')
    .schema(schema)
    .load(inputs)
    .filter(col('lat').isNotNull())
)

In [6]:
# joined.show()

In [7]:
# Show the column names of the joined Spark DataFrame.
joined.columns

['speciesCode',
 'comName',
 'sciName',
 'locId',
 'locName',
 'obsDt',
 'howMany',
 'lat',
 'lng',
 'ebird_id',
 'station_id',
 'date',
 'PRCP',
 'SNOW',
 'SNWD',
 'TMIN',
 'TMAX',
 'Latitude',
 'Longitude',
 'Elevation',
 'State',
 'date_final',
 'min_dist']

In [8]:
# Convert the filtered Spark DataFrame to a pandas DataFrame for the pandas-based steps.
joined_pandas = joined.toPandas()

In [9]:
def feature_engineering(df):
    """Clean the observations and derive calendar features.

    Rows containing any missing value are dropped, ``howMany`` is cast to
    ``int64`` and ``lat``/``lng`` to ``float64``, ``obsDt`` is parsed to
    datetime, and ``year``, ``month``, ``day`` and ``seasons`` columns are
    added. The number of distinct ``speciesCode`` values is printed.

    Args:
        df: pandas DataFrame with at least the columns ``speciesCode``,
            ``obsDt``, ``howMany``, ``lat`` and ``lng``.

    Returns:
        A new DataFrame with the cleaned rows and the added columns. The
        frame passed in is not modified.
    """
    season_by_month = {
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Fall', 10: 'Fall', 11: 'Fall',
        12: 'Winter', 1: 'Winter', 2: 'Winter',
    }

    # dropna() without inplace=True leaves the caller's frame intact, and
    # astype() returns a fresh frame that the column assignments below fill.
    cleaned = df.dropna().astype(
        {'howMany': 'int64', 'lat': 'float64', 'lng': 'float64'}
    )
    cleaned['obsDt'] = pd.to_datetime(cleaned['obsDt'])

    cleaned['year'] = cleaned['obsDt'].dt.year
    cleaned['month'] = cleaned['obsDt'].dt.month
    cleaned['day'] = cleaned['obsDt'].dt.day
    print("Count of unique bird species:  " + str(cleaned['speciesCode'].nunique()))

    # A direct month -> season lookup avoids np.select's implicit integer
    # default (0), which does not share a dtype with the string labels.
    cleaned['seasons'] = cleaned['month'].map(season_by_month)

    return cleaned

In [10]:
# Run the cleaning / feature step; the returned frame replaces joined_pandas.
joined_pandas = feature_engineering(joined_pandas)

Count of unique bird species:  549


In [11]:
# Total individuals counted per species, year and location.
# 'howMany' is selected before aggregating so only that column is summed;
# summing every column first is wasteful and breaks on the date columns in
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
# NOTE(review): the recorded head() output shows address-like text in
# speciesCode / locId — looks like column-shifted rows in the input; confirm upstream.
grouped_loc_species = (
    joined_pandas
    .groupby(['speciesCode', 'year', 'locId'])['howMany']
    .sum()
    .reset_index()
)
grouped_loc_species.head(2)

Unnamed: 0,speciesCode,year,locId,howMany
0,1441 Granite Road,2019,CA (49.484,3
1,729 Finlayson Arm Road,2019,CA (48.494,1


In [12]:
# For each species, find the first and the last year that has an entry.
# The aggregations are named by string ('max' / 'min') instead of passing
# np.max / np.min, which newer pandas versions warn about.
test = (
    grouped_loc_species
    .groupby("speciesCode")
    .agg(max_year=('year', 'max'), min_year=('year', 'min'))
    .reset_index()
)

# Keep species whose latest entry is recent (2019-2021) and whose first and
# last years are at least 10 years apart, since changes in location patterns
# due to weather would take a few years to show.
test = test[
    test['max_year'].isin([2019, 2020, 2021])
    & (test['max_year'] - test['min_year'] >= 10)
]

# Pivot to one row per species and one column per year, holding the summed
# count. 'howMany' is selected before sum() so no other column is aggregated.
moving_birds_year_wise_pivot = (
    joined_pandas[joined_pandas['speciesCode'].isin(test['speciesCode'].unique())]
    .groupby(['speciesCode', 'year'])['howMany']
    .sum()
    .reset_index()
    .pivot(index='speciesCode', columns='year', values='howMany')
    .reset_index()
)

# Display only (the result is not stored): species that have a value for 2021.
moving_birds_year_wise_pivot[moving_birds_year_wise_pivot[2021].notna()]

year,speciesCode,1959,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,acowoo,,,,,,,,,,...,3.0,,12.0,,,1.0,,,,2.0
1,aldfly,,,,,,,,,,...,162.0,109.0,154.0,154.0,108.0,103.0,90.0,148.0,141.0,127.0
2,ambduc,,,,,,,,,,...,,12.0,30.0,19.0,16.0,15.0,12.0,1.0,50.0,81.0
3,ameavo,,,,,,,,,8.0,...,129.0,392.0,656.0,533.0,241.0,202.0,16.0,208.0,156.0,348.0
4,amebit,,,,,,1.0,2.0,,,...,59.0,67.0,72.0,92.0,84.0,112.0,39.0,81.0,67.0,56.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415,yebloo,,,,,,,,,,...,20.0,42.0,54.0,33.0,36.0,38.0,31.0,62.0,51.0,21.0
416,yebsap,,,,,,,,,,...,43.0,94.0,79.0,135.0,146.0,141.0,37.0,155.0,175.0,118.0
417,yehbla,,,,,3.0,24.0,14.0,,8.0,...,948.0,1563.0,814.0,998.0,940.0,1451.0,608.0,1265.0,1155.0,1133.0
419,yelwar,1.0,,,9.0,1.0,26.0,13.0,7.0,7.0,...,244.0,200.0,254.0,247.0,260.0,274.0,130.0,190.0,294.0,190.0


In [13]:
# Example species 'whrsan': list its per-location counts for an earlier year
# (2012) and the most recent year (2021).
grouped_loc_species[
    (grouped_loc_species['speciesCode'] == 'whrsan')
    & grouped_loc_species['year'].isin([2012, 2021])
]

Unnamed: 0,speciesCode,year,locId,howMany
581863,whrsan,2012,L1003336,3
581876,whrsan,2021,L341490,41
581877,whrsan,2021,L348443,2
581878,whrsan,2021,L853071,1


In [14]:
# Per species / year / location (with coordinates): total count and median
# SNOW value, then narrowed to 'whrsan' in 2012 and 2021 for plotting.
ebird_plotting_location = (
    joined_pandas
    .groupby(['speciesCode', 'year', 'locId', 'lat', 'lng'])
    .agg(howMany=('howMany', 'sum'), SNOW=('SNOW', 'median'))
    .reset_index()
    .loc[
        lambda per_loc: (per_loc['speciesCode'] == 'whrsan')
        & per_loc['year'].isin([2012, 2021])
    ]
)

In [15]:
# Keep the highest-count location of each year (ties broken by locId,
# descending). Taking the top row per year, rather than the top two rows
# overall, guarantees that every year present is represented, so the
# year-to-year comparison cannot collapse into two rows from the same year.
ebird_plotting_location = (
    ebird_plotting_location
    .sort_values(by=['howMany', 'locId'], ascending=False)
    .groupby('year', sort=False)
    .head(1)
)
ebird_plotting_location

Unnamed: 0,speciesCode,year,locId,lat,lng,howMany,SNOW
581876,whrsan,2021,L341490,56.270557,-120.776665,41,6.5
581863,whrsan,2012,L1003336,49.57523,-115.662537,3,11.8


In [16]:
# Share of 'whrsan' records per season. In the recorded output most fall in
# summer; read together with the two plotted locations, the bird appears to
# have moved from an area with more snowfall to one with less.
joined_pandas.loc[joined_pandas['speciesCode'] == 'whrsan', 'seasons'].value_counts(normalize=True)

Summer    0.627119
Spring    0.288136
Fall      0.084746
Name: seasons, dtype: float64

In [17]:
# Monthly 'whrsan' counts at the two plotted locations in 2012 and 2021.
# 'howMany' is selected before sum() so that only this column is aggregated
# (summing all columns fails on the date columns in pandas >= 2.0).
joined_pandas[
    (joined_pandas['speciesCode'] == 'whrsan')
    & joined_pandas['year'].isin([2012, 2021])
    & joined_pandas['locId'].isin(['L341490', 'L1003336'])
].groupby(['year', 'month'])['howMany'].sum()

year  month
2012  8         3
2021  6        41
Name: howMany, dtype: int64

In [18]:
# Map of the selected 'whrsan' locations: one circle per row of
# ebird_plotting_location, coloured by year and sized by the bird count.
bc_map = folium.Map(location=[50,-120],zoom_start=7)
map_title = 'White-rumped Sandpiper(whrsan)'
title_html = '''
             <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(map_title)
bc_map.get_root().html.add_child(folium.Element(title_html))
color_map = { 2012: 'red', 2021: 'green'}

# The loop variables use distinct names so they do not overwrite `count`
# (imported from pyspark.sql.functions) or reuse the title variable.
for obs_year, obs_lat, obs_lng, bird_count, snow_median in zip(
        ebird_plotting_location['year'],
        ebird_plotting_location['lat'],
        ebird_plotting_location['lng'],
        ebird_plotting_location['howMany'],
        ebird_plotting_location['SNOW']):
    label = folium.Popup(
        str(obs_year) + ', Count:' + str(bird_count) + ', Snow(mm):' + str(snow_median),
        parse_html = True)
    marker_color = color_map[obs_year]
    folium.CircleMarker(
        [obs_lat, obs_lng],
        radius = bird_count*1.5,  # circle size grows with the count
        popup = label,
        color = marker_color,
        fill = True,
        fill_color = marker_color,
        fill_opacity = 0.7).add_to(bc_map)

bc_map