In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt
import plotly.express as px 

# NOTE(review): absolute, machine-specific path -- anyone else running this
# notebook has to edit it to point at their own copy of the CSV.
PFW_CSV_PATH = '/Users/bkirton/Desktop/python/ipynb/tidy_tuesday/data_sources/PFW_2021_public.csv'
SUBSAMPLE_SIZE = 100000   # number of rows kept in the working subsample
SUBSAMPLE_SEED = 424242   # fixed seed so the subsample is reproducible

PFW_2021_public = pd.read_csv(PFW_CSV_PATH)

# There are almost three million rows! The file is too big for github, let's subsample.
# The seed is passed straight to sample() instead of seeding NumPy's global RNG,
# and n is capped at the row count because sample() raises ValueError when asked
# for more rows than exist (without replacement).
PFW_2021_public_subset = PFW_2021_public.sample(
    n=min(SUBSAMPLE_SIZE, len(PFW_2021_public)),
    random_state=SUBSAMPLE_SEED,
)

# Preview the subsample. A bare expression is only rendered when it is the
# last statement of a cell, so the preview lives here at the end.
PFW_2021_public_subset.head()




In [None]:
# Summarise the subsample: DataFrame.info() prints each column's name,
# non-null count and dtype, plus the frame's memory usage.
PFW_2021_public_subset.info()

In [None]:
# Tally the null (NaN/None) entries found in every column of the subsample
# and print the per-column totals.
missing_values = PFW_2021_public_subset.isnull().sum(axis=0)
print(missing_values)

In [None]:


# A (latitude, longitude) pair must occur in MORE than this many rows of the
# subsample to be drawn on the map.
MIN_SIGHTINGS_PER_SITE = 15

# Count how many rows of the subsample share each latitude/longitude pair.
site_counts = (
    PFW_2021_public_subset
    .groupby(['latitude', 'longitude'])
    .size()
    .reset_index(name='count')
)

# Keep only the sites whose row count exceeds the threshold.
freq = site_counts[site_counts['count'] > MIN_SIGHTINGS_PER_SITE]

# One marker per remaining site; both marker colour and size encode the count.
fig = px.scatter_mapbox(
    freq,
    lat="latitude",
    lon="longitude",
    hover_name="count",
    zoom=8,
    color='count',
    size='count',
)

# OpenStreetMap tiles, with all figure margins removed.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0}
)

# show the map
fig.show()

In [None]:
# Tally the rows recorded under each subnational1_code, largest tally first.
state_counts = (
    PFW_2021_public_subset
    .groupby("subnational1_code")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

# Bar chart of the tallies with reader-friendly axis labels.
fig = px.bar(
    state_counts,
    x='subnational1_code',
    y='count',
    title='Bird Watching Popularity by State',
    labels=dict(subnational1_code='State', count='Number of Sightings'),
)

# Tilt the x tick labels by 45 degrees.
fig.update_layout(xaxis_tickangle=-45)

fig.show()

In [None]:
# Time series: number of sightings recorded in each month of each year.

# Count the non-null obs_id entries per (Year, Month) pair and flatten the
# grouped result into a three-column frame: Year, Month, Sightings.
sightings_by_month = (
    PFW_2021_public_subset
    .groupby(["Year", "Month"])["obs_id"]
    .count()
    .reset_index(name="Sightings")
)

# One line per year, months along the x axis.
fig = px.line(
    sightings_by_month,
    x="Month",
    y="Sightings",
    color="Year",
    title="Bird Sightings by Month and Year",
)
fig.show()

In [None]:
# Species diversity: how many distinct species codes were recorded per state.

# Discard the rows whose subnational1_code is exactly 'XX-' or 'PM-'.
# (This rebinds PFW_2021_public_subset to the filtered frame.)
is_excluded_code = PFW_2021_public_subset['subnational1_code'].isin(['XX-', 'PM-'])
PFW_2021_public_subset = PFW_2021_public_subset[~is_excluded_code]

# Distinct species per state code, relabelled with presentation-friendly
# column names.
species_counts = (
    PFW_2021_public_subset
    .groupby("subnational1_code")["species_code"]
    .nunique()
    .reset_index()
    .rename(columns={"species_code": "Species Count", 'subnational1_code': 'state'})
)

# Highest species count first, so the bars are drawn in descending order.
species_counts_sorted = species_counts.sort_values(by='Species Count', ascending=False)

# Bar per state; bar colour repeats the species count.
fig = px.bar(
    species_counts_sorted,
    x="state",
    y="Species Count",
    title="Number of Unique Bird Species Observed by State",
    color='Species Count',
)
fig.show()

In [None]:
#Bird density: You can calculate the number of bird sightings per square mile for each state and create a choropleth map to see which areas have the highest bird densities.

#Habitat preferences: You can investigate which habitats are preferred by different bird species. For example, you could create scatter plots of bird sightings against vegetation density or water availability.

#Migration patterns: You can identify which bird species are migratory and create maps to show their migration routes.

# TODO: download the site description field details CSV -- the ideas above depend on it.

In [None]:
# Keep only the US sites and reduce their codes to the bare state abbreviation
# ("US-TX" -> "TX"), which is what plotly's "USA-states" location mode matches.
# NOTE: this deliberately rebinds PFW_2021_public_subset -- the species
# choropleth later in the notebook reads the stripped codes from it.
US_PREFIX = "US-"
has_us_prefix = PFW_2021_public_subset["subnational1_code"].str.startswith(US_PREFIX, na=False)

# Guard against re-running the cell: once the prefix has been stripped no code
# starts with "US-" any more, so filtering again would throw away every row.
# (If no prefixed code is present at all, the frame is left untouched.)
if has_us_prefix.any():
    # .copy() makes the filtered frame independent of its parent, so the column
    # assignment below does not trigger SettingWithCopyWarning.
    PFW_2021_public_subset = PFW_2021_public_subset.loc[has_us_prefix].copy()
    # Every remaining code starts with the prefix, so slicing removes exactly
    # the prefix and nothing else.
    PFW_2021_public_subset["subnational1_code"] = (
        PFW_2021_public_subset["subnational1_code"].str[len(US_PREFIX):]
    )

# Group the data by state and count the number of sightings
sightings_by_state = (
    PFW_2021_public_subset
    .groupby("subnational1_code")["obs_id"]
    .count()
    .reset_index()
    .rename(columns={"obs_id": "Sightings", "subnational1_code": "State"})
)

# Create the choropleth map
fig = px.choropleth(sightings_by_state, locations="State", locationmode="USA-states", color="Sightings",
                    scope="usa", title="Bird Sightings by State")
fig.show()

In [None]:
# Distinct species codes recorded under each state code, with the columns
# relabelled for plotting.
species_counts = (
    PFW_2021_public_subset
    .groupby("subnational1_code")["species_code"]
    .nunique()
    .reset_index()
    .rename(columns={"species_code": "Species Count", 'subnational1_code': 'state'})
)

# US choropleth shaded by the per-state species count.
fig = px.choropleth(
    species_counts,
    locations="state",
    locationmode="USA-states",
    color="Species Count",
    scope="usa",
    title="Number of Unique Bird Species Observed by State",
)
fig.show()

In [None]:
# Subset the grackle sightings from the full (un-sampled) dataset.
# NOTE(review): this is a plain substring match, so ANY species code that
# contains "gra" is kept -- check the unique codes in the result to confirm
# that only grackles come through.
# na=False treats missing species codes as non-matches (a NaN in the mask would
# make the boolean indexing fail); .copy() yields an independent frame so that
# adding columns to grackle_df later does not raise SettingWithCopyWarning.
is_grackle = PFW_2021_public['species_code'].str.contains('gra', regex=False, na=False)
grackle_df = PFW_2021_public.loc[is_grackle].copy()

In [None]:
# Row count of the grackle subset = total number of grackle sightings.
total_sightings = grackle_df.shape[0]

# Mean position of the grackle sightings.
avg_latitude = grackle_df['latitude'].mean()
avg_longitude = grackle_df['longitude'].mean()

# Print each summary statistic on its own line as "<label> <value>".
for label, value in (
    ("Total Grackle Sightings:", total_sightings),
    ("Average Latitude:", avg_latitude),
    ("Average Longitude:", avg_longitude),
):
    print(label, value)

In [None]:
# plotly's 'USA-states' location mode matches bare state abbreviations ("TX"),
# whereas grackle_df still carries prefixed codes ("US-TX"). Keep the US rows
# and strip the prefix before grouping, otherwise no state is shaded.
US_PREFIX = 'US-'
us_grackles = grackle_df[grackle_df['subnational1_code'].str.startswith(US_PREFIX, na=False)]
state_abbrev = us_grackles['subnational1_code'].str[len(US_PREFIX):]

# Group grackle sightings by state abbreviation and count the occurrences
state_sightings = us_grackles.groupby(state_abbrev)['obs_id'].count().reset_index()
state_sightings.columns = ['State', 'Grackle Sightings']

# Create the choropleth map
fig = px.choropleth(state_sightings, locations='State', locationmode='USA-states', color='Grackle Sightings',
                    scope='usa', title='Grackle Sightings by State')

# Show the map
fig.show()

In [None]:
# Build a string Date column (YYYY-MM-DD) from the Year/Month/Day columns.
# assign() returns a new frame, so this works without SettingWithCopyWarning
# even though grackle_df was produced by boolean indexing, and re-running the
# cell simply overwrites the column with the same values.
grackle_df = grackle_df.assign(
    Date=pd.to_datetime(grackle_df[['Year', 'Month', 'Day']]).dt.strftime('%Y-%m-%d')
)

# Sort chronologically (YYYY-MM-DD strings sort in date order) so the
# animation frames are encountered in date order.
grackle_df_sorted = grackle_df.sort_values(by='Date')

# Create an animated scatter plot: one frame per date.
# color_continuous_scale was dropped: no `color` column is mapped here, so
# there was nothing for the colour scale to act on.
fig = px.scatter_mapbox(grackle_df_sorted, lat="latitude", lon="longitude", animation_frame="Date",
                        hover_name="obs_id",
                        title="Grackle Movement")

# Set mapbox style and centre the map on the mean sighting position
mean_position = {"lat": grackle_df['latitude'].mean(), "lon": grackle_df['longitude'].mean()}
fig.update_layout(mapbox_style="carto-positron", mapbox_center=mean_position, mapbox_zoom=4)

# Show the plot
fig.show()

In [None]:
# Filter grackle sightings for Texas
grackle_df_texas = grackle_df[grackle_df['subnational1_code'] == 'US-TX']

# The Date column holds strings, which plotly express treats as categories
# (one legend entry per date; color_continuous_scale is ignored). Colour by a
# numeric day offset instead so the Viridis scale actually applies. The offset
# lives on a plotting copy, leaving grackle_df_texas itself unchanged.
texas_dates = pd.to_datetime(grackle_df_texas['Date'])
texas_plot_df = grackle_df_texas.assign(
    days_since_first=(texas_dates - texas_dates.min()).dt.days
)

# Create a scatter plot; the calendar date stays available in the hover box
fig = px.scatter_mapbox(texas_plot_df, lat="latitude", lon="longitude", color="days_since_first",
                        hover_name="obs_id", hover_data=["Date"],
                        color_continuous_scale="Viridis",
                        labels={"days_since_first": "Days since first sighting"},
                        title="Grackle Sightings in Texas")
fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=6, mapbox_center={"lat": 31.9686, "lon": -99.9018})

# Show the plot
fig.show()

In [None]:
# Rows of grackle_df whose subnational1_code equals 'US-TX'
is_texas_row = grackle_df['subnational1_code'] == 'US-TX'
grackle_df_texas = grackle_df[is_texas_row]

# Density layer over the carto-positron basemap, centred on fixed coordinates
fig = px.density_mapbox(
    grackle_df_texas,
    lat="latitude",
    lon="longitude",
    radius=10,
    center=dict(lat=31.9686, lon=-99.9018),
    zoom=6,
    mapbox_style="carto-positron",
    title="Grackle Sighting Density in Texas",
)

# Render the figure
fig.show()

In [None]:

# Sort grackle sightings by date
grackle_df_sorted = grackle_df.sort_values('Date')

# The Date column holds strings, which plotly express treats as categories, so
# a continuous colour scale and range_color cannot act on it. Colour by a
# numeric day offset instead (kept on a plotting copy so grackle_df_sorted
# keeps its original columns).
sorted_dates = pd.to_datetime(grackle_df_sorted['Date'])
timeline_df = grackle_df_sorted.assign(
    days_since_first=(sorted_dates - sorted_dates.min()).dt.days
)

# Create an animated scatter plot. range_color pins the colour axis to the
# whole span of days so colours stay comparable from frame to frame.
fig = px.scatter_mapbox(timeline_df, lat="latitude", lon="longitude", color="days_since_first",
                        hover_name="obs_id", animation_frame="Date",
                        color_continuous_scale="Viridis",
                        range_color=[0, timeline_df['days_since_first'].max()],
                        labels={"days_since_first": "Days since first sighting"},
                        title="Grackle Migrations")
fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=6, mapbox_center={"lat": 31.9686, "lon": -99.9018})

# Show the plot
fig.show()

In [None]:
# view grackle sightings by species

grackle_species = grackle_df['species_code'].unique().tolist()

# The Date column holds strings, which plotly express treats as categories, so
# the Viridis scale would be ignored. Derive a numeric day offset once, measured
# from the earliest sighting across all matched species, and share one colour
# range so the per-species figures are directly comparable.
all_dates = pd.to_datetime(grackle_df['Date'])
grackle_timeline = grackle_df.assign(
    days_since_first=(all_dates - all_dates.min()).dt.days
)
day_range = [0, grackle_timeline['days_since_first'].max()]

# Create a separate animated plot for each grackle species
for species in grackle_species:
    # Filter grackle sightings for the current species
    species_df = grackle_timeline[grackle_timeline['species_code'] == species]

    # Sort sightings by date so the animation frames are in chronological order
    species_df_sorted = species_df.sort_values('Date')

    # Create an animated scatter plot for the current species
    fig = px.scatter_mapbox(species_df_sorted, lat="latitude", lon="longitude", color="days_since_first",
                            hover_name="obs_id", animation_frame="Date",
                            color_continuous_scale="Viridis", range_color=day_range,
                            labels={"days_since_first": "Days since first sighting"},
                            title=f"{species} Migrations")
    fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=6, mapbox_center={"lat": 31.9686, "lon": -99.9018})

    # Show the plot
    fig.show()

In [None]:
# Animation frames follow the order in which Date values first appear in the
# data, so sort chronologically before plotting (grackle_df itself is unsorted).
grackle_by_date = grackle_df.sort_values('Date')

# One animated map for all matched species, coloured by species code.
# NOTE(review): plotly animations are built from the traces present in the
# first frame -- a species with no sighting on the earliest date may not be
# drawn in later frames; verify the rendered output.
fig = px.scatter_mapbox(grackle_by_date, lat="latitude", lon="longitude", color="species_code",
                        hover_name="obs_id", animation_frame="Date",
                        color_discrete_sequence=px.colors.qualitative.Set1,
                        title="Grackle Species Migrations")
fig.update_layout(mapbox_style="carto-positron", mapbox_zoom=6, mapbox_center={"lat": 31.9686, "lon": -99.9018})

# Show the plot
fig.show()

In [None]:
# Collect the distinct species_code values present in grackle_df as a plain list
grackle_species = grackle_df['species_code'].unique().tolist()

In [None]:
# Bare expression: the notebook renders the list as this cell's output
grackle_species