In [None]:
import os
import pandas as pd
import helper
from datetime import date
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Upload wildlife_newdata

In [None]:
# Load the cleaned wildlife export through the project-local helper.
# NOTE(review): `helper.upload_csv` is not visible here — presumably the second
# argument lists columns to parse as datetimes (`date_time` is later used with
# the `.dt` accessor, so it must arrive as a datetime dtype); confirm in helper.py.
cols_date = ["entered_date", "date_time"]
newdata = helper.upload_csv("cleaned_wildlife_newdata.csv", cols_date)

In [None]:
# Quick structural overview of the loaded frame before any cleaning.
newdata.info()

# Check for duplicates

In [None]:
# cols_to_check = ["image_name", "date_time"]
# dfn = (newdata[newdata.duplicated(subset=cols_to_check, keep=False)]
#        .iloc[:,0:18]
#        .sort_values(by=cols_to_check)
#        .tail(51)
# )
# dfn

## is record_number unique? no
## is image_name unique? no
## is image_name, date_time unique? no

In [None]:
# Treat (image_name, date_time) as the primary key: rows that repeat the same
# pair are dropped (pandas keeps the first occurrence by default) and the
# index is renumbered so it stays contiguous.
cols_to_check = ["image_name", "date_time"]
newdata = newdata.drop_duplicates(subset=cols_to_check, ignore_index=True)

# Subset wildlife data

In [None]:
# Columns carried forward: the image key and timestamp, the two environmental
# readings, the category columns, and one column per animal type.
cols = [
    "image_name", "date_time",
    "temperature", "moon_phase",
    "species_category", "species_category_2",
    "carnivore", "herbivore", "insectivore", "omnivore", "bird",
]
newdata = newdata.loc[:, cols]

In [None]:
## check whether "species_category" is reliable to identify animal types
# newdata[newdata["species_category"] == "carnivore"]["herbivore"].value_counts()

In [None]:
# check whether there are images that identify more than one type of animals
# carni = newdata[~newdata["carnivore"].isnull()]
# carni[~carni["herbivore"].isnull()]

In [None]:
# The category columns are not read by any later cell shown in this notebook;
# classification below relies on the per-type columns (carnivore, herbivore, ...).
newdata = newdata.drop(columns=["species_category", "species_category_2"])

In [None]:
# Rows without a timestamp cannot yield the date/hour/year features derived next.
newdata = newdata.dropna(subset=["date_time"])

In [None]:
# Split the timestamp into the calendar date, hour of day and year used for
# grouping and filtering further down.
newdata = newdata.assign(
    image_date=newdata["date_time"].dt.date,
    image_hr=newdata["date_time"].dt.hour,
    image_yr=newdata["date_time"].dt.year,
)

In [None]:
# Remove rows stamped 1900-01-07 or falling in year 2029, and convert
# `image_date` from Python `date` objects to a pandas datetime column.
# NOTE(review): both values look like bad camera-clock readings — confirm with
# the data owner why exactly these two are excluded.
newdata = (
    newdata.loc[lambda frame: frame["image_date"] != date(1900, 1, 7)]
    .assign(image_date=lambda frame: pd.to_datetime(frame["image_date"]))
    .loc[lambda frame: frame["image_yr"] != 2029]
)

In [None]:
# Inspect the cleaned frame after de-duplication, subsetting and date filtering.
newdata

# Reshape data

In [None]:
# Reshape on an independent copy so `newdata` itself stays as cleaned above.
newdata_tidy = newdata.copy()

In [None]:
def classify_animals(row):
    """Return the name of the first animal-type column that is filled in.

    Columns are checked in the fixed order carnivore, herbivore, insectivore,
    omnivore, bird; when several are populated the earliest one wins.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must be indexable by the five animal-type column names.

    Returns
    -------
    str or pd.NA
        The matching column name, or ``pd.NA`` when all five are missing.
    """
    priority = ("carnivore", "herbivore", "insectivore", "omnivore", "bird")
    for kind in priority:
        if pd.notna(row[kind]):
            return kind
    return pd.NA


def map_animals(row):
    """Return the value stored in the first animal-type column that is filled in.

    Uses the same column order as the type classification: carnivore,
    herbivore, insectivore, omnivore, bird.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must be indexable by the five animal-type column names.

    Returns
    -------
    object or pd.NA
        The cell value of the first non-missing column, or ``pd.NA`` when all
        five are missing.
    """
    priority = ("carnivore", "herbivore", "insectivore", "omnivore", "bird")
    for kind in priority:
        value = row[kind]
        if pd.notna(value):
            return value
    return pd.NA

In [None]:
# Collapse the five per-type columns into two tidy columns: the animal type
# and the value recorded for that type. Both helpers are applied row by row.
newdata_tidy["anim_type"] = newdata_tidy.apply(classify_animals, axis=1)
newdata_tidy["anim_spotted"] = newdata_tidy.apply(map_animals, axis=1)

In [None]:
# newdata[newdata["anim_spotted"].isna()]
# Keep only rows where some animal was recorded, then discard the wide
# per-type columns that `anim_type` / `anim_spotted` now replace.
cols_to_drop = ["carnivore", "herbivore", "insectivore", "omnivore", "bird"]
newdata_tidy = newdata_tidy.dropna(subset="anim_spotted").drop(columns=cols_to_drop)
newdata_tidy

In [None]:
# Reduce each label to the bare species name: skip the first four characters,
# remove the word "spotted" and trim surrounding whitespace.
# NOTE(review): the 4-character offset presumably strips a fixed prefix in the
# raw labels — confirm against the source data. Re-running this cell strips
# another four characters, so run it once per kernel session.
newdata_tidy["anim_spotted"] = newdata_tidy["anim_spotted"].map(
    lambda label: label[4:].replace("spotted", "").strip()
)

In [None]:
# Preview the first rows of the tidy frame after label clean-up.
newdata_tidy.head()

## Export tidy data

In [None]:
# Output location, relative to the notebook's working directory.
SAVE_DIR = "../data"
FILE_NAME = "tidy_wildlife_newdata.csv"
path = os.path.join(SAVE_DIR, FILE_NAME)

# Create the folder if needed, then write the tidy frame without the index;
# datetime columns are serialised as "YYYY-MM-DD HH:MM:SS".
os.makedirs(SAVE_DIR, exist_ok=True)
newdata_tidy.to_csv(path, date_format="%Y-%m-%d %H:%M:%S", index=False)

# Animal activities: Overview

In [None]:
# Use seaborn's "whitegrid" style for the figures that follow.
sns.set_style("whitegrid")

In [None]:
## Sightings per species over all years, one panel per animal type
subset_anim = ["carnivore", "herbivore", "omnivore"]
g = sns.catplot(
    data=newdata_tidy[newdata_tidy["anim_type"].isin(subset_anim)],
    y="anim_spotted",
    kind="count",
    col="anim_type",
    col_wrap=2,
    sharey=False,  # independent y axis per panel
    color="tab:olive",
)
# g.set_xticklabels(rotation=90)

In [None]:
## Sightings per species restricted to the chosen years, one panel per animal type
subset_anim = ["carnivore", "herbivore", "omnivore"]
subset_year = [2020, 2021, 2022, 2019]
g = sns.catplot(
    data=newdata_tidy[
        newdata_tidy["anim_type"].isin(subset_anim)
        & newdata_tidy["image_yr"].isin(subset_year)
    ],
    y="anim_spotted",
    kind="count",
    col="anim_type",
    col_wrap=2,
    sharex=False,  # independent axes per panel
    sharey=False,
    color="orange",
)

# Animal activities: Comparison

In [None]:
## Side-by-side comparison of two years, one panel per animal type
subset_anim = ["carnivore", "herbivore", "omnivore"]
subset_year = [2020, 2019]
g = sns.catplot(
    data=newdata_tidy[
        newdata_tidy["anim_type"].isin(subset_anim)
        & newdata_tidy["image_yr"].isin(subset_year)
    ],
    y="anim_spotted",
    hue="image_yr",  # one bar colour per compared year
    kind="count",
    col="anim_type",
    col_wrap=2,
    sharex=False,
    sharey=False,
    palette=sns.color_palette("colorblind", n_colors=2),
)

# Animal activities by moon phase

## Generate df to download

In [None]:
# Sightings of each herbivore species per moon phase, as a flat table.
subset_anim = ["herbivore"]
df = (
    newdata_tidy.query("anim_type in @subset_anim")
    .groupby("moon_phase")["anim_spotted"]
    .value_counts()
    # Name the counts column explicitly: the plots below read a "count"
    # column, and only pandas >= 2.0 gives the value_counts result that name.
    .to_frame(name="count")
)
df_tidy = df.reset_index()
df_tidy.head()

In [None]:
# Plot the per-species counts against moon phase. No `kind` is passed, so
# seaborn's default categorical plot is used; each species gets its own hue.
# Reads the `count` column of `df_tidy` built in the previous cell.
g = sns.catplot(
    data=df_tidy,
    x="count",
    y="moon_phase",
    hue="anim_spotted",
    palette=sns.color_palette("colorblind"),
)

In [None]:
def anim_df_to_download(df_raw: pd.DataFrame, sub_anim, sub_yr, group) -> pd.DataFrame:
    """Count sightings per species within each value of ``group``.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Tidy sightings table; must contain ``anim_type``, ``image_yr``,
        ``anim_spotted`` and the column(s) named by ``group``.
    sub_anim : list-like
        Animal types to keep (matched against ``anim_type``).
    sub_yr : list-like
        Years to keep (matched against ``image_yr``).
    group : str or list of str
        Column(s) to group by before counting species.

    Returns
    -------
    pd.DataFrame
        Flat table with the ``group`` column(s), ``anim_spotted`` and a
        ``count`` column holding the number of matching rows.
    """
    selected = df_raw[
        df_raw["anim_type"].isin(sub_anim) & df_raw["image_yr"].isin(sub_yr)
    ]
    return (
        selected.groupby(group)["anim_spotted"]
        .value_counts()
        # Callers plot the "count" column; name it explicitly rather than
        # relying on the default result name, which changed in pandas 2.0.
        .reset_index(name="count")
    )

In [None]:
# Same moon-phase table as above, now also restricted to the listed years and
# built through the reusable helper.
subset_anim = ["herbivore"]
subset_year = [2020, 2019, 2021, 2022]
df = anim_df_to_download(newdata_tidy, subset_anim, subset_year, group="moon_phase")
df.head(15)

In [None]:
# subset_anim = ["herbivore"]
# subset_year = [2020, 2019, 2021]
# df = newdata_tidy.query('anim_type in @subset_anim and image_yr in @subset_year').groupby("moon_phase")["anim_spotted"].value_counts().to_frame()
# df_tidy = df.reset_index()
# df_tidy.head(25)

In [None]:
# Plot the helper's output: counts on x, moon phase on y, one hue per species.
# No `kind` is passed, so seaborn's default categorical plot is used.
g = sns.catplot(
    data=df,
    x="count",
    y="moon_phase",
    hue="anim_spotted",
    palette=sns.color_palette("colorblind"),
)

## Generate heatmap

In [None]:
# Species x moon-phase contingency table of herbivore sightings for the
# selected years, drawn as an annotated heatmap with integer cell labels.
subset_anim = ["herbivore"]
subset_year = [2020, 2019, 2021, 2022]
df = newdata_tidy[
    newdata_tidy["anim_type"].isin(subset_anim)
    & newdata_tidy["image_yr"].isin(subset_year)
]
df_crosstab = pd.crosstab(index=df["anim_spotted"], columns=df["moon_phase"])
sns.heatmap(df_crosstab, annot=True, fmt="d", cmap="YlGnBu", linewidths=0.15)

# Animal activities by time of day

## Generate df to download

In [None]:
# Herbivore sightings per species for each hour of the day.
# NOTE(review): this year list omits 2022, unlike the moon-phase cells above —
# confirm whether that is intentional.
subset_anim = ["herbivore"]
subset_year = [2020, 2019, 2021]
df = anim_df_to_download(newdata_tidy, subset_anim, subset_year, group="image_hr")
df.head()

In [None]:
# Number of sightings per animal type for each hour of the day.
subset_anim = ["herbivore", "carnivore", "omnivore"]
subset_year = [2020, 2019, 2021]
df = (
    newdata_tidy.query("anim_type in @subset_anim and image_yr in @subset_year")
    .groupby(["image_hr"])["anim_type"]
    .value_counts()
    # Name the counts column explicitly: the line plot reads a "count"
    # column, and only pandas >= 2.0 gives the value_counts result that name.
    .to_frame(name="count")
)
df_tidy = df.reset_index()
df_tidy.head()

In [None]:
# Switch to the "ticks" style; this also applies to the figures drawn later.
sns.set_style("ticks")
# Hourly sighting counts as lines; `anim_type` drives both colour and line
# style, so the types stay distinguishable without colour.
g = sns.relplot(
    data=df_tidy,
    x="image_hr",
    y="count",
    kind="line",
    hue="anim_type",
    style="anim_type",
)

# Animal activities by date

In [None]:
# Daily number of sightings per animal type for a single year, one panel each.
subset_anim = ["herbivore", "carnivore", "omnivore"]
subset_year = [2020]
df = (
    newdata_tidy.query("anim_type in @subset_anim and image_yr in @subset_year")
    .groupby(["image_date"])["anim_type"]
    .value_counts()
    # Name the counts column explicitly: the scatter plot reads a "count"
    # column, and only pandas >= 2.0 gives the value_counts result that name.
    .to_frame(name="count")
)
df_tidy = df.reset_index()
g = sns.relplot(
    data=df_tidy,
    x="image_date",
    y="count",
    kind="scatter",
    markers=True,
    alpha=0.5,
    col="anim_type",
    hue="anim_type",
    facet_kws=dict(sharey=False),  # independent y axis per panel
)
(
    g.set_axis_labels("Date", "Count of species spotted").set_titles(
        "Animal type: {col_name}"
    )
)

# Animal activities by temperature

In [None]:
# Number of sightings per animal type at each recorded temperature value.
subset_anim = ["herbivore", "carnivore", "omnivore"]
subset_year = [2019, 2020, 2021]
df = (
    newdata_tidy.query("anim_type in @subset_anim and image_yr in @subset_year")
    .groupby(["temperature"])["anim_type"]
    .value_counts()
    # Name the counts column explicitly: the scatter plot reads a "count"
    # column, and only pandas >= 2.0 gives the value_counts result that name.
    .to_frame(name="count")
    .reset_index()
)
df

In [None]:
# Sightings against temperature, one panel per animal type. The legend is
# turned off because the panel titles already name the type.
g = sns.relplot(
    data=df,
    x="temperature",
    y="count",
    kind="scatter",
    hue="anim_type",
    col="anim_type",
    col_wrap=2,
    palette=sns.color_palette("colorblind"),
    legend=False,
)
(
    # Axis label spelling corrected ("Celcius" -> "Celsius").
    g.set_axis_labels("Temperature (Celsius)", "Count of species spotted").set_titles(
        "Animal type: {col_name}"
    )
)