In [None]:
import os
import pandas as pd
import numpy as np
import helper
import datetime

# Load wildlife passage data

In [None]:
# Source workbook, addressed relative to this notebook's directory.
FILE_NAME = "GR merged  CT All with OC Master PDRT  Camera Trap Data Sheet.xlsx"
FILE_PATH = os.path.join("..", "data", FILE_NAME)
# Requesting a list of sheets makes read_excel return a dict keyed by sheet name.
SHEET_NAMES = ["Newdata", "17-22"]
wildlife_raw = pd.read_excel(io=FILE_PATH, sheet_name=SHEET_NAMES)

In [None]:
# Pull the two worksheets out of the dict returned by read_excel.
wl_newdata_raw, wl_1722_raw = wildlife_raw["Newdata"], wildlife_raw["17-22"]

# Clean *newdata*

In [None]:
# wl_newdata_raw.tail()

## Remove irrelevant columns

In [None]:
# wl_newdata_raw.columns

In [None]:
# check for empty columns
# wl_newdata_raw.isnull().sum()

In [None]:
# wl_newdata_raw.shape

In [None]:
# Remove the columns whose header starts with "Unnamed:".
# (helper.drop_cols_with_prefix lives outside this notebook; it is called with
# the frame first and the prefix second.)
wl_newdata_raw = wl_newdata_raw.pipe(helper.drop_cols_with_prefix, "Unnamed:")

In [None]:
# Discard every column that contains nothing but missing values.
wl_newdata_raw = wl_newdata_raw.dropna(axis="columns", how="all")

## Clean up column names

In [None]:
# Strip special characters and trailing spaces from the column names
# (the exact rules are in helper.remove_spaces_in_colnames, not shown here).
wl_newdata_raw = wl_newdata_raw.pipe(helper.remove_spaces_in_colnames)

In [None]:
# wl_newdata_raw.columns

In [None]:
# Explicit renames for headers that contain punctuation or run-together words.
# Keys are the header names as they stand at this point in the notebook; values
# are the final snake_case names. The remaining headers are converted later by
# helper.snake_style_colnames.
col_renames = {
    "Student/Volunteer": "student_or_volunteer",
    "open/closed": "open_or_closed",
    "DateTime": "date_time",
    "AM,PM,ML": "am_pm_ml",
    "Species Category2": "species_category_2",
    "Checker Initials & Date": "checker_initials_and_date",
    "Non-Animal": "non_animal",
}

In [None]:
# Apply the explicit header renames; headers not listed in col_renames are left as-is.
wl_newdata_raw = wl_newdata_raw.rename(mapper=col_renames, axis="columns")

In [None]:
# Convert the remaining headers with helper.snake_style_colnames
# (defined outside this notebook).
wl_newdata_raw = wl_newdata_raw.pipe(helper.snake_style_colnames)

In [None]:
# wl_newdata_raw.columns

## Cleanup by columns

In [None]:
# wl_newdata_raw.info()

### col: record_number

In [None]:
# Use pandas' nullable integer dtype so missing record numbers stay missing
# instead of forcing the column to float.
record_number = wl_newdata_raw["record_number"]
wl_newdata_raw["record_number"] = record_number.astype(pd.Int64Dtype())

In [None]:
# wl_newdata_raw['record_number']

In [None]:
# helper.check_range(wl_newdata_raw['record_number'])

In [None]:
# helper.check_missing_values(wl_newdata_raw['record_number'])

In [None]:
# helper.find_duplicate_rows(wl_newdata_raw,['record_number'])

In [None]:
# record_number_dups = helper.find_duplicate_rows(wl_newdata_raw[['record_number']])
# record_number_dups.sort_values(by='record_number').head(20)

In [None]:
# take a closer look at one of duplicates record number
# wl_newdata_raw[wl_newdata_raw['record_number'] == 1]

### col: camera_trap

In [None]:
# check value consistency
# wl_newdata_raw.camera_trap.unique()

In [None]:
# helper.check_missing_values(wl_newdata_raw['camera_trap'])

In [None]:
# Run every camera_trap value through helper.clean_camera_trap
# (the cleaning rules are defined outside this notebook).
camera_trap = wl_newdata_raw["camera_trap"]
wl_newdata_raw["camera_trap"] = camera_trap.apply(helper.clean_camera_trap)

In [None]:
# wl_newdata_raw['camera_trap'].unique()

### col: entered_date

In [None]:
# wl_newdata_raw.entered_date.info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['entered_date'])

In [None]:
# wl_newdata_raw['entered_date'].unique()

In [None]:
# Pass each entered_date value through helper.remove_nondatetime
# (presumably blanks out entries that are not datetimes; confirm in helper).
entered_date = wl_newdata_raw["entered_date"]
wl_newdata_raw["entered_date"] = entered_date.apply(helper.remove_nondatetime)

In [None]:
# wl_newdata_raw['entered_date'].info() # recheck

### col: student_or_volunteer

In [None]:
# wl_newdata_raw['student_or_volunteer'].info()

In [None]:
# wl_newdata_raw['student_or_volunteer'].unique()

In [None]:
# Short or partial spellings mapped to the full name used for that person.
# Keys are upper-case because the column is upper-cased before the lookup.
names_mapping = {
    "EMELIA": "EMELIA NGWENYA",
    "MACSTALIYN": "MACSTALIYN TABUYA",
    "MACSTALIYN T": "MACSTALIYN TABUYA",
    "JONATHAN": "JONATHAN MTETWA",
}

# Trim whitespace and upper-case first, then collapse the name variants.
observer = wl_newdata_raw["student_or_volunteer"].str.strip().str.upper()
wl_newdata_raw["student_or_volunteer"] = observer.replace(names_mapping)

### col: first_image

In [None]:
# wl_newdata_raw['first_image'].info()

In [None]:
# wl_newdata_raw['first_image'].unique()

In [None]:
# The literal strings "None" / "none" stand for a missing first image.
fimage_mapping = {"None": np.nan, "none": np.nan}

# Trim whitespace, then turn those placeholder strings into real NaN.
first_image = wl_newdata_raw["first_image"].str.strip()
wl_newdata_raw["first_image"] = first_image.replace(fimage_mapping)

### col: open_or_closed

In [None]:
# wl_newdata_raw['open_or_closed'].info()

In [None]:
# wl_newdata_raw['open_or_closed'].unique()

In [None]:
# Trim whitespace and upper-case so the values compare consistently.
open_or_closed = wl_newdata_raw["open_or_closed"].str.strip()
wl_newdata_raw["open_or_closed"] = open_or_closed.str.upper()

### col: image_name

In [None]:
# wl_newdata_raw['image_name'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['image_name'])

In [None]:
# wl_newdata_raw['image_name'].tail(25)

### cols: time_observation and date_observation

`time_observation` and `date_observation` are both derived from `date_time`, so they are redundant. We drop them and keep only the `date_time` column.

In [None]:
# Keep only `date_time`; the separate time/date columns are redundant with it.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
wl_newdata_raw = wl_newdata_raw.drop(
    columns=["time_observation", "date_observation"], errors="ignore"
)

### col: date_time

In [None]:
# wl_newdata_raw['date_time'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['date_time'])

In [None]:
# Pass each date_time value through helper.remove_nondatetime
# (presumably blanks out entries that are not datetimes; confirm in helper).
date_time = wl_newdata_raw["date_time"]
wl_newdata_raw["date_time"] = date_time.apply(helper.remove_nondatetime)

In [None]:
# wl_newdata_raw['date_time'].info()

### col: am_pm_ml

In [None]:
# wl_newdata_raw['am_pm_ml'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['am_pm_ml'])

In [None]:
# wl_newdata_raw['am_pm_ml'].unique()

In [None]:
# Normalise the am_pm_ml codes in three steps.
#
# 1. Integer codes 2 and 3 are turned into strings first; the .str accessor
#    used in step 2 would otherwise turn non-string cells into NaN.
# 2. Whitespace is trimmed and the text upper-cased.
# 3. The transposed code "LM" is corrected to "ML". This runs AFTER step 2 so
#    that variants such as "lm" or " LM " are corrected too; previously the
#    correction ran before normalisation and only caught the exact string "LM".
numeric_codes = {3: "3", 2: "2"}
typo_fixes = {"LM": "ML"}
# Kept under its original name in case other cells refer to it.
values_mapping = {**numeric_codes, **typo_fixes}

am_pm_ml = wl_newdata_raw["am_pm_ml"].replace(numeric_codes)
am_pm_ml = am_pm_ml.str.strip().str.upper()
wl_newdata_raw["am_pm_ml"] = am_pm_ml.replace(typo_fixes)

### col: temperature

In [None]:
# wl_newdata_raw['temperature'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['temperature'])

In [None]:
# wl_newdata_raw['temperature'].unique()

In [None]:
# Run every temperature value through helper.clean_temperature
# (the parsing rules are defined outside this notebook).
temperature = wl_newdata_raw["temperature"]
wl_newdata_raw["temperature"] = temperature.apply(helper.clean_temperature)

In [None]:
# wl_newdata_raw['temperature'].info()

In [None]:
# helper.check_range(wl_newdata_raw['temperature'])

### col: moon_phase

In [None]:
# wl_newdata_raw['moon_phase'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['moon_phase'])

In [None]:
# wl_newdata_raw['moon_phase'].unique()

In [None]:
# Spelling corrections applied after the helper has cleaned the values.
values_mapping = {
    "WAXING CRESENT": "WAXING CRESCENT",
    "WANNING CRESCENT": "WANING CRESCENT",
}

# Step 1: helper.clean_moon_phase (defined outside this notebook) on every value.
moon_phase = wl_newdata_raw["moon_phase"].apply(helper.clean_moon_phase)
wl_newdata_raw["moon_phase"] = moon_phase
# Step 2: fix the known misspellings.
wl_newdata_raw["moon_phase"] = moon_phase.replace(values_mapping)

### col: species_category

In [None]:
# wl_newdata_raw['species_category'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['species_category'])

In [None]:
# wl_newdata_raw['species_category'].unique()

In [None]:
# Run every species_category value through helper.clean_species_category
# (defined outside this notebook).
species_category = wl_newdata_raw["species_category"]
wl_newdata_raw["species_category"] = species_category.apply(
    helper.clean_species_category
)

### col: species_category_2

In [None]:
# wl_newdata_raw['species_category_2'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['species_category_2'])

In [None]:
# wl_newdata_raw['species_category_2'].unique()

In [None]:
# The second species column is cleaned with the same helper as species_category.
species_category_2 = wl_newdata_raw["species_category_2"]
wl_newdata_raw["species_category_2"] = species_category_2.apply(
    helper.clean_species_category
)

### col: carnivore

In [None]:
# wl_newdata_raw['carnivore'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['carnivore'])

In [None]:
# wl_newdata_raw['carnivore'].unique()

In [None]:
# Normalise carnivore entries with helper.strip_lower_equalsign
# (defined outside this notebook; judging by the name it trims, lower-cases and
# tidies the "=" separator; confirm in helper).
carnivore = wl_newdata_raw["carnivore"]
wl_newdata_raw["carnivore"] = carnivore.apply(helper.strip_lower_equalsign)

### col: herbivore

In [None]:
# wl_newdata_raw['herbivore'].info()

In [None]:
# helper.check_missing_values(wl_newdata_raw['herbivore'])

In [None]:
# wl_newdata_raw['herbivore'].unique()

In [None]:
# Normalise herbivore entries with helper.strip_lower_equalsign
# (defined outside this notebook).
herbivore = wl_newdata_raw["herbivore"]
wl_newdata_raw["herbivore"] = herbivore.apply(helper.strip_lower_equalsign)

### col: insectivore

In [None]:
# helper.check_missing_values(wl_newdata_raw['insectivore'])

In [None]:
# wl_newdata_raw['insectivore'].unique()

In [None]:
# Insectivore has its own cleaner, helper.clean_insectivore
# (defined outside this notebook), rather than the generic one used for the
# other diet columns.
insectivore = wl_newdata_raw["insectivore"]
wl_newdata_raw["insectivore"] = insectivore.apply(helper.clean_insectivore)

### col: omnivore

In [None]:
# helper.check_missing_values(wl_newdata_raw['omnivore'])

In [None]:
# wl_newdata_raw['omnivore'].unique()

In [None]:
# Normalise omnivore entries with helper.strip_lower_equalsign
# (defined outside this notebook).
omnivore = wl_newdata_raw["omnivore"]
wl_newdata_raw["omnivore"] = omnivore.apply(helper.strip_lower_equalsign)

### col: bird

In [None]:
# helper.check_missing_values(wl_newdata_raw['bird'])

In [None]:
# wl_newdata_raw['bird'].unique()

In [None]:
# Normalise bird entries with helper.strip_lower_equalsign
# (defined outside this notebook).
bird = wl_newdata_raw["bird"]
wl_newdata_raw["bird"] = bird.apply(helper.strip_lower_equalsign)

### col: non_animal

In [None]:
# helper.check_missing_values(wl_newdata_raw['non_animal'])

In [None]:
# wl_newdata_raw['non_animal'].unique()

In [None]:
# Step 1: normalise with helper.strip_lower_equalsign (defined outside this notebook).
non_animal = wl_newdata_raw["non_animal"].apply(helper.strip_lower_equalsign)
wl_newdata_raw["non_animal"] = non_animal
# Step 2: merge the singular spelling of code 602 into the plural one.
wl_newdata_raw["non_animal"] = non_animal.replace({"602=vehicle": "602=vehicles"})

### col: license_plate

In [None]:
# helper.check_missing_values(wl_newdata_raw['license_plate'])

In [None]:
# wl_newdata_raw['license_plate'].unique()

### col: unknown

In [None]:
# helper.check_missing_values(wl_newdata_raw['unknown'])

In [None]:
# wl_newdata_raw['unknown'].unique()

### col: other

In [None]:
# helper.check_missing_values(wl_newdata_raw['other'])

In [None]:
# wl_newdata_raw['other'].unique()

In [None]:
# Normalise the "other" entries with helper.strip_lower_equalsign
# (defined outside this notebook).
other = wl_newdata_raw["other"]
wl_newdata_raw["other"] = other.apply(helper.strip_lower_equalsign)

### col: unidentified_animal

In [None]:
# helper.check_missing_values(wl_newdata_raw['unidentified_animal'])

In [None]:
# wl_newdata_raw['unidentified_animal'].unique()

In [None]:
# Normalise unidentified_animal entries with helper.strip_lower_equalsign
# (defined outside this notebook).
unidentified_animal = wl_newdata_raw["unidentified_animal"]
wl_newdata_raw["unidentified_animal"] = unidentified_animal.apply(
    helper.strip_lower_equalsign
)

### col: comments

In [None]:
# helper.check_missing_values(wl_newdata_raw['comments'])

In [None]:
# Free-text comments get the same helper.strip_lower_equalsign normalisation
# as the coded columns (helper defined outside this notebook).
comments = wl_newdata_raw["comments"]
wl_newdata_raw["comments"] = comments.apply(helper.strip_lower_equalsign)

### col: checker_initials_and_date

In [None]:
# Normalise checker_initials_and_date with helper.strip_lower_equalsign
# (defined outside this notebook).
checker = wl_newdata_raw["checker_initials_and_date"]
wl_newdata_raw["checker_initials_and_date"] = checker.apply(
    helper.strip_lower_equalsign
)

### species processed columns

In [None]:
# Columns excluded from the cleaned export. The section heading calls them
# "species processed columns"; presumably they are derived ID columns (confirm
# against the workbook).
processed_cols = ["cid", "hid", "iid", "oid", "bid"]
# Result goes to a new name, so wl_newdata_raw keeps these columns.
wl_newdata_cleaned = wl_newdata_raw.drop(labels=processed_cols, axis="columns")

## Export cleaned newdata

In [None]:
# Write the cleaned "Newdata" sheet to CSV without the index column.
# NOTE: FILE_NAME is reused here and no longer holds the workbook name afterwards.
SAVE_DIR = "../data"
FILE_NAME = "cleaned_wildlife_newdata.csv"
path = os.path.join(SAVE_DIR, FILE_NAME)

# Create the output directory if it is missing; no error when it already exists.
os.makedirs(SAVE_DIR, exist_ok=True)
wl_newdata_cleaned.to_csv(path_or_buf=path, index=False)

# Clean _1722_ data

In [None]:
# wl_1722_raw.head()

In [None]:
# wl_1722_raw.tail()

In [None]:
# Drop the last three rows of the "17-22" sheet (called "processed rows" by the
# original author; presumably trailing summary rows, TODO confirm against the
# workbook).
# Slicing from the freshly loaded sheet, rather than from wl_1722_raw itself,
# keeps this cell idempotent: re-running it no longer removes three more rows
# each time.
N_TRAILING_ROWS_TO_DROP = 3
wl_1722_raw = wildlife_raw["17-22"].iloc[:-N_TRAILING_ROWS_TO_DROP, :]

In [None]:
# Remove the columns whose header starts with "Unnamed:"
# (helper.drop_cols_with_prefix is defined outside this notebook).
wl_1722_raw = wl_1722_raw.pipe(helper.drop_cols_with_prefix, "Unnamed:")

In [None]:
# Discard every column that contains nothing but missing values; the result is
# the final cleaned 17-22 frame.
wl_1722_cleaned = wl_1722_raw.dropna(axis="columns", how="all")

## Export cleaned 1722

In [None]:
# Write the cleaned "17-22" sheet to CSV without the index column.
# The output directory is defined and created here as well, so this cell does
# not rely on the newdata export cell having been run first.
SAVE_DIR = "../data"
os.makedirs(SAVE_DIR, exist_ok=True)
FILE_NAME = "cleaned_wildlife_1722.csv"
path = os.path.join(SAVE_DIR, FILE_NAME)
wl_1722_cleaned.to_csv(path, index=False)