In [175]:
import pandas as pd

In [176]:
# Directories
# Input directory: the source dataset (animals.pkl) is read from here.
dataset_dir = "../datasets"
# Output directory: the one-hot encoded dataset is written here.
feed_dataset_dir = "../feed-datasets"

In [177]:
# Utility functions
def get_season(date):
    """
    Map a date to the season its month belongs to.

    Parameters:
    - date: datetime-like object exposing a ``month`` attribute (may be null)

    Returns:
    - season: 'spring' (months 3-5), 'summer' (months 6-8),
      'autumn' (months 9-11), 'winter' (any other month),
      or None when the date is missing
    """
    if pd.isnull(date):
        return None

    # (first month, last month, season) for the three contiguous ranges;
    # whatever is left over (December, January, February) is winter.
    month_ranges = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )

    current_month = date.month
    for first, last, season in month_ranges:
        if first <= current_month <= last:
            return season
    return 'winter'
    
def get_time_of_day(date, day_threshold=6, night_threshold=22):
    """
    Label a timestamp as 'day' or 'night' based on its hour.

    Parameters:
    - date: datetime-like object exposing an ``hour`` attribute (may be null)
    - day_threshold: int, the day covers hours strictly above this value
      (default 6)
    - night_threshold: int, the day covers hours strictly below this value
      (default 22)

    Returns:
    - time_day: str, 'day' when day_threshold < hour < night_threshold,
      otherwise 'night'. A missing date is labelled 'night' as well.
    """
    # Unlike the other helpers, a missing timestamp is not mapped to None:
    # it falls back to 'night', so this function always returns a label.
    if pd.isnull(date):
        return 'night'

    # Both bounds are exclusive: with the defaults, hours 7..21 are 'day'.
    if day_threshold < date.hour < night_threshold:
        return 'day'
    return 'night'
  
def temperature_label(temperature):
    """
    Bucket a temperature reading into a descriptive label.

    Parameters:
    - temperature: float (may be null)

    Returns:
    - temperature_label: str naming the bucket the value falls in,
      or None when the temperature is missing
    """
    if pd.isnull(temperature):
        return None

    # Exclusive upper bound of each bucket, in ascending order: the first
    # bound the value lies below decides the label.
    upper_bounds = (
        (-10, "extremely_cold"),
        (0, "cold"),
        (5, "cool"),
        (10, "slightly_warm"),
        (15, "moderate"),
        (20, "warm"),
        (25, "very_warm"),
        (30, "hot"),
        (35, "very_hot"),
        (40, "extremely_hot"),
    )
    for bound, label in upper_bounds:
        if temperature < bound:
            return label

    # Everything from 40 upwards.
    return "exceptionally_hot"
    
# Membership lists behind the boolean group flags returned by classify_animal.
# Built once at definition time instead of on every call.
_ANIMAL_GROUPS = {
    # Mammals (currently disabled)
    # "mammal": ["porcupine", "wild boar", "marten", "badger", "fox", "wolf", "dog", "cat", "hare", "deer", "horse", "squirrel"],

    # Birds
    "bird": ["buzzard", "heron", "mallard"],

    # Water animals
    "water_animal": ["mallard", "heron"],

    # Predatory animals
    "predatory_animal": ["fox", "wolf", "buzzard"],

    # Domestic animals
    "domestic_animal": ["dog", "cat", "horse"],

    # Herbivores
    "herbivore": ["porcupine", "wild boar", "hare", "deer", "horse", "squirrel"],

    # Night animals
    "night_animal": ["fox", "wolf", "hare", "badger"],
}

# Size classes, checked in this order: the first class listing the animal wins.
# NOTE(review): "cat" is listed under both "xs" and "s", so it always resolves
# to "xs" -- confirm which class is intended.
_ANIMAL_SIZES = {
    "xs": ["mallard", "buzzard", "porcupine", "marten", "cat", "hare", "squirrel"],
    "s": ["fox", "cat", "badger"],
    "m": ["heron", "dog", "wild boar", "deer", "wolf"],
    "l": ["horse"],
}


def classify_animal(animal):
    """
    Describe an animal by its group memberships and size class.

    Parameters:
    - animal: str, animal name (matched case-insensitively; may be null)

    Returns:
    - animal_groups: dict with one bool per group ('bird', 'water_animal',
      'predatory_animal', 'domestic_animal', 'herbivore', 'night_animal')
      plus 'size' ('xs', 's', 'm', 'l', or None when the animal has no
      size class). An unknown animal gets False for every group.
      A missing animal gets None for every key.
    """
    # Missing labels get a dict with the usual keys so callers can still
    # expand the results into columns; every value is None.
    if pd.isnull(animal):
        return {**dict.fromkeys(_ANIMAL_GROUPS), "size": None}

    name = animal.lower()

    animal_groups = {group: name in members for group, members in _ANIMAL_GROUPS.items()}

    # 'size' is added last so the key order stays groups first, then size.
    animal_groups["size"] = next(
        (size for size, members in _ANIMAL_SIZES.items() if name in members),
        None,
    )

    return animal_groups

def change_moon(moon):
    """
    Normalise a moon phase name to lowercase with underscores.

    Parameters:
    - moon: str (may be null)

    Returns:
    - str with every letter lowercased and every space replaced by an
      underscore, or None when the value is missing
    """
    if pd.isnull(moon):
        return None

    lowered = moon.lower()
    return '_'.join(lowered.split(' '))

### One-hot encoded version to feed clustering

In [178]:
# Load the source dataset from the datasets directory.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(dataset_dir + '/animals.pkl')

In [179]:
# Rearrange the dataset: parse the timestamps, then add or replace the
# categorical feature columns. assign evaluates its keyword arguments in
# order, so 'season' and 'night' already see the parsed 'date_time'.
df = df.assign(
    date_time=lambda frame: pd.to_datetime(frame['date_time']),
    season=lambda frame: frame['date_time'].apply(get_season),
    # Despite its name, 'night' holds the 'day'/'night' label
    night=lambda frame: frame['date_time'].apply(get_time_of_day),
    temp=lambda frame: frame['temp'].apply(temperature_label),
    moon=lambda frame: frame['moon'].apply(change_moon),
)

In [180]:
# Classify every animal in the 'label1' column: each row yields a dict of
# group flags plus a size class, expanded here into one column per key.
# Reusing df's index keeps the new rows aligned with the originals: the
# concat below joins on index labels, so a fresh 0..n-1 index would misalign
# rows (and pad with NaN) whenever df is not itself indexed 0..n-1.
df_result = pd.DataFrame(df['label1'].apply(classify_animal).tolist(), index=df.index)

# Append the classification columns to the original DataFrame
df = pd.concat([df, df_result], axis=1)

In [181]:
# Remove the columns that are not carried into the encoded dataset
df = df.drop(
    columns=[
        'date',
        'time',
        'image_name',
        'camera',
        'sha-256',
        'date_time',
        'label2',
        'label3',
    ]
)

In [182]:
# Give the remaining raw columns their final feature names
df = df.rename(
    columns={
        'temp': 'temperature',
        'label1': 'animal',
        'moon': 'moon_phase',
    }
)

In [183]:
# Preview the first rows of the prepared DataFrame
df.head()

Unnamed: 0,moon_phase,temperature,animal,season,night,bird,water_animal,predatory_animal,domestic_animal,herbivore,night_animal,size
0,full_moon,hot,wild boar,summer,night,False,False,False,False,True,False,m
1,full_moon,warm,wild boar,spring,night,False,False,False,False,True,False,m
2,first_quarter,slightly_warm,porcupine,spring,day,False,False,False,False,True,False,xs
3,waxing_gibbous,warm,fox,summer,day,False,False,True,False,False,True,s
4,first_quarter,warm,fox,summer,night,False,False,True,False,False,True,s


In [184]:
# One-hot encode every categorical feature column. By default get_dummies
# names each indicator '<source column>_<value>', so no explicit prefix or
# separator is needed.
columns_to_encode = [
    'moon_phase',
    'temperature',
    'animal',
    'season',
    'night',
    'bird',
    'water_animal',
    'predatory_animal',
    'domestic_animal',
    'herbivore',
    'night_animal',
    'size',
]
encoded_df = pd.get_dummies(df, columns=columns_to_encode)
encoded_df.head()

Unnamed: 0,moon_phase_first_quarter,moon_phase_full_moon,moon_phase_last_quarter,moon_phase_new_moon,moon_phase_waning_crescent,moon_phase_waning_gibbous,moon_phase_waxing_crescent,moon_phase_waxing_gibbous,temperature_cold,temperature_cool,...,domestic_animal_False,domestic_animal_True,herbivore_False,herbivore_True,night_animal_False,night_animal_True,size_l,size_m,size_s,size_xs
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0,...,1,0,1,0,0,1,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,0,0,1,0


In [185]:
# Persist the one-hot encoded dataset to the feed directory
encoded_df.to_pickle(feed_dataset_dir + '/aculei-one-hot-encoded.pkl')