In [24]:
import pandas as pd
import numpy as np

# --------------------------
# Display settings
# --------------------------
# Widen pandas' console rendering so previews show every column and full cell text.
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.width', 200),
    ('display.max_colwidth', None),
):
    pd.set_option(_option_name, _option_value)

# --------------------------
# Load datasets
# --------------------------
def load_zoo_data(path):
    """Read the zoo CSV and echo a short preview.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or buffer).

    Returns:
        The parsed DataFrame, unmodified.
    """
    zoo_frame = pd.read_csv(path)
    # Banner followed by the first three rows, one item per line.
    print("=== Loaded Zoo Dataset ===", zoo_frame.head(3), sep="\n")
    return zoo_frame

def load_class_data(path):
    """Read the class-lookup CSV and echo a short preview.

    When an ``Animal_Names`` column is present it is cast to ``str``;
    a file without that column is returned as parsed.

    Args:
        path: Anything ``pd.read_csv`` accepts (file path or buffer).

    Returns:
        The parsed DataFrame.
    """
    names_column = 'Animal_Names'
    class_frame = pd.read_csv(path)
    if names_column in class_frame.columns:
        class_frame[names_column] = class_frame[names_column].astype(str)
    print("\n=== Loaded Class Dataset ===", class_frame.head(3), sep="\n")
    return class_frame

def load_json_metadata(path):
    """Read the auxiliary JSON metadata and echo a short preview.

    Args:
        path: Anything ``pd.read_json`` accepts (file path or buffer).

    Returns:
        The parsed DataFrame, before any cleaning.
    """
    metadata_frame = pd.read_json(path)
    print("\n=== Loaded JSON Metadata (raw) ===", metadata_frame.head(3), sep="\n")
    return metadata_frame

# --------------------------
# Clean JSON
# --------------------------
def clean_json_metadata(df):
    """Normalise the ``habitat`` and ``diet`` columns of the JSON metadata.

    * ``habitat``: empty strings become NaN.
    * ``diet``: values are lower-cased and stripped of surrounding
      whitespace; entries that were missing, or that are blank after
      stripping, become NaN.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``habitat`` and ``diet`` columns.

    Returns:
        The same DataFrame object, cleaned.

    Raises:
        KeyError: If either column is absent.
    """
    df['habitat'] = df['habitat'].replace('', np.nan)

    diet = df['diet']
    # Remember which entries were present *before* the string cast:
    # astype(str) would otherwise turn NaN/None into the literal text
    # 'nan'/'None', which no longer counts as missing.
    has_value = diet.notna()
    normalized = diet.astype(str).str.lower().str.strip()
    # Blank-after-strip entries are treated as missing, same as ''.
    df['diet'] = normalized.where(has_value & (normalized != ''), np.nan)

    print("\n=== Cleaned JSON Metadata ===")
    print(df.head(3))
    return df

# --------------------------
# Feature Engineering
# --------------------------
def add_engineered_features(df):
    """Add the derived columns ``habitat_risk_score`` and ``trophic_level``.

    * ``habitat_risk_score`` is 1 where ``habitat`` is missing, else 0.
    * ``trophic_level`` maps ``diet`` through a fixed lookup
      (herbivore/insectivore -> 1, omnivore -> 2, carnivore -> 3);
      diets not in the lookup yield NaN.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with ``habitat`` and ``diet`` columns.

    Returns:
        A ``(df, engineered_feature_names)`` tuple, where the second item
        lists the names of the columns added here.

    Raises:
        KeyError: If either column is absent.
    """
    # Series.isna() evaluates each cell as a whole, so list-valued habitats
    # count as "present". Calling pd.isnull on a list cell returns an array,
    # whose truth value is ambiguous and raises ValueError.
    df['habitat_risk_score'] = df['habitat'].isna().astype(int)

    trophic_map = {'herbivore': 1, 'omnivore': 2, 'carnivore': 3, 'insectivore': 1}
    df['trophic_level'] = df['diet'].map(trophic_map)

    engineered_feature_names = ['habitat_risk_score', 'trophic_level']
    return df, engineered_feature_names

# --------------------------
# Merge datasets
# --------------------------
def merge_datasets(zoo_df, class_df, json_df):
    """Left-join the JSON metadata and the class lookup onto the zoo table.

    Args:
        zoo_df: Base table; joined on ``animal_name`` and ``class_type``.
        class_df: Lookup table keyed by ``Class_Number``.
        json_df: Metadata table keyed by ``animal_name``.

    Returns:
        A new DataFrame with the metadata columns and then the class
        columns appended to the zoo rows.
    """
    with_metadata = zoo_df.merge(json_df, how='left', on='animal_name')
    return with_metadata.merge(
        class_df,
        how='left',
        left_on='class_type',
        right_on='Class_Number',
    )

# --------------------------
# Main
# --------------------------
def main():
    """Run the pipeline end to end and print a summary of the merged data.

    Loads ``zoo.csv``, ``class.csv`` and ``auxiliary_metadata.json`` from the
    working directory, cleans the JSON metadata and derives features from it,
    merges the three tables, then prints the shape, per-column missing
    counts, duplicate-row count, a three-row preview and the names of the
    engineered features.
    """
    # --- Inputs -----------------------------------------------------------
    zoo_df = load_zoo_data("zoo.csv")
    class_df = load_class_data("class.csv")
    json_df = load_json_metadata("auxiliary_metadata.json")

    # --- Clean, then derive features from the JSON metadata ---------------
    json_df, engineered_feature_names = add_engineered_features(
        clean_json_metadata(json_df)
    )

    # --- Combine ----------------------------------------------------------
    merged_df = merge_datasets(zoo_df, class_df, json_df)

    # --- Report -----------------------------------------------------------
    print("\n=== Dataset shape ===")
    print(merged_df.shape)

    print("\n=== Missing values ===")
    print(merged_df.isnull().sum())

    # duplicated() hashes cell values and lists are unhashable, so work on a
    # copy where every column containing a list is cast to str.
    hashable_df = merged_df.copy()
    for column in hashable_df.columns:
        holds_list = hashable_df[column].apply(lambda cell: isinstance(cell, list)).any()
        if holds_list:
            hashable_df[column] = hashable_df[column].astype(str)

    print("\n=== Duplicate rows ===")
    print(hashable_df.duplicated().sum())

    print("\n=== First 3 rows of merged dataset ===")
    print(merged_df.head(3))

    print("\n=== Engineered features ===")
    print(engineered_feature_names)

# --------------------------
# Run
# --------------------------
if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly, not on import.
    main()


=== Loaded Zoo Dataset ===
  animal_name  hair  feathers  eggs  milk  airborne  aquatic  predator  toothed  backbone  breathes  venomous  fins  legs  tail  domestic  catsize  class_type
0    aardvark     1         0     0     1         0        0         1        1         1         1         0     0     4     0         0        1           1
1    antelope     1         0     0     1         0        0         0        1         1         1         0     0     4     1         0        1           1
2        bass     0         0     1     0         0        1         1        1         1         0         0     1     0     1         0        0           4

=== Loaded Class Dataset ===
   Class_Number  Number_Of_Animal_Species_In_Class Class_Type  \
0             1                                 41     Mammal   
1             2                                 20       Bird   
2             3                                  5    Reptile   

                                              