## Exploratory Data Analysis
### On animal data downloaded from the Data Service API
#### Possible animals considered: Kangaroo, Elephant, Chicken, and Dog


In [None]:
# Import packages
from datetime import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score


from src.api_call import fetch_animals, fetch_schema
from src.minio_connection import minio_save_model

In [None]:
# Load data: the first element of the fetch_animals(1000) result is used as the
# record collection for the DataFrame
# (presumably 1000 is the number of animals requested - confirm in src.api_call)
animals_response = fetch_animals(1000)
df = pd.DataFrame(animals_response[0])
df

In [None]:
# Summarise every (tail, wings, legs) combination found in the raw data:
# mean height, mean weight and frequency (number of non-null heights)
combo_keys = ['has_tail', 'has_wings', 'walks_on_n_legs']
combo_stats = {
    'avg_height': ('height', 'mean'),
    'avg_weight': ('weight', 'mean'),
    'frequency': ('height', 'count'),
}
agg_table = df.groupby(combo_keys).agg(**combo_stats).reset_index()

print(agg_table)

df.describe()

In [None]:
# Filter out impossible data combinations. Every rule is evaluated row by row,
# so the three masks are combined and applied in a single step.

# 1. Animals can only walk on 2 or 4 legs
valid_leg_count = df['walks_on_n_legs'].isin([2, 4])

# 2. Animals with wings can only walk on 2 legs:
#    a row is kept unless it is four-legged AND winged
wings_consistent = (df['walks_on_n_legs'] != 4) | (df['has_wings'] == False)

# 3. All animals in the dataset have a tail
has_a_tail = df['has_tail'] == True

# .copy() makes the filtered frame independent of the raw one, so the later
# column assignments (animal_type) do not raise SettingWithCopyWarning
df = df[valid_leg_count & wings_consistent & has_a_tail].copy()

In [None]:
# Same summary as before, recomputed on the filtered data:
# mean height, mean weight and frequency (number of non-null heights)
by_traits = df.groupby(['has_tail', 'has_wings', 'walks_on_n_legs'])
trait_summary = by_traits.agg(
    avg_height=('height', 'mean'),
    avg_weight=('weight', 'mean'),
    frequency=('height', 'count'),
)
agg_table = trait_summary.reset_index()

print(agg_table)

In [None]:
# Based on this analysis, the class can be inferred for all animals which walk on 2 legs.

# Start with every row unlabelled
df['animal_type'] = None

two_legged = df['walks_on_n_legs'] == 2
winged = df['has_wings'] == True
wingless = df['has_wings'] == False

# Two legs + wings -> chicken
df.loc[two_legged & winged, 'animal_type'] = 'chicken'

# Two legs + no wings -> kangaroo
df.loc[two_legged & wingless, 'animal_type'] = 'kangaroo'


In [None]:
# Four-legged animals only, without the animal_type column
is_four_legged = df['walks_on_n_legs'] == 4
df_4legs = df.loc[is_four_legged].drop(columns=['animal_type'])

In [None]:
# Height against weight for the four-legged animals
fig, ax = plt.subplots()
ax.scatter(df_4legs['height'], df_4legs['weight'])
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
ax.set_title('Height vs Weight')
plt.show()

In [None]:
# TODO: Create clustering model to classify Dogs vs Elephants

# Only rows that have no label yet are touched by the weight rule
unlabelled = df['animal_type'].isnull()
is_heavy = df['weight'] >= 1500
is_light = df['weight'] < 1500

# Heavy -> elephant
df.loc[is_heavy & unlabelled, 'animal_type'] = 'elephant'

# Light -> dog
df.loc[is_light & unlabelled, 'animal_type'] = 'dog'

In [None]:
# Side-by-side boxplots of height and weight for each animal type
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    ('height', 'Height by Animal Type', 'Height'),
    ('weight', 'Weight by Animal Type', 'Weight'),
]
for ax, (column, title, y_label) in zip(axes, panels):
    sns.boxplot(x='animal_type', y=column, data=df, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Animal Type')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
def remove_outliers_iqr(df, cols_to_filter=('height', 'weight'), group_col='animal_type', factor=1.5):
    """Drop rows that fall outside the per-group IQR fences.

    Rows are grouped by ``group_col``. For each column in ``cols_to_filter``
    the fences ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` are computed
    from the complete group (before any row of that group is removed), and a
    row is kept only when it lies inside the fences of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    cols_to_filter : iterable of str, default ('height', 'weight')
        Numeric columns checked for outliers.
    group_col : str, default 'animal_type'
        Column whose values define the groups. Rows with a missing value in
        this column are dropped (pandas ``groupby`` skips missing keys by default).
    factor : float, default 1.5
        Multiplier applied to the IQR to obtain the fences.

    Returns
    -------
    pandas.DataFrame
        Surviving rows, groups in sorted key order, with a fresh 0..n-1 index.
        When nothing is grouped (e.g. empty input) an empty frame with the
        same columns as ``df`` is returned.
    """
    kept_groups = []

    for _, group in df.groupby(group_col):
        keep = pd.Series(True, index=group.index)
        for col in cols_to_filter:
            q1 = group[col].quantile(0.25)
            q3 = group[col].quantile(0.75)
            iqr = q3 - q1
            # between() is inclusive on both ends; NaN values compare as False
            keep = keep & group[col].between(q1 - factor * iqr, q3 + factor * iqr)
        kept_groups.append(group[keep])

    if not kept_groups:
        # Preserve the schema instead of returning a column-less frame
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once: avoids quadratic copying and empty-frame dtype pitfalls
    return pd.concat(kept_groups, ignore_index=True)

# Apply the per-animal-type IQR filter to the labelled data
df_cleaned = remove_outliers_iqr(df)

In [None]:
# Same boxplots as before, now drawn from the outlier-filtered data
fig, (height_ax, weight_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: height
sns.boxplot(data=df_cleaned, x='animal_type', y='height', ax=height_ax)
height_ax.set(title='Height by Animal Type', xlabel='Animal Type', ylabel='Height')

# Right panel: weight
sns.boxplot(data=df_cleaned, x='animal_type', y='weight', ax=weight_ax)
weight_ax.set(title='Weight by Animal Type', xlabel='Animal Type', ylabel='Weight')

fig.tight_layout()
plt.show()

In [None]:
# Assemble the feature matrix and the target labels
feature_columns = ['height', 'weight', 'walks_on_n_legs', 'has_wings', 'has_tail']
X = df_cleaned[feature_columns]
y = df_cleaned['animal_type']

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate hyper-parameters for the Decision Tree Classifier grid search
param_grid = dict(
    max_depth=[None, 3, 5, 10],
    min_samples_split=[2, 5, 10],
    criterion=['gini', 'entropy'],
)

# 5-fold cross-validated search scored with macro-averaged F1
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and predict the held-out rows
model = grid_search.best_estimator_
y_predict = model.predict(X_test)

# Performance metrics on the test split (macro average across classes)
acc = accuracy_score(y_test, y_predict)
precision, recall, f1 = (
    score_fn(y_test, y_predict, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

metrics = {
    label: round(value, 4)
    for label, value in (
        ("accuracy", acc),
        ("precision", precision),
        ("recall", recall),
        ("f1_score", f1),
    )
}

print(metrics)


In [None]:
# Save model to minio
# Passes the best estimator from the grid search and the dict of test-split
# metrics to the project helper imported from src.minio_connection.
# NOTE(review): how/where the helper stores them is not visible here - see that module.
minio_save_model(model=model, metrics=metrics)

In [None]:
# Print confusion matrix
# The label order is pinned to the classes the model was trained on, so the
# matrix rows/columns always line up with display_labels even if a class is
# missing from the test split.
class_labels = list(model.classes_)
cm = confusion_matrix(y_test, y_predict, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

# Plot the tree
# feature_names / class_names are passed as plain lists: some scikit-learn
# releases reject a pandas Index or numpy array for these parameters.
plt.figure(figsize=(16, 8))
plot_tree(
    model,
    feature_names=list(X.columns),
    class_names=class_labels,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.title("Decision Tree Visualization")
plt.show()