# Classification models based on data about pets

## Libraries and settings

In [None]:
# Libraries
import os
import random
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Ignore warnings
# NOTE: with no category or module given, this filter silences every warning
# for the rest of the session, so deprecation notices raised by the imported
# libraries are hidden as well.
import warnings
warnings.filterwarnings('ignore')

# Show current working directory
print(os.getcwd())

## Simulate pet data

In [None]:
# Set a seed for reproducibility
# (this seeds Python's built-in `random` module, which is the generator the
# pets() function in this cell draws all of its values from)
random.seed(42)

# Function to generate pets
def pets(pet_type, weight_min, weight_max, height_min, height_max, n_animals=1000, rng=None):
    """Simulate a table of animals that all belong to one pet type.

    The columns are generated one after another (all ages first, then all
    colors, weights, heights and finally the meat flags). Keeping this order
    means that, for a given seed, the drawn values do not change.

    Args:
        pet_type: Label written to the 'pets' column of every row.
        weight_min: Lower bound of the uniformly drawn 'weight' values.
        weight_max: Upper bound of the uniformly drawn 'weight' values.
        height_min: Lower bound of the uniformly drawn 'height' values.
        height_max: Upper bound of the uniformly drawn 'height' values.
        n_animals: Number of rows to generate.
        rng: Optional `random.Random` instance to draw from. When omitted,
            the module-level generator of `random` is used, so a preceding
            `random.seed(...)` call controls the result.

    Returns:
        A pandas DataFrame with the columns 'age', 'color', 'weight',
        'height', 'eats_meat' and 'pets' and `n_animals` rows.
    """
    # The `random` module and `random.Random` objects expose the same
    # randint/choice/uniform functions, so either can serve as the source.
    source = random if rng is None else rng

    columns = {
        'age': [source.randint(1, 15) for _ in range(n_animals)],
        'color': [source.choice(['red', 'blue', 'green', 'yellow']) for _ in range(n_animals)],
        'weight': [source.uniform(weight_min, weight_max) for _ in range(n_animals)],
        'height': [source.uniform(height_min, height_max) for _ in range(n_animals)],
        'eats_meat': [source.choice(['yes', 'no']) for _ in range(n_animals)],
        'pets': [pet_type for _ in range(n_animals)],
    }

    # Named `animals` rather than `pets` to avoid shadowing the function itself
    animals = pd.DataFrame(columns)
    return animals

# Weight and height ranges per pet type; the order of the entries is the
# order in which the animals are simulated (dogs, cats, birds, fishes)
pet_specs = {
    'dog': dict(weight_min=5, weight_max=80, height_min=20, height_max=90),
    'cat': dict(weight_min=2, weight_max=8, height_min=20, height_max=40),
    'bird': dict(weight_min=0.2, weight_max=7, height_min=10, height_max=80),
    'fish': dict(weight_min=0.1, weight_max=30, height_min=10, height_max=30),
}

# One simulated dataframe per pet type
simulated = {kind: pets(kind, **spec) for kind, spec in pet_specs.items()}
dogs, cats, birds, fishes = simulated.values()

# Stack the four dataframes, shuffle the rows and renumber the index
df_orig = (
    pd.concat([dogs, cats, birds, fishes])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show the first few rows
df_orig.head()

## Convert categorical features to numerical using one-hot encoding

In [None]:
# Convert categorical features to numerical using one-hot encoding (exclude column 'pets')


### Create train and test samples (train = 80%, test = 20% of the data)

In [None]:
# Create train and test samples


# Show X_train


# Show y_train


## Fit a classification tree

In [None]:
# Create decision tree classifier object with max_depth = 3


# Train decision tree classifier


# Predict the response for test dataset


## Use cross-validation to evaluate the model performance

In [None]:
# Cross-validation (5-fold) with X_test and y_test, cv=5 and scoring='accuracy'


# Result


## Plot the classification tree

In [None]:
# Get unique class names from df_orig['pets']


# For the meaning of numbers in each box, look at the root node


# Plot the classification tree


## Fit the Random Forest Classifier

In [None]:
# Create a random forest classifier (by convention, clf means 'classifier')


# Predict the response for test dataset


# Calculate accuracy


## Show feature importance

In [None]:
# Get colnames from X_train


# Derive feature importance from random forest


# Print col-names and importances-values


# Barplot with feature importance


### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Horizontal rule that frames the environment summary
rule = '-----------------------------------'

# Operating system, timestamp and Python version of the current session
print(rule)
print(os.name.upper())
print(f"{platform.system()} | {platform.release()}")
print(f"Datetime: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Python Version: {python_version()}")
print(rule)