# Classification models based on data about pets

## Libraries and settings

In [None]:
# Libraries
import os
import random
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')

# Print the directory the notebook is running from
print(os.getcwd())

## Simulate pet data

In [None]:
# Seed Python's random module so the simulated data is reproducible
random.seed(42)


# Function to generate pets
def pets(pet_type, weight_min, weight_max, height_min, height_max, n_animals=1000):
    """Simulate a table of animals of a single pet type.

    Args:
        pet_type: Label written to the 'pets' column of every row.
        weight_min: Lower bound of the uniformly drawn 'weight' values.
        weight_max: Upper bound of the uniformly drawn 'weight' values.
        height_min: Lower bound of the uniformly drawn 'height' values.
        height_max: Upper bound of the uniformly drawn 'height' values.
        n_animals: Number of rows to generate.

    Returns:
        A pandas DataFrame with the columns 'age', 'color', 'weight',
        'height', 'eats_meat' and 'pets'.
    """
    colors = ['red', 'blue', 'green', 'yellow']
    meat_options = ['yes', 'no']
    rows = range(n_animals)

    # Each column is drawn completely before the next one starts, so the
    # sequence of random numbers consumed depends only on this column order.
    columns = {}
    columns['age'] = [random.randint(1, 15) for _ in rows]
    columns['color'] = [random.choice(colors) for _ in rows]
    columns['weight'] = [random.uniform(weight_min, weight_max) for _ in rows]
    columns['height'] = [random.uniform(height_min, height_max) for _ in rows]
    columns['eats_meat'] = [random.choice(meat_options) for _ in rows]
    columns['pets'] = [pet_type] * n_animals

    return pd.DataFrame(columns)

# Simulate 1000 animals per pet type (weight range first, then height range).
# The call order matters: each call consumes numbers from the seeded generator.
dogs = pets('dog', 5, 80, 20, 90)
cats = pets('cat', 2, 8, 20, 40)
birds = pets('bird', 0.2, 7, 10, 80)
fishes = pets('fish', 0.1, 30, 10, 30)

# Stack the four tables into a single data frame and display it
all_pets = (dogs, cats, birds, fishes)
df_orig = pd.concat(all_pets)
df_orig

## Convert categorical features to numerical using one-hot encoding

In [None]:
# Build the feature matrix: every column except the target 'pets',
# with the categorical columns expanded into dummy (one-hot) columns
X = pd.get_dummies(df_orig.drop(columns='pets'))
X

### Create train and test samples (train = 80%, test = 20% of the data)

In [None]:
# Split features and target ('pets') into 80% training and 20% test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_orig['pets'],
    test_size=0.20,
    random_state=42,
)

# First rows of the training features
print('X_train:')
print(X_train.head(), '\n')

# First rows of the training target
print('y_train:')
print(y_train.head())

## Fit a classification tree

In [None]:
# Set up a decision tree classifier limited to a depth of 3
clf = DecisionTreeClassifier(max_depth=3, random_state=20)

# Fit the classifier on the training sample (fit() returns the estimator itself)
clf = clf.fit(X_train, y_train)

# Predicted pet type for every row of the test sample
y_pred = clf.predict(X_test)

## Use cross-validation to evaluate the model performance

In [None]:
# Cross-validation (5-fold) on the training data.
# cross_val_score refits a fresh copy of the estimator on every fold, so it
# must be given the training sample; the test sample stays untouched and is
# reserved for the final, independent evaluation of the fitted model.
cv = cross_val_score(clf,
                     X_train,
                     y_train,
                     cv=5,
                     scoring='accuracy')

# Result: accuracy of each of the 5 folds, rounded to 4 decimals
print(list(cv.round(4)), end="")

## Plot the classification tree

In [None]:
# Get unique class names (in order of first appearance in the data)
unique_class_names = df_orig['pets'].unique()

# plot_tree expects the class names in the same (ascending) order as
# clf.classes_; a hand-written list in any other order mislabels the nodes.
class_names = [str(label) for label in clf.classes_]

# For the meaning of numbers in each box, look at the root node
fig = plt.figure(figsize=(9,6))

# Plot the fitted classification tree
tree_plot = tree.plot_tree(clf,
                           feature_names=list(X_train.columns),
                           class_names=class_names,
                           filled=True,
                           fontsize=7,
                           label='root',
                           precision=1)

## Fit the Random Forest Classifier

In [None]:
# Random forest with 500 trees, each limited to a depth of 10
rf_settings = {'n_estimators': 500, 'max_depth': 10, 'random_state': 42}
clf_rf = RandomForestClassifier(**rf_settings)
clf_rf.fit(X_train, y_train)

# Predicted pet type for every row of the test sample
y_pred = clf_rf.predict(X_test)

# Share of correctly classified test rows
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

## Show feature importance

In [None]:
cols = X_train.columns

# Feature importance of the forest, its spread across the individual trees,
# and the feature positions ordered from most to least important
importances = clf_rf.feature_importances_
std = np.std(
    [estimator.feature_importances_ for estimator in clf_rf.estimators_],
    axis=0,
)
indices = np.argsort(importances)[::-1]

# Column names and importance values, most important first
print(cols[indices])
print(importances[indices])

# Horizontal bar plot of the importances, sorted in ascending order
df_fi = pd.DataFrame(
    {'features': cols, 'importances': importances}
).sort_values('importances')
df_fi.plot(kind='barh',
           x='features',
           y='importances',
           color='darkred',
           figsize=(6,3))

### Jupyter notebook --footer info-- (please always provide this at the end of each notebook)

In [None]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Footer: OS name, platform and release, current timestamp and Python
# version, framed by two separator lines. Each tuple is one printed line.
rule = '-----------------------------------'
footer_lines = (
    (rule,),
    (os.name.upper(),),
    (platform.system(), '|', platform.release()),
    ('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
    ('Python Version:', python_version()),
    (rule,),
)
for parts in footer_lines:
    print(*parts)