# Fundamentals of Machine Learning - Sloths classification
* Sloths are really special animals
* Let's play with a [sloth-dataset](https://www.kaggle.com/datasets/bertiemackie/sloth-species) with records about two species and six subspecies
* Try to classify the animals correctly!

![img01](https://github.com/rasvob/VSB-FEI-Fundamentals-of-Machine-Learning-Exercises/blob/master/images/sloth01.jpg?raw=true)

![img02](https://github.com/rasvob/VSB-FEI-Fundamentals-of-Machine-Learning-Exercises/blob/master/images/sloth02.jpg?raw=true)

<!-- 
<img src="https://upload.wikimedia.org/wikipedia/commons/1/18/Bradypus.jpg" height="500">

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/9/9b/Choloepus_didactylus_2_-_Buffalo_Zoo.jpg/2560px-Choloepus_didactylus_2_-_Buffalo_Zoo.jpg" height="500"> -->

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# Read dataset from the file
* Drop the index column stored in the file and show a sample of the data

In [None]:
# Load the sloth records from the course repository, then discard the
# "Unnamed: 0" column (the row index that was saved into the CSV).
sloth_csv_url = "https://raw.githubusercontent.com/rasvob/VSB-FEI-Fundamentals-of-Machine-Learning-Exercises/master/datasets/sloth_data.csv"
df = pd.read_csv(sloth_csv_url)
df = df.drop(columns=["Unnamed: 0"])

In [None]:
# Display the freshly loaded frame (last expression of the cell is rendered).
df

# Let's check how the classes are distributed in the dataset
* For both specie and sub_specie of sloths

In [None]:
# Number of records per species (class balance of the coarse target).
px.histogram(
    data_frame=df,
    x="specie",
)

In [None]:
# Number of records per sub-species (class balance of the fine target).
px.histogram(
    data_frame=df,
    x="sub_specie",
)

# Let's check the distribution of the numerical attributes

In [None]:
# Box plots of the four numerical attributes over the whole dataset.
px.box(
    data_frame=df,
    y=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
)

In [None]:
# Same box plots, with one coloured box per species for each attribute.
px.box(
    data_frame=df,
    y=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="specie",
)

In [None]:
# Same box plots, with one coloured box per sub-species for each attribute.
px.box(
    data_frame=df,
    y=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="sub_specie",
)

In [None]:
# Pairwise scatter plots of the numerical attributes, coloured by species.
px.scatter_matrix(
    data_frame=df,
    dimensions=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="specie",
)

In [None]:
# Pairwise scatter plots of the numerical attributes, coloured by sub-species.
px.scatter_matrix(
    data_frame=df,
    dimensions=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="sub_specie",
)

# Let's convert the categorical data into numerical representations

In [None]:
# Encode each categorical column as integer codes. The matching `unique_*`
# index keeps the original labels, so code i corresponds to unique_*[i].
# NOTE: re-running this cell on the already-encoded frame would overwrite the
# `unique_*` labels with the integer codes themselves.
endangered_codes, unique_endagered = pd.factorize(df["endangered"], sort=True)
specie_codes, unique_specie = pd.factorize(df["specie"], sort=True)
sub_specie_codes, unique_sub_specie = pd.factorize(df["sub_specie"], sort=True)

df["endangered"] = endangered_codes
df["specie"] = specie_codes
df["sub_specie"] = sub_specie_codes
df = df.infer_objects()

In [None]:
# Display the frame again: `endangered`, `specie` and `sub_specie` now hold
# the integer codes produced by `pd.factorize`.
df

# 📌 A parallel coordinates plot is useful when you want to follow the trends in the data based on the class they represent
* 💡 When many objects are drawn the main clusters may be visible in the plot

In [None]:
# One line per record across the four numerical attributes; the line colour
# follows the integer species code.
px.parallel_coordinates(
    data_frame=df,
    dimensions=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="specie",
)

## It's more complicated when more classes are present in the data

In [None]:
# Same plot, with the line colour following the integer sub-species code.
px.parallel_coordinates(
    data_frame=df,
    dimensions=["claw_length_cm", "size_cm", "tail_length_cm", "weight_kg"],
    color="sub_specie",
)

# 🚀 Classification of the data
* Split the datasets into `X` input matrix and `y` output vector

In [None]:
# Input matrix: every column except the two targets.
# NOTE(review): this keeps the encoded `endangered` column as a feature; if
# that status is determined by the (sub-)species it may leak the target — confirm.
feature_frame = df.drop(columns=["specie", "sub_specie"])
X = feature_frame.to_numpy()

# Output vectors: integer codes of the species and of the sub-species.
ys = df["specie"].to_numpy()
yss = df["sub_specie"].to_numpy()

In [None]:
# Shapes of the feature matrix and of both target vectors, in that order.
tuple(arr.shape for arr in (X, ys, yss))

# Split the dataset into training and testing subsets

In [None]:
# Split the features and both target vectors in one call so the rows stay
# aligned; 25 % of the records are held out and the shuffle is seeded.
split_parts = train_test_split(
    X,
    ys,
    yss,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, ys_train, ys_test, yss_train, yss_test = split_parts

In [None]:
# Shapes of the training features and of both training target vectors.
tuple(arr.shape for arr in (X_train, ys_train, yss_train))

# Let's build the models
* 💡 Try KNeighborsClassifier, GaussianNB, DecisionTreeClassifier, RandomForestClassifier and MLPClassifier (multi-layer perceptron, a.k.a. neural network)

In [None]:
# Seed shared by every stochastic estimator so that a fresh
# "Restart & Run All" reproduces the same scores (the train/test split is
# seeded with the same value).
RANDOM_STATE = 42

# [display name, estimator] pairs compared in the cells below.
# Hyper-parameters are passed by keyword so their meaning is explicit:
#   KNN(k)           -> k nearest neighbours
#   RandomForrest(n) -> n trees in the forest
#   MLP(h)           -> one hidden layer with h neurons
models = [
    ["KNN(3)", KNeighborsClassifier(n_neighbors=3)],
    ["KNN(5)", KNeighborsClassifier(n_neighbors=5)],
    ["KNN(7)", KNeighborsClassifier(n_neighbors=7)],
    ["GaussianNB", GaussianNB()],
    ["DecisionTree", DecisionTreeClassifier(random_state=RANDOM_STATE)],
    ["RandomForrest(10)", RandomForestClassifier(n_estimators=10, random_state=RANDOM_STATE)],
    ["RandomForrest(20)", RandomForestClassifier(n_estimators=20, random_state=RANDOM_STATE)],
    ["RandomForrest(50)", RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE)],
    ["RandomForrest(100)", RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)],
    ["MLP(10)", MLPClassifier(hidden_layer_sizes=(10,), random_state=RANDOM_STATE)],
    ["MLP(20)", MLPClassifier(hidden_layer_sizes=(20,), random_state=RANDOM_STATE)],
    ["MLP(50)", MLPClassifier(hidden_layer_sizes=(50,), random_state=RANDOM_STATE)],
]

## Fit the models and evaluate them 

In [None]:
# Fit every candidate on the unscaled features with the species as target and
# report its accuracy on the training and the testing subset.
for name, model in models:
    model.fit(X_train, ys_train)
    trainp = model.score(X_train, ys_train)
    testp = model.score(X_test, ys_test)
    # `score` returns accuracy as a fraction in [0, 1]; the `%` format type
    # multiplies by 100, so the printed percent sign matches the number.
    print(f"{name:20s} Train: {trainp:6.1%}    Test: {testp:6.1%}")

## Let's check the scaled version of the dataset

In [None]:
# Learn the per-feature min/max on the training subset only, then apply the
# same transformation to both subsets.
scaler = MinMaxScaler().fit(X_train)

sX_train, sX_test = scaler.transform(X_train), scaler.transform(X_test)

## Fit the models and evaluate them again

In [None]:
# Refit every candidate on the min-max scaled features (species target) and
# report its accuracy on the training and the testing subset.
for name, model in models:
    model.fit(sX_train, ys_train)
    trainp = model.score(sX_train, ys_train)
    testp = model.score(sX_test, ys_test)
    # `score` returns accuracy as a fraction in [0, 1]; the `%` format type
    # multiplies by 100, so the printed percent sign matches the number.
    print(f"{name:20s} Train: {trainp:6.1%}    Test: {testp:6.1%}")

# ⚡ Let's try a more specific classification into the sub-species classes

In [None]:
# Compare every candidate on the sub-species target, first on the unscaled
# and then on the min-max scaled features; one output row per model.
print("                     Non-Scaled                      Scaled")
for name, model in models:
    # Unscaled features.
    model.fit(X_train, yss_train)
    trainp = model.score(X_train, yss_train)
    testp = model.score(X_test, yss_test)
    # Scaled features: calling `fit` again retrains the same estimator object.
    model.fit(sX_train, yss_train)
    strainp = model.score(sX_train, yss_train)
    stestp = model.score(sX_test, yss_test)
    # `score` returns accuracy as a fraction in [0, 1]; the `%` format type
    # multiplies by 100, so the printed percent sign matches the number.
    print(f"{name:20s} Train: {trainp:6.1%}    Test: {testp:6.1%}   Train: {strainp:6.1%}    Test: {stestp:6.1%}")

## 🌳 Check the confusion matrix of one of the best models - Random Forest with 20 inner trees

In [None]:
# Train a 20-tree random forest on the scaled features with the sub-species
# target. The seed makes the forest, and therefore the confusion matrix,
# reproducible across runs.
model = RandomForestClassifier(n_estimators=20, random_state=42)
model.fit(sX_train, yss_train)
predicted = model.predict(sX_test)

# Rows are the true classes, columns the predicted ones, both ordered as in
# `model.classes_`.
cm = confusion_matrix(yss_test, predicted, labels=model.classes_)
cm

In [None]:
# Label the axes with the sub-species names kept by `pd.factorize` instead of
# the bare integer codes, and rotate the x tick labels so they stay readable.
# NOTE(review): assumes every code in `model.classes_` is a valid position in
# `unique_sub_specie` (i.e. no -1 code from missing values) — confirm.
sub_specie_names = unique_sub_specie[model.classes_]
# The trailing semicolon hides the repr of the returned display object.
ConfusionMatrixDisplay(cm, display_labels=sub_specie_names).plot(xticks_rotation="vertical");