# Step 3 - Importing modules

In [16]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Supervised classification
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Logistic Regression
from sklearn.linear_model import LogisticRegression


# Step 4 - Read csv into a pandas dataset

In [2]:
df_zoo = pd.read_csv("zoo.csv")

# Step 5 - Use info method

In [21]:
df_zoo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 101 entries, 0 to 91
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal_name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  class_type   101 non-null    int64 
 17  Class_Type   101 non-null    object
dtypes: int64(16), object(2)
memory usage: 15.0+ KB


# Step 6 - Read csv into a pandas dataset

In [4]:
df_class = pd.read_csv("class.csv", index_col="Class_Number")
df_class

Unnamed: 0_level_0,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
Class_Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
2,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."
3,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
4,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."
5,4,Amphibian,"frog, frog, newt, toad"
6,8,Bug,"flea, gnat, honeybee, housefly, ladybird, moth..."
7,10,Invertebrate,"clam, crab, crayfish, lobster, octopus, scorpi..."


In [5]:
df_class.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 1 to 7
Data columns (total 3 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Number_Of_Animal_Species_In_Class  7 non-null      int64 
 1   Class_Type                         7 non-null      object
 2   Animal_Names                       7 non-null      object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes


# Step 7 - Merged dataframes using join method

In [6]:
df_zoo = df_zoo.join(df_class, how='inner', on='class_type')
# To check if there are any duplicated. We got the 101 rows of df_zoo with the 21 columns from the two sets
df_zoo.shape 

(101, 21)

# Step 8 - Dropping unwanted columns

In [7]:
df_zoo.drop(['catsize','Animal_Names', 'Number_Of_Animal_Species_In_Class'], axis=1, inplace=True)
df_zoo

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,class_type,Class_Type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,Mammal
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,Mammal
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,Mammal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,pitviper,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,3,Reptile
76,seasnake,0,0,0,0,0,1,1,1,1,0,1,0,0,1,0,3,Reptile
80,slowworm,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,3,Reptile
90,tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,3,Reptile


# Step 9 - Split the data into training and testing data

In [8]:
x = df_zoo.drop(labels=["animal_name", "class_type", "Class_Type"],axis=1)
y = df_zoo["class_type"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

print(f"size of x_train: {x_train.shape}")
print(f"size of y_train: {y_train.shape}")
print(f"size of x_test: {x_test.shape}")
print(f"size of y_test: {y_test.shape}")

size of x_train: (70, 15)
size of y_train: (70,)
size of x_test: (31, 15)
size of y_test: (31,)


# Step 10 - Create the model and fit it to the training data.

In [9]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=1)

# Step 11 - Predict values based on testing data

In [10]:
prediction = knn.predict(x_test)
prediction

array([2, 1, 4, 2, 5, 4, 1, 4, 4, 1, 2, 6, 1, 1, 2, 2, 2, 4, 6, 3, 2, 6,
       5, 7, 2, 6, 6, 2, 4, 3, 7])

# Step 12 - Print out the classification report for the y test data and the predictions

In [11]:
# print(classification_report(y_test, prediction))
print(classification_report(y_test, prediction, zero_division=0)) # To avoid warnings of division by zero

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         9
           3       0.50      1.00      0.67         1
           4       1.00      1.00      1.00         6
           5       1.00      0.67      0.80         3
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2

    accuracy                           0.97        31
   macro avg       0.93      0.95      0.92        31
weighted avg       0.98      0.97      0.97        31



# Step 13 - Interpretation of the results

### Precision 

It indicates the percentage of the predictions that were correct for each class.

### Recall 

It indicates the percentage of the class that were predicted correctly.

### F1-score 

It embeds precision and recall into one measure such that the best score is 1.0 and the worst is 0.

F1 Score = 2*(Recall * Precision) / (Recall + Precision)

### Support 

It indicates the number of actual occurrences of the class in the dataset. The support in the training data must be balanced.

#### In this report we can appreciate that the model have a great accuracy. 
#### It identified all the elements in each class except for the class 5 where only the 2 of 3 elements were classified as 5. One of them was classified as 3, affecting the precision associated to the class 3 a 50%. 
#### That was the only mistake of the model for the test dataset.

# Step 14 - Repeat steps 10 to 13 using k=3.

## Create the model and fit it to the training data.

In [12]:
knn_3 = KNeighborsClassifier(n_neighbors=3)
knn_3.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=3)

## Predict values based on testing data.

In [13]:
prediction_3 = knn_3.predict(x_test)
prediction_3

array([2, 1, 4, 2, 3, 4, 1, 4, 4, 1, 2, 6, 1, 1, 2, 2, 2, 4, 6, 3, 2, 6,
       3, 7, 2, 6, 6, 2, 4, 1, 7])

## Classification report.

In [14]:
# print(classification_report(y_test, prediction_3))
print(classification_report(y_test, prediction_3, zero_division=0)) # To avoid warnings of division by zero

              precision    recall  f1-score   support

           1       0.83      1.00      0.91         5
           2       1.00      1.00      1.00         9
           3       0.33      1.00      0.50         1
           4       1.00      1.00      1.00         6
           5       0.00      0.00      0.00         3
           6       1.00      1.00      1.00         5
           7       1.00      1.00      1.00         2

    accuracy                           0.90        31
   macro avg       0.74      0.86      0.77        31
weighted avg       0.85      0.90      0.87        31



## Interpretation of the results.

#### In this report we can appreciate that the model with k = 3 has a lower accuracy than the previous model with k = 1.
#### In this case, none of the three elements of the class 5 were correctly classified. Two of them were classified as class 1, while the other one was assigned to class 3. In view of that, the precision of the classes 1 and 3 were compromised.

# Step 15 - Logistic Regression.

In [17]:
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

## Predict values based on testing data.

In [18]:
prediction_lr = lr.predict(x_test)
prediction_lr

array([2, 1, 4, 2, 3, 4, 1, 4, 4, 1, 2, 7, 1, 1, 2, 2, 2, 4, 6, 3, 2, 6,
       3, 7, 2, 7, 6, 2, 4, 3, 7])

## Classification report.

In [19]:
# print(classification_report(y_test, prediction_lr))
print(classification_report(y_test, prediction_lr, zero_division=0)) # To avoid warnings of division by zero

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         9
           3       0.25      1.00      0.40         1
           4       1.00      1.00      1.00         6
           5       0.00      0.00      0.00         3
           6       1.00      0.60      0.75         5
           7       0.50      1.00      0.67         2

    accuracy                           0.84        31
   macro avg       0.68      0.80      0.69        31
weighted avg       0.85      0.84      0.82        31



# Step 16 - Results Comparison.

#### The accuracy of the logistic regression model is the worst of the three models used in this case study. In this model, the three elements of the class 5 plus two of elements of the class 6 were distributed in the classes 3 and 7 affecting the precision in those groups.

#### To support our conclusions we can compare the F1-Score of the three models as this measure combines both Precision and Recall. 

#### The best model based on all the previous analysis is the KNN model with K = 1.

In [31]:
best_classifier = knn

# Step 17 - Evaluate the best model with two selected rows.

In [32]:
# selecting rows with Id 3 and 90 
# these are from different classes (Mammal and Reptile)
sample_ids = [3, 90]
sample = df_zoo.loc[sample_ids]
sample

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,class_type,Class_Type
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal
90,tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,3,Reptile


In [33]:
sample_features = sample.drop(labels=["animal_name", "class_type", "Class_Type"],axis=1)
predicted_class = best_classifier.predict(sample_features)
predicted_class

array([1, 3])

In [34]:
sample["Predicted"] = predicted_class
sample["Correct"] = sample["class_type"] == sample["Predicted"]

sample

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,class_type,Class_Type,Predicted,Correct
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,Mammal,1,True
90,tortoise,0,0,1,0,0,0,0,0,1,1,0,0,4,1,0,3,Reptile,3,True


In [36]:
# Show only columns meaningful to the reader
sample[["animal_name", "Class_Type", "class_type", "Predicted", "Correct"]]

Unnamed: 0,animal_name,Class_Type,class_type,Predicted,Correct
3,bear,Mammal,1,1,True
90,tortoise,Reptile,3,3,True


In the column "Predicted" we can see the class predicted by the selected model (knn) while in the column "Correct" we can see if the prediction actually matches with the real class.
In this case, the two selected animals (bear and tortoise) are classified correctly.

# Step 18 - KNN vs Logistic Regression

## K-Nearest Neighbours

KNN is a supervised machine learning algorithms which means we use labeled dataset to predict the class of new data points. The KNN algorithm uses a majority voting mechanism. It collects data from a training data set and uses this data later to make predictions for new records.

### Strengths of K-Nearest Neighbours

•	It is extremely intuitive and simple to put into action. 
•	It is a non-parametric algorithm which means there are no assumptions to be met in order to apply the model. 
•	It does not require to build any model; it simply tags the new data entry based on learning from historical data.
•	It follows a memory-based approach, allowing the algorithm to adapt as we collect new training data.
•	It can be used for both classification and regression problems.
•	It can handle large volumes of training data.
•	It makes highly accurate predictions, competing with the most accurate models.

### Weaknesses of K-Nearest Neighbours

•	It is highly dependent of the value of K, which can be difficult to identify.
•	It could present performance issues as the data set grows.
•	It struggles to predict the output of new observation in high-dimensional datasets.
•	It requires that features have the same scale, since absolute differences in features weight the same.
•	It does not perform well on imbalanced data.
•	It is sensitive to outliers as it simply chose the neighbors based on distance criteria.
•	It has no capability of dealing with missing values.

## Logistic Regression

The Logistic Regression model is a type of statistical analysis often used for predictive analytics and machine learning. It is used to understand the relationship between the dependent variable and one or more independent variables by estimating probabilities using a logistic regression equation.

In this analytics approach, the dependent variable is finite or categorical: either two values (binary regression) or a range of finite options (multinomial regression). 


### Strengths of Logistic Regression

•	It is one of the simplest models making it easy to implement and understand. 
•	It is extremely efficient as it does not require too many calculations compared with other models.
•	It is easy to update to respond to data changes.
•	It is very reliable as it does not only provide the final classification but also includes well-calibrated probabilities associated with each result.
•	It is less prone to overfitting in low-dimensional datasets.
•	It is extremely accurate for simple datasets with linearly separable features.

### Weaknesses of Logistic Regression

•	It tends to be over-fitted in high-dimensional datasets.
•	It cannot solve non-linear problems.
•	It struggles to capture complex relationships in the data.
•	It may result in over-fit when the number of observations is lower than the number of features in the data.


# Conclusions
When deciding which prediction model to use, we should consider the mentioned strengths and weaknesses of KNN and LR to identify which one is more suitable to the business case.
