# Problem1:
Implement a KNN model to classify the animals into categories.

## Import necessary libraries

In [1]:
from collections import Counter

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Select matplotlib's interactive "notebook" backend for the figures in this notebook.
%matplotlib notebook

In [3]:
# Matplotlib configurations

# Display interactive plots. Used this since convenient for displaying plots in github.
# %matplotlib notebook
%matplotlib notebook
# Font and figure size:
# Ref: https://stackoverflow.com/questions/3899980/how-to-change-the-font-size-on-a-matplotlib-plot
SMALL_SIZE = 8
MEDIUM_SIZE = 9
BIGGER_SIZE = 12

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [4]:
# Global display settings for figures and dataframes.
# `mpl` is imported together with the other libraries in the import cell at the
# top of the notebook, so that every dependency is declared in one place.

# matplotlib: high-resolution figures without the top and right axes borders.
mpl.rcParams['figure.dpi'] = 200
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.right'] = False

# pandas: never truncate rows or columns when a dataframe is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Load dataset

In [5]:
# Load the zoo dataset; the path is relative to the current working directory.
zoo_df = pd.read_csv("Zoo.csv")

In [6]:
# Preview the first 10 rows to inspect the column names and how values are encoded.
zoo_df.head(10)

Unnamed: 0,animal name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
5,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
6,calf,1,0,0,1,0,0,0,1,1,1,0,0,4,1,1,1,1
7,carp,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1,0,4
8,catfish,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
9,cavy,1,0,0,1,0,0,0,1,1,1,0,0,4,0,1,0,1


## EDA

In [7]:
# Summary of column dtypes and non-null counts (used to check for missing values).
zoo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   animal name  101 non-null    object
 1   hair         101 non-null    int64 
 2   feathers     101 non-null    int64 
 3   eggs         101 non-null    int64 
 4   milk         101 non-null    int64 
 5   airborne     101 non-null    int64 
 6   aquatic      101 non-null    int64 
 7   predator     101 non-null    int64 
 8   toothed      101 non-null    int64 
 9   backbone     101 non-null    int64 
 10  breathes     101 non-null    int64 
 11  venomous     101 non-null    int64 
 12  fins         101 non-null    int64 
 13  legs         101 non-null    int64 
 14  tail         101 non-null    int64 
 15  domestic     101 non-null    int64 
 16  catsize      101 non-null    int64 
 17  type         101 non-null    int64 
dtypes: int64(17), object(1)
memory usage: 14.3+ KB


In [8]:
# Distinct class labels present in the target column.
zoo_df['type'].unique()

array([1, 4, 2, 7, 6, 5, 3], dtype=int64)

In [9]:
y_name = 'type'        # name of the target column
cols = zoo_df.columns  # every column of the input dataframe

# Split the non-target columns by dtype in a single pass: columns with
# object dtype are treated as categorical, everything else as numeric.
numeric_cols, cat_cols = [], []
for col in cols:
    if col == y_name:
        continue
    if zoo_df[col].dtypes == 'object':
        cat_cols.append(col)
    else:
        numeric_cols.append(col)

In [10]:
#features, numeric_cols, cat_cols = column_segregator(zoo_df, y_name='type')

In [11]:
# Select features and target by column name instead of by position, so the
# split stays correct if the column order of the CSV ever changes.
# `numeric_cols` holds every non-object column except the target (all feature
# columns of this dataset); the object-typed 'animal name' column is an
# identifier and is left out.
X = zoo_df[numeric_cols]
y = zoo_df[y_name]

In [12]:
# One histogram per numeric feature, laid out on a grid with 4 columns.
# The number of rows is derived from the number of features, so the cell does
# not fail (or silently skip features) when the feature count is not exactly 16.
n_plot_cols = 4
n_plot_rows = max(1, -(-len(numeric_cols) // n_plot_cols))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_plot_rows, n_plot_cols, figsize=(8, 6), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(data=X, x=col, ax=ax)

# Hide any grid cells left over when the feature count is not a multiple of 4.
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

fig.suptitle('Feature Distribution', ha='center', fontweight='bold')
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [13]:
# Class frequencies of the target variable.
fig, ax = plt.subplots()
sns.countplot(x=y, ax=ax)  # the class labels are placed on the x axis
ax.set(xlabel='animal categories', ylabel='count', title='output distribution')
plt.show()

<IPython.core.display.Javascript object>

In [14]:
# Summarizing the y distribution.
def y_count_cat(df):
    """Print the number of samples in each class and its share of the total.

    Input:
    ------
    df: iterable of class labels (e.g. the target column); must support len().

    Output:
    -------
    Returns nothing. Prints one line per distinct label, in order of first
    appearance: the label, its sample count, and that count as a percentage
    of all samples (three decimals).
    """
    class_counts = Counter(df)
    for label in class_counts:
        n_samples = class_counts[label]
        share = n_samples / len(df) * 100
        print('Class = {}, n = {} ({:.3f}%)'.format(label, n_samples, share))

In [15]:
# Class balance of the original (non-resampled) target.
y_count_cat(y)

Class = 1, n = 41 (40.594%)
Class = 4, n = 13 (12.871%)
Class = 2, n = 20 (19.802%)
Class = 7, n = 10 (9.901%)
Class = 6, n = 8 (7.921%)
Class = 5, n = 4 (3.960%)
Class = 3, n = 5 (4.950%)


### Observations
- 101 records and 16 features, plus one column containing the names of the animals in the zoo and one column containing the animal category (target).
- No null values.
- The name column can be dropped since it is an identifier rather than a feature.
- All features except "legs" have only two unique values, 0 and 1, hence the binary distributions.
- 7 different categories of animals, with an imbalanced distribution.

## Data preprocessing

## Model building

In [16]:
# Scaling step for the numeric features: min-max scaling wrapped in its own
# single-step pipeline.
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])

In [17]:
# Preprocessor: route the numeric columns through the scaling pipeline.
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_cols)]
)

In [18]:
# KNN classifier with a single neighbour (k=1). No cross-validation happens in this cell.
knn_classifier = KNeighborsClassifier(n_neighbors=1)

In [19]:
# Full model: preprocessing followed by the KNN classifier, fitted as one estimator.
k_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier),
])

In [20]:
# Mean 3-fold cross-validation score for every odd k from 1 to 19.
# cv_scores[j] holds the score for k = 2*j + 1.
cv_scores = []

for n_neighbors in range(1, 21, 2):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    k_clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', knn_classifier),
    ])
    fold_scores = cross_val_score(estimator=k_clf, X=X, y=y, cv=3)
    cv_scores.append(fold_scores.mean())
    

### Visualizations - to find the optimal k value.

In [21]:
# Mean CV score as a function of the number of neighbours.
neighbour_grid = range(1, 21, 2)  # same k values as the sweep above
fig, ax = plt.subplots()
ax.plot(neighbour_grid, cv_scores)
ax.set(xlabel='Neighbours', ylabel='Mean accuracy')
ax.set_title('Neighbours v/s CV scores  (k=3 folds)')
plt.show()

<IPython.core.display.Javascript object>

In [22]:
best_cv_score = max(cv_scores)
# cv_scores[j] belongs to k = 2*j + 1, because the sweep iterated over range(1, 21, 2).
best_k = cv_scores.index(best_cv_score) * 2 + 1

print(" Max accuracy score from cross validation:{:.4f}".format(best_cv_score))
print(" Corresponding k value                   :{}".format(best_k))

 Max accuracy score from cross validation:0.9504
 Corresponding k value                   :1


## Observations:
- From the above analysis, we see that the best results (accuracy = 0.95) from the KNN classifier are obtained when k=1.

In [23]:
# SMOTE oversampler for the minority classes.
# k_neighbors=3 instead of the default 5: the default raises an error on this
# data, presumably because the smallest class has only 4 samples.
# random_state is fixed so that the synthetic samples, and every score computed
# from them, are reproducible across runs.
oversample = SMOTE(k_neighbors=3, random_state=42)

In [24]:
# Resample features and target together to obtain the balanced dataset.
X_os, y_os = oversample.fit_resample(X, y)

In [25]:
# Distribution of target labels after oversampling.
fig, ax = plt.subplots()
sns.countplot(x=y_os, ax=ax)  # the class labels are placed on the x axis
# The x axis shows animal classes; the previous label 'Glass categories' was a
# leftover from a different dataset.
ax.set_xlabel('animal categories')
ax.set_ylabel('count')
ax.set_title('Output distribution after oversampling')
plt.show()

<IPython.core.display.Javascript object>

In [26]:
# Per-class sample counts and percentages of the oversampled target.
y_count_cat(y_os)

Class = 1, n = 41 (14.286%)
Class = 4, n = 41 (14.286%)
Class = 2, n = 41 (14.286%)
Class = 7, n = 41 (14.286%)
Class = 6, n = 41 (14.286%)
Class = 5, n = 41 (14.286%)
Class = 3, n = 41 (14.286%)


In [27]:
# Mean 5-fold cross-validation score on the oversampled data for k = 1..30.
# cv_scores_os[j] holds the score for k = j + 1.
# NOTE(review): X_os was oversampled before the folds are split, so validation
# folds presumably contain synthetic points derived from rows in the training
# folds; these scores are likely optimistic. Consider resampling inside each fold.
cv_scores_os = []
for n_neighbors in range(1, 31, 1):
    knn_classifier_os = KNeighborsClassifier(n_neighbors=n_neighbors)
    k_clf_os = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', knn_classifier_os),
    ])
    fold_scores_os = cross_val_score(estimator=k_clf_os, X=X_os, y=y_os, cv=5)
    cv_scores_os.append(fold_scores_os.mean())

### Visualizations to find optimal k value.

In [28]:
# Mean CV score on the oversampled data as a function of the number of neighbours.
neighbour_grid_os = range(1, 31, 1)  # same k values as the sweep above
fig, ax = plt.subplots()
ax.plot(neighbour_grid_os, cv_scores_os)
ax.set(xlabel='Neighbours', ylabel='Mean accuracy')
ax.set_title('Neighbours v/s CV scores (k=5 folds)')
plt.show()

<IPython.core.display.Javascript object>

In [29]:
best_cv_score_os = max(cv_scores_os)
# cv_scores_os[j] belongs to k = j + 1, because the sweep iterated over range(1, 31, 1).
best_k_os = cv_scores_os.index(best_cv_score_os) + 1

print('Max accuracy score from cross validation: {:.4f}'.format(best_cv_score_os))
print('Corresponding k value                   : {}'.format(best_k_os))

Max accuracy score from cross validation: 1.0000
Corresponding k value                   : 1


## Observations:
- From the iterations over different values of k with cross-validation, we find that k=1 gives the maximum accuracy.

## SMOTE for handling imbalanced data like the Zoo dataset
Ref: https://machinelearningmastery.com/multi-class-imbalanced-classification/

**Note**: SMOTE oversamples all classes to have the same number of examples as the class with the most examples (in our case, class 1).

In [30]:
# Final model: preprocessing followed by a KNN classifier with k=1.
knn_classifier_os = KNeighborsClassifier(n_neighbors=1)
k_clf_os = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', knn_classifier_os),
])

In [31]:
# Fit the final pipeline (scaler + KNN) on the full oversampled dataset.
k_clf_os.fit(X_os, y_os)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   MinMaxScaler())]),
                                                  ['hair', 'feathers', 'eggs',
                                                   'milk', 'airborne',
                                                   'aquatic', 'predator',
                                                   'toothed', 'backbone',
                                                   'breathes', 'venomous',
                                                   'fins', 'legs', 'tail',
                                                   'domestic', 'catsize'])])),
                ('classifier', KNeighborsClassifier(n_neighbors=1))])

In [32]:
# Predict the classes of the original (non-oversampled) feature rows.
# NOTE(review): no held-out split is made in this notebook, and X_os was derived
# from X, so these rows are presumably part of the data the model was fitted on.
# Confirm before reading the result as test performance.
y_pred = k_clf_os.predict(X)

In [33]:
# Accuracy of the final model's predictions against the original labels.
final_accuracy = accuracy_score(y, y_pred)
print('Accuracy score for final model: {}'.format(final_accuracy))

Accuracy score for final model: 1.0


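**Note:** Since only one dataset was provided and no separate test set was held out, the accuracy above is computed on the same data that was used for training, so a perfect score is the best possible outcome and does not show how well the model generalizes. The real test for the model is its performance on a previously unseen test dataset.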

## Conclusion:
A KNN classifier was used to classify the animals in the dataset provided.