# Classification Model

In this notebook, I will show you how to build a classification model (to predict gender of penguin) with seaborn lib.


1. Python
2. Seaborn Data
3. Classification Pipeline

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Pipeline

```
[0. Data] -> [Step 1. Data Processing] -> [Step 2. Feature Selection] -> [Step 3. Build Model]  (  [3.1 Evaluation] -> [3.2 Make A Prediction] )
```

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'taxis',
 'tips',
 'titanic']

## Step 0. Load the dataset

In [3]:
df = sns.load_dataset('penguins')
df.head(10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,Female
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,Male
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,


## Step 1. Data Processing

In [4]:
df.dropna(inplace=True)
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


### Dealing with the Missing Values

#### Create a copy of the dataset

In [5]:
df1 = df.copy()
df1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [6]:
pd.get_dummies(df1['sex']).head()

Unnamed: 0,Female,Male
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [7]:
se=pd.get_dummies(df1['sex'], drop_first=True)
se.head(3)

Unnamed: 0,Male
0,1
1,0
2,0


### Label Encoding

In [94]:
df1.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [105]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [107]:
df1['island']= le.fit_transform(df1['island'])
df1['island']

0      2
1      2
2      2
4      2
5      2
      ..
338    0
340    0
341    0
342    0
343    0
Name: island, Length: 333, dtype: int64

In [108]:
df1.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,2,39.1,18.7,181.0,3750.0,Male
1,Adelie,2,39.5,17.4,186.0,3800.0,Female
2,Adelie,2,40.3,18.0,195.0,3250.0,Female
4,Adelie,2,36.7,19.3,193.0,3450.0,Female
5,Adelie,2,39.3,20.6,190.0,3650.0,Male


### Concatenation

In [109]:
df2 = pd.concat([df1, se], axis=1)

In [110]:
df2.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,Male
0,Adelie,2,39.1,18.7,181.0,3750.0,Male,1
1,Adelie,2,39.5,17.4,186.0,3800.0,Female,0
2,Adelie,2,40.3,18.0,195.0,3250.0,Female,0
4,Adelie,2,36.7,19.3,193.0,3450.0,Female,0
5,Adelie,2,39.3,20.6,190.0,3650.0,Male,1


## Step 2. Feature Selection

### Drop the species column from df1

In [112]:
#### Drop the species column from df1
df2.drop(['species','sex'], inplace=True, axis=1)
df2.head()

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,Male
0,2,39.1,18.7,181.0,3750.0,1
1,2,39.5,17.4,186.0,3800.0,0
2,2,40.3,18.0,195.0,3250.0,0
4,2,36.7,19.3,193.0,3450.0,0
5,2,39.3,20.6,190.0,3650.0,1


## Step 3. Build Model

In [113]:
from sklearn.model_selection import train_test_split
X = df2
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=40)

### 3.1 Model selection   Random Forest Classifier

In [114]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

model1 = rfc.fit(X_train, y_train)
prediction1 = model1.predict(X_test)

print("Acc on training data: {:,.3f}".format(rfc.score(X_train, y_train)))
print("Acc on test data: {:,.3f}".format(rfc.score(X_test, y_test)))

Acc on training data: 1.000
Acc on test data: 1.000


In [115]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### 3.2 Evaluation

In [116]:
print(confusion_matrix(y_test, prediction1))

[[41  0  0]
 [ 0 23  0]
 [ 0  0 36]]


In [117]:
print(classification_report(y_test, prediction1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [82]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [83]:
models=[]

models.append(('KNN', KNeighborsClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
models.append(('rfc', RandomForestClassifier()))


In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [84]:
# evaluate each model

results =[]
names=[]
for name , model in models:
    kfold=KFold(n_splits=10, random_state=40)
    cv_results= cross_val_score(model, X_train, y_train, cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)

    msg= '%s:, %f, (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)



KNN:, 0.755616, (0.060228)
NB:, 0.961413, (0.048394)
SVM:, 0.747283, (0.076540)
rfc:, 0.986957, (0.039130)


In [37]:
# make predictions on test datasets

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
predictions= rfc.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

[[41  0  0]
 [ 0 23  0]
 [ 0  0 36]]


In [38]:
NB = GaussianNB()
NB.fit(X_train, y_train)
predcitions = NB.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        41
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00        36

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

[[41  0  0]
 [ 0 23  0]
 [ 0  0 36]]


In [39]:
KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)
predictions= KNN.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

0.78
              precision    recall  f1-score   support

           0       0.68      0.95      0.80        41
           1       0.91      0.43      0.59        23
           2       0.91      0.81      0.85        36

    accuracy                           0.78       100
   macro avg       0.83      0.73      0.75       100
weighted avg       0.82      0.78      0.77       100

[[39  0  2]
 [12 10  1]
 [ 6  1 29]]
