# Model Deployment using flask

### Importing required libraries

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Loading the dataset

In [50]:
penguinsData = pd.read_csv("data/palmerPenguins.csv")
penguinsData.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


### Cleaning the Data

In [51]:
penguinsData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [52]:
penguinsData.isna().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [53]:
penguinsData.loc[penguinsData["sex"].isnull()]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
3,Adelie,Torgersen,,,,,
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,
10,Adelie,Torgersen,37.8,17.1,186.0,3300.0,
11,Adelie,Torgersen,37.8,17.3,180.0,3700.0,
47,Adelie,Dream,37.5,18.9,179.0,2975.0,
246,Gentoo,Biscoe,44.5,14.3,216.0,4100.0,
286,Gentoo,Biscoe,46.2,14.4,214.0,4650.0,
324,Gentoo,Biscoe,47.3,13.8,216.0,4725.0,
339,Gentoo,Biscoe,,,,,


We drop all the rows where sex is null

In [54]:
penguinsData = penguinsData.dropna()
penguinsData.isna().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [55]:
penguinsData["sex"].unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

Column sex has an unwanted values. It has to be removed.

In [56]:
penguinsData.loc[penguinsData["sex"]=="."]

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
336,Gentoo,Biscoe,44.5,15.7,217.0,4875.0,.


In [57]:
penguinsData = penguinsData.drop(336)

### Building a Random Forest Classifier for predicting penguin species

In [58]:
X = penguinsData.drop("species",axis=1)
y = penguinsData["species"]

#### Creating Dummy variables for island and sex

In [60]:
X = pd.get_dummies(X,drop_first=True)

#### Splitting model into train and test set

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,test_size=0.3)

#### Random Forest Model

In [63]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

RandomForestClassifier()

#### Checking Model performance

In [64]:
y_pred = rfc.predict(X_test)
accuracyScore = accuracy_score(y_pred,y_test)
print("Accuracy of the Model: ",accuracyScore)

Accuracy of the Model:  0.99


In [65]:
confusionMatrix = confusion_matrix(y_pred,y_test)
print("Confusion Matrix: \n", confusionMatrix)

Confusion Matrix: 
 [[48  1  0]
 [ 0 17  0]
 [ 0  0 34]]


## Creating a Pickle file

In [67]:
pickle.dump(rfc, open("penguinsRFCModel.pkl","wb"))