## Step-1: Business Problem
- Build a predictive model that predicts the species of a penguin from its physical attributes, so that researchers can use it to classify penguins in the field without needing an experienced biologist.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter('ignore')

In [None]:
# Read the penguin measurements from the CSV in the working directory and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer='penguins_size.csv')
df.head(5)

In [None]:
# Print the column names, dtypes and non-null counts to spot missing values.
df.info()

## Step-3: Data Preprocessing


#### Data Cleaning

In [None]:
# Recode the "." placeholder in the `sex` column as "MALE".
# The result is assigned back to the column: calling `replace(..., inplace=True)`
# on `df['sex']` acts on an intermediate Series (chained assignment), which
# pandas deprecates and which leaves `df` unchanged when Copy-on-Write is active.
df['sex'] = df['sex'].replace(".", "MALE")

In [None]:
# Percentage of missing values per column (mean of the boolean mask = fraction missing).
df.isna().mean().mul(100)

In [None]:
# Impute missing values:
#   - continuous variables with the column mean
#   - discrete variable (`sex`) with the column mode
#
# Each result is assigned back to its column. `df[col].fillna(..., inplace=True)`
# acts on an intermediate Series (chained assignment), which pandas deprecates
# and which leaves `df` unchanged when Copy-on-Write is active.
continuous_cols = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
for col in continuous_cols:
    df[col] = df[col].fillna(df[col].mean())

# mode() returns a Series (ties are possible); take the first entry.
df['sex'] = df['sex'].fillna(df['sex'].mode()[0])

#### Encoding

In [34]:
# Target: the species label.
y = df['species']

# Predictors: every other column, with categorical columns one-hot encoded.
# drop_first=True drops the first level of each categorical column so the
# dummy columns are not redundant.
features = df.drop(columns="species")
X = pd.get_dummies(features, drop_first=True)

In [36]:
# Display the encoded feature matrix (pandas truncates the middle rows).
X

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Dream,island_Torgersen,sex_MALE
0,39.10000,18.70000,181.000000,3750.000000,0,1,1
1,39.50000,17.40000,186.000000,3800.000000,0,1,0
2,40.30000,18.00000,195.000000,3250.000000,0,1,0
3,43.92193,17.15117,200.915205,4201.754386,0,1,1
4,36.70000,19.30000,193.000000,3450.000000,0,1,0
...,...,...,...,...,...,...,...
339,43.92193,17.15117,200.915205,4201.754386,0,0,1
340,46.80000,14.30000,215.000000,4850.000000,0,0,0
341,50.40000,15.70000,222.000000,5750.000000,0,0,1
342,45.20000,14.80000,212.000000,5200.000000,0,0,0


### Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training and the remainder to testing; the fixed
# random_state makes the split reproducible.
split_parts = train_test_split(X, y, train_size=0.8, random_state=6)
X_train, X_test, y_train, y_test = split_parts

## Step-4,5: Modelling & Evaluation
#### Random Forest Classifier - with default Hyperparameters

In [None]:
# Baseline: a random forest with default hyperparameters, scored on the
# training split, with 5-fold cross-validation, and on the held-out test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#Modelling
rf_default_model=RandomForestClassifier(random_state=6)
rf_default_model.fit(X_train,y_train)


#Prediction and Eval on Train
ypred_train = rf_default_model.predict(X_train)
print("Train Accuracy:",accuracy_score(y_train,ypred_train))


#CV Score
# Cross-validate on the training split only, so the held-out test rows never
# take part in any fold and the test accuracy below stays an independent check.
# cross_val_score fits clones of the estimator; the model fitted above is untouched.
print("CV Score:",cross_val_score(rf_default_model,X_train,y_train,cv=5).mean())

#Predict and Eval on Test Data
ypred_test = rf_default_model.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print("Test Accuracy:",accuracy_score(y_test,ypred_test))


#### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Tune the number of trees: try every forest size from 1 to 100 and score each
# with 5-fold cross-validated accuracy on the training split.
n_estimators_candidates = list(range(1, 101))
param_grid = {'n_estimators': n_estimators_candidates}
estimator = RandomForestClassifier(random_state=0)

grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)

# Best forest size found by the search.
grid.best_params_

In [None]:
# Raw feature importances of the refit best estimator, in the column order of X_train.
grid.best_estimator_.feature_importances_

In [None]:
# Label each importance with its feature name for readability.
feats = pd.DataFrame(
    {'Feature Importance': grid.best_estimator_.feature_importances_},
    index=X.columns,
)
feats

In [None]:
# Keep only the features the tuned forest actually used (importance above zero).
has_importance = feats['Feature Importance'].gt(0)
feats_imp = feats.loc[has_importance]
feats_imp

In [None]:
# Names of the retained features, as a plain Python list for column selection.
importance_features_list = list(feats_imp.index)
importance_features_list

## Final Random Forest Model
#### with best hyperparameters & with important features

In [None]:
# Final model: a random forest refit with the tuned hyperparameters on the
# retained features only.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

X_imp=X[importance_features_list]

# Same train_size and random_state as the earlier split.
X_train,X_test,y_train,y_test = train_test_split(X_imp,y,train_size=0.8,
                                                 random_state=6)

# Take n_estimators from the grid search instead of a hard-coded number, so
# this cell stays in sync with the search result when the notebook is re-run.
final_rf_model=RandomForestClassifier(**grid.best_params_,random_state=0)
final_rf_model.fit(X_train,y_train)


#Prediction and Eval on Train
ypred_train = final_rf_model.predict(X_train)
print("Train Accuracy:",accuracy_score(y_train,ypred_train))


#CV Score
# Cross-validate on the training split only, so the held-out test rows never
# take part in any fold and the test accuracy below stays an independent check.
print("CV Score:",cross_val_score(final_rf_model,X_train,y_train,cv=5).mean())

#Predict and Eval on Test Data
ypred_test = final_rf_model.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print("Test Accuracy:",accuracy_score(y_test,ypred_test))


## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Per-class breakdown of the test predictions: rows are the true species,
# columns are the predicted species.
confusion_matrix(y_true=y_test, y_pred=ypred_test)