# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `species` using the other variables in the dataset.

**Dummify** all variables that require this.

In [10]:
import pandas as pd
from palmerpenguins import load_penguins
penguins = load_penguins()
penguins = penguins.dropna()
penguins["year"] = penguins["year"].astype(str) # ensure year is a categorical variable
penguins = pd.get_dummies(penguins, columns = ["species", "island", "sex"], drop_first = False)
penguins.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,sex_female,sex_male
0,39.1,18.7,181.0,3750.0,2007,1,0,0,0,0,1,0,1
1,39.5,17.4,186.0,3800.0,2007,1,0,0,0,0,1,1,0
2,40.3,18.0,195.0,3250.0,2007,1,0,0,0,0,1,1,0
4,36.7,19.3,193.0,3450.0,2007,1,0,0,0,0,1,1,0
5,39.3,20.6,190.0,3650.0,2007,1,0,0,0,0,1,0,1


Let's use the other variables to predict `species`. Prepare your data and fit the following models on the entire dataset:

* Two kNN models (for different values of K)
* Two decision tree models (for different complexities of trees)

Compute the following, for each of your models, on test data. Keep in mind that you may need to stratify your creation of the training and test data.

* Confusion matrix
* Overall Accuracy
* Precision, Recall, AUC, and F1-score for each species

Create one ROC plot for the species of your choice.

In [110]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [129]:
pen = load_penguins()
pen["year"] = pen["year"].astype(str) # ensure year is a categorical variable
pen = pen.dropna()

X = pen.drop(["species"], axis = 1) # drop bc identification variables/response variable
y = pen["species"]

# Split into test and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # Split into testing and validation

# KNN model with 3 neighbors

In [146]:
ct = ColumnTransformer(
  [
    ("dummify", 
    OneHotEncoder(sparse_output = False, handle_unknown='ignore'), # do not predict new/unknown categories
    make_column_selector(dtype_include=object)),
    ("standardize", 
    StandardScaler(), 
    make_column_selector(dtype_include=np.number))
  ],
  remainder = "passthrough"
)

knnPipeline = Pipeline(
  [("preprocessing", ct),
  ("knn_model", KNeighborsClassifier(n_neighbors = 3))]
)

knn_model_fitted = knnPipeline.fit(X,y)
knn_model_fitted

In [147]:
# Accuracy Score
y_test_pred = knnPipeline.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred) 

0.9880952380952381

In [148]:
# Confusion Matrix
confusion_matrix(y_test, y_test_pred)

array([[35,  1,  0],
       [ 0, 19,  0],
       [ 0,  0, 29]])

In [149]:
# Precision, Recall, F1-score
classification_report(y_test, y_test_pred)

'              precision    recall  f1-score   support\n\n      Adelie       1.00      0.97      0.99        36\n   Chinstrap       0.95      1.00      0.97        19\n      Gentoo       1.00      1.00      1.00        29\n\n    accuracy                           0.99        84\n   macro avg       0.98      0.99      0.99        84\nweighted avg       0.99      0.99      0.99        84\n'

In [150]:
# Compute AUC-ROC score
y_prob = decisiontreePipeline.predict_proba(X_test)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9955022036370343

# KNN model with 12 neighbors

In [141]:
knnPipeline2 = Pipeline(
  [("preprocessing", ct),
  ("knn_model", KNeighborsClassifier(n_neighbors = 12))]
)

knn_model_fitted2 = knnPipeline2.fit(X,y)
knn_model_fitted2

In [142]:
# Accuracy Score
y_test_pred = knnPipeline2.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred) 

0.9761904761904762

In [143]:
# Confusion Matrix
confusion_matrix(y_test, y_test_pred)

array([[35,  1,  0],
       [ 1, 18,  0],
       [ 0,  0, 29]])

In [144]:
# Precision, Recall, F1-score
classification_report(y_test, y_test_pred)

'              precision    recall  f1-score   support\n\n      Adelie       0.97      0.97      0.97        36\n   Chinstrap       0.95      0.95      0.95        19\n      Gentoo       1.00      1.00      1.00        29\n\n    accuracy                           0.98        84\n   macro avg       0.97      0.97      0.97        84\nweighted avg       0.98      0.98      0.98        84\n'

In [145]:
# Compute AUC-ROC score
y_prob = decisiontreePipeline.predict_proba(X_test)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9955022036370343

# Decision tree with minimum 10 samples per leaf

In [18]:
decisiontreePipeline = Pipeline(
  [("preprocessing", ct),
  ("decision_tree_model", DecisionTreeClassifier(min_samples_leaf = 10))]
)

decision_tree_model_fitted = decisiontreePipeline.fit(X,y)
decision_tree_model_fitted

In [44]:
# Accuracy Score
y_test_pred = decisiontreePipeline.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred) 

0.9642857142857143

In [45]:
# Confusion Matrix
confusion_matrix(y_test, y_test_pred)

array([[34,  1,  0],
       [ 2, 16,  0],
       [ 0,  0, 31]])

In [48]:
# Precision, Recall, F1-score
classification_report(y_test, y_test_pred)

'              precision    recall  f1-score   support\n\n      Adelie       0.94      0.97      0.96        35\n   Chinstrap       0.94      0.89      0.91        18\n      Gentoo       1.00      1.00      1.00        31\n\n    accuracy                           0.96        84\n   macro avg       0.96      0.95      0.96        84\nweighted avg       0.96      0.96      0.96        84\n'

In [47]:
# Compute AUC-ROC score
from sklearn.metrics import roc_auc_score
y_prob = decisiontreePipeline.predict_proba(X_test)
roc_auc_score(y_test, y_prob, multi_class='ovr')

0.9981433217827775

# Decision tree with minimum 3 samples per leaf

In [56]:
decisiontreePipeline2 = Pipeline(
  [("preprocessing", ct),
  ("decision_tree_model", DecisionTreeClassifier(min_samples_leaf = 3))]
)

decision_tree_model_fitted2 = decisiontreePipeline2.fit(X,y)
decision_tree_model_fitted2

In [57]:
# Accuracy Score
y_test_pred = decisiontreePipeline2.predict(X_test)
metrics.accuracy_score(y_test, y_test_pred) 

1.0

In [58]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_test_pred)

array([[35,  0,  0],
       [ 0, 18,  0],
       [ 0,  0, 31]])

In [59]:
# Precision, Recall, F1-score
from sklearn.metrics import classification_report
classification_report(y_test, y_test_pred)

'              precision    recall  f1-score   support\n\n      Adelie       1.00      1.00      1.00        35\n   Chinstrap       1.00      1.00      1.00        18\n      Gentoo       1.00      1.00      1.00        31\n\n    accuracy                           1.00        84\n   macro avg       1.00      1.00      1.00        84\nweighted avg       1.00      1.00      1.00        84\n'

In [60]:
# Compute AUC-ROC score
from sklearn.metrics import roc_auc_score
y_prob = decisiontreePipeline2.predict_proba(X_test)
roc_auc_score(y_test, y_prob, multi_class='ovr')

1.0

# Plot AUC curve for Decision Tree with 10 minimum samples per leaf

In [155]:
import matplotlib.pyplot as plt
#from sklearn.metrics import plot_roc_curve

#plot_roc_curve(knnPipeline.named_steps['knn_model'], X_test, y_test) 
#plt.show()

#probs = model.predict_proba(X_test)
preds = y_prob[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

ValueError: multiclass format is not supported