# Palmer Penguins Modeling

Import the Palmer Penguins dataset and print out the first few rows.

Suppose we want to predict `bill_depth_mm` using the other variables in the dataset.

**Dummify** (one-hot encode) every categorical variable that requires it.

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error

In [None]:
# Code Here
!pip install palmerpenguins

Collecting palmerpenguins
  Downloading palmerpenguins-0.1.4-py3-none-any.whl.metadata (2.0 kB)
Downloading palmerpenguins-0.1.4-py3-none-any.whl (17 kB)
Installing collected packages: palmerpenguins
Successfully installed palmerpenguins-0.1.4


In [None]:
from palmerpenguins import load_penguins

# Load the table and drop every row containing a missing value in a single
# chained step, so `penguins` is only ever bound to the cleaned frame.
penguins = load_penguins().dropna()

# Preview the first rows; the original row labels are kept, so gaps in the
# index mark rows that were dropped.
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007


Let's use the other variables to predict `bill_depth_mm`. Prepare your data and fit the following models on a training subset of the dataset:

* Four different models, each containing a different set of predictor variables

Create a plot like the right-hand plot of Fig. 1 in our `Model Validation` chapter, showing the training and test error for each of your four models.

Which of your models was best?

In [None]:
# Preprocessing and estimator shared by all of the model cells below.

# One-hot encode ("dummify") the three categorical predictors; any other
# column handed to the transformer is forwarded unchanged.
ct = make_column_transformer(
    (OneHotEncoder(), ['species', 'sex', 'island']),
    remainder="passthrough",
)

# Ordinary least-squares regression applied to the encoded columns.
lr = LinearRegression()

# Chain encoding and regression so a single fit/predict call runs both stages.
pipeline = Pipeline(
    steps=[
        ('dummify_everything', ct),
        ('ols', lr),
    ]
)

In [None]:
# Model 1: bill length plus the three categorical predictors.
model1_predictors = ["bill_length_mm", "species", "island", "sex"]
x1 = penguins[model1_predictors]
y = penguins["bill_depth_mm"]  # response variable

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    x1, y, test_size=0.25, random_state=42
)

# Fit the shared pipeline on the training rows, then predict on both subsets.
# Note: `pipeline` is refit in place, so after this cell it holds Model 1's fit.
y_train1_pred = pipeline.fit(X_train1, y_train1).predict(X_train1)
y_test1_pred = pipeline.predict(X_test1)

# Root-mean-squared error on the training rows.
mse_train1 = mean_squared_error(y_train1, y_train1_pred)
rmse_train1 = np.sqrt(mse_train1)

# Root-mean-squared error on the held-out rows.
mse_val1 = mean_squared_error(y_test1, y_test1_pred)
rmse_val1 = np.sqrt(mse_val1)

print("Model 1 - Training RMSE:", rmse_train1)
print("Model 1 - Validation RMSE:", rmse_val1)

Model 1 - Training RMSE: 0.7583618254409685
Model 1 - Validation RMSE: 0.962915532536473


In [None]:
# Model 2: Model 1's predictors plus the observation year.
model2_predictors = ["bill_length_mm", "species", "island", "sex", "year"]
x2 = penguins[model2_predictors]
y = penguins["bill_depth_mm"]  # response variable

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    x2, y, test_size=0.25, random_state=42
)

# Fit the shared pipeline on the training rows, then predict on both subsets.
# Note: `pipeline` is refit in place, so after this cell it holds Model 2's fit.
y_train2_pred = pipeline.fit(X_train2, y_train2).predict(X_train2)
y_test2_pred = pipeline.predict(X_test2)

# Root-mean-squared error on the training rows.
mse_train2 = mean_squared_error(y_train2, y_train2_pred)
rmse_train2 = np.sqrt(mse_train2)

# Root-mean-squared error on the held-out rows.
mse_val2 = mean_squared_error(y_test2, y_test2_pred)
rmse_val2 = np.sqrt(mse_val2)

print("Model 2 - Training RMSE:", rmse_train2)
print("Model 2 - Validation RMSE:", rmse_val2)

Model 2 - Training RMSE: 0.7552581962277672
Model 2 - Validation RMSE: 0.9590935029048298


In [None]:
# Model 3: Model 2's predictors plus body mass.
model3_predictors = [
    "bill_length_mm", "species", "island", "sex", "year", "body_mass_g",
]
x3 = penguins[model3_predictors]
y = penguins["bill_depth_mm"]  # response variable

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    x3, y, test_size=0.25, random_state=42
)

# Fit the shared pipeline on the training rows, then predict on both subsets.
# Note: `pipeline` is refit in place, so after this cell it holds Model 3's fit.
y_train3_pred = pipeline.fit(X_train3, y_train3).predict(X_train3)
y_test3_pred = pipeline.predict(X_test3)

# Root-mean-squared error on the training rows.
mse_train3 = mean_squared_error(y_train3, y_train3_pred)
rmse_train3 = np.sqrt(mse_train3)

# Root-mean-squared error on the held-out rows.
mse_val3 = mean_squared_error(y_test3, y_test3_pred)
rmse_val3 = np.sqrt(mse_val3)

print("Model 3 - Training RMSE:", rmse_train3)
print("Model 3 - Validation RMSE:", rmse_val3)


Model 3 - Training RMSE: 0.7380791244337395
Model 3 - Validation RMSE: 0.9232080727055915


In [None]:
# Model 4: Model 3's predictors plus flipper length.
model4_predictors = [
    "bill_length_mm", "species", "island", "sex", "year",
    "body_mass_g", "flipper_length_mm",
]
x4 = penguins[model4_predictors]
y = penguins["bill_depth_mm"]  # response variable

# Hold out 25% of the rows; the fixed seed keeps the split reproducible.
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    x4, y, test_size=0.25, random_state=42
)

# Fit the shared pipeline on the training rows, then predict on both subsets.
# Note: `pipeline` is refit in place, so after this cell it holds Model 4's fit.
y_train4_pred = pipeline.fit(X_train4, y_train4).predict(X_train4)
y_test4_pred = pipeline.predict(X_test4)

# Root-mean-squared error on the training rows.
mse_train4 = mean_squared_error(y_train4, y_train4_pred)
rmse_train4 = np.sqrt(mse_train4)

# Root-mean-squared error on the held-out rows.
mse_val4 = mean_squared_error(y_test4, y_test4_pred)
rmse_val4 = np.sqrt(mse_val4)

print("Model 4 - Training RMSE:", rmse_train4)
print("Model 4 - Validation RMSE:", rmse_val4)

Model 4 - Training RMSE: 0.732353508933187
Model 4 - Validation RMSE: 0.9014169098209783
