In [1]:
import pandas as pd
from palmerpenguins import load_penguins

  import pkg_resources


## Original dataset

In [2]:
# load dataset
penguins = load_penguins()
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [3]:
# observe shape
rows, cols = penguins.shape
print(f"There are {rows} observations and {cols} features in the original dataset.")

There are 344 observations and 8 features in the original dataset.


## Separate predictor and target variables

In [4]:
predictors = penguins.drop(columns=['species']).copy()
target = penguins['species'].copy()

print(f"Shape of predictors: {predictors.shape}")
print(f"Shape of target: {target.shape}")

Shape of predictors: (344, 7)
Shape of target: (344,)


## Construct a simple version of `penguins`

- keep numerical variables of `bill_length_mm`, `bill_depth_mm`, `flipper_length_mm`, `body_mass_g`
- drop observations with missing value

In [5]:
# select columns
selected_columns = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
predictors_simple = predictors[selected_columns].copy()

print(f"Shape of simple predictors: {predictors_simple.shape}")

Shape of simple predictors: (344, 4)


In [6]:
# drop records containing missing values
predictors_simple.dropna(inplace=True)

print(f"Shape of simple predictors after dropping missing values: {predictors_simple.shape}")

Shape of simple predictors after dropping missing values: (342, 4)


In [7]:
predictors_simple

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0
...,...,...,...,...
339,55.8,19.8,207.0,4000.0
340,43.5,18.1,202.0,3400.0
341,49.6,18.2,193.0,3775.0
342,50.8,19.0,210.0,4100.0


In [8]:
# statistics
predictors_simple.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


### Standardize the dataset

In [9]:
# standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(predictors_simple)

predictors_simple_scaled = pd.DataFrame(
    data=scaler.transform(predictors_simple),
    columns=predictors_simple.columns
)

In [10]:
predictors_simple_scaled

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,-0.884499,0.785449,-1.418347,-0.564142
1,-0.811126,0.126188,-1.062250,-0.501703
2,-0.664380,0.430462,-0.421277,-1.188532
3,-1.324737,1.089724,-0.563715,-0.938776
4,-0.847812,1.748985,-0.777373,-0.689020
...,...,...,...,...
337,2.178824,1.343286,0.433355,-0.251947
338,-0.077396,0.481175,0.077258,-1.001215
339,1.041543,0.531887,-0.563715,-0.532923
340,1.261662,0.937586,0.647013,-0.127069


In [11]:
# statistics
predictors_simple_scaled.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,1.662088e-16,4.155221e-16,-8.310441e-16,8.310441e-17
std,1.001465,1.001465,1.001465,1.001465
min,-2.168526,-2.054446,-2.05932,-1.875362
25%,-0.8615697,-0.7866355,-0.7773731,-0.8138982
50%,0.09686524,0.07547549,-0.2788381,-0.1895079
75%,0.839767,0.7854492,0.8606705,0.6846384
max,2.875868,2.205397,2.142618,2.620248


### Export the dataset

Export both unscaled and scaled versions of the predictors as `X.csv` and `X_scaled.csv`, along with the target values as `y.csv`.

In [13]:
predictors_simple.to_csv("X.csv")
predictors_simple_scaled.to_csv("X_scaled.csv")
target.to_csv("y.csv")