In [1]:
#using the palmerpenguins data set to train a model to predict the species based on phenotypes

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier  #uses Stochastic Gradient Descent for training for classification tasks
from sklearn.preprocessing import StandardScaler #for scaling or normalizing data
from sklearn.model_selection import train_test_split  #splits dataset into training and testing sets
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #metric to evaluate performance by comparing predicted values (y_pred) and true values (y_test)

In [2]:
## reads data and checks for rows with empty points
penguins = sns.load_dataset("penguins")
print(penguins.isna().sum())
### deletes rows with empty points ### 
penguins.dropna(inplace=True)
print(penguins.isna().sum())

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [3]:
### numpy arrays für for x and y ###
species = penguins['species']
y = np.array(species)
values = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
x = np.array(values)
print(x)

[[  39.1   18.7  181.  3750. ]
 [  39.5   17.4  186.  3800. ]
 [  40.3   18.   195.  3250. ]
 ...
 [  50.4   15.7  222.  5750. ]
 [  45.2   14.8  212.  5200. ]
 [  49.9   16.1  213.  5400. ]]


In [4]:
### scaling data so that all have zero mean and unit variance###
scaler = StandardScaler() #initializes StandardScaler object
x= scaler.fit_transform(x) #applies scaling (standardizes data) to x
print(type(x))
print(x)

<class 'numpy.ndarray'>
[[-0.89604189  0.7807321  -1.42675157 -0.56847478]
 [-0.82278787  0.11958397 -1.06947358 -0.50628618]
 [-0.67627982  0.42472926 -0.42637319 -1.1903608 ]
 ...
 [ 1.17338426 -0.74499437  1.50292796  1.91906927]
 [ 0.22108196 -1.20271231  0.78837197  1.23499466]
 [ 1.08181673 -0.54156417  0.85982757  1.48374906]]


In [5]:
### splitting into training and testing data ###
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) #0.2 -> 20% for testing data, 80% for training, 42 -> rng seed to see if split reproducible
print(x_train)

[[-0.56639879 -1.76214535  0.93128317  0.55092004]
 [-1.22568499  0.32301416 -0.71219559 -1.06598359]
 [ 1.10013024 -0.64327927  1.64583915  1.42156046]
 ...
 [-0.78616086  0.27215661 -0.56928439 -1.2525494 ]
 [ 0.27602248 -1.10099721  1.36001676  0.98624025]
 [-1.07917695 -0.08384623 -1.42675157 -1.2836437 ]]


In [6]:
### scikit-learn classifier initialize and training ###
clf = SGDClassifier(loss='hinge', random_state=42) 
clf.fit(x_train, y_train) #trains model clf using x and y train

In [7]:
### predictions on test data ###
y_pred = clf.predict(x_test) #y_pred will includ predicitons on test data x_test)

In [8]:
### test results ###
print("Accuracy:", accuracy_score(y_test, y_pred)) #compares true with test and calculates accuracy
print(classification_report(y_test, y_pred))  #several key indicators 
print(confusion_matrix(y_test, y_pred))  #shows true predictions with rows being true species and columns being predicted species

Accuracy: 1.0
              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        31
   Chinstrap       1.00      1.00      1.00        13
      Gentoo       1.00      1.00      1.00        23

    accuracy                           1.00        67
   macro avg       1.00      1.00      1.00        67
weighted avg       1.00      1.00      1.00        67

[[31  0  0]
 [ 0 13  0]
 [ 0  0 23]]


In [9]:
### testing specific data points ###
z = np.array([[50.4,   15.7,  222.0,  5750.0]]) # testing (341, Gentoo)
#z = np.array([[39.1,   18.7,  181.0,  3750.0]]) # testing (0, Adelie)
#z = np.array([[51.3, 19.9, 198.0, 3700.0]])   # testing (Chinstrap)
z = scaler.transform(z)                      # new data points are scaled to match training data
print(z)
y_pred = clf.predict(z) #predicts species for the data points in z
print(y_pred)

[[ 1.17338426 -0.74499437  1.50292796  1.91906927]]
['Gentoo']
