<a href="https://colab.research.google.com/github/michaeledge27/CSCI290/blob/main/notebooks/PalmerPenguinsProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


In [3]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [4]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [5]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


In [6]:
penguins.columns

Index(['species', 'island', 'bill_length_mm', 'bill_depth_mm',
       'flipper_length_mm', 'body_mass_g', 'sex', 'year'],
      dtype='object')

In [7]:
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# k-NN Classifier


In [8]:
from collections import Counter

In [9]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

In [20]:
def get_k_neighbors(X_train, y_train, x, k):
  distances = []
  for i in range(len(X_train)):
    distance = euclidean_distance(X_train.iloc[i], x)
    distances.append((distance, y_train.iloc[i]))

  distances.sort(key=lambda x: x[0])  # Sort by distance (the first element of the tuple)
  k_nearest_labels = [label for _, label in distances[:k]]
  return k_nearest_labels

In [30]:
def predict(X_train, y_train, X_test, k):
    predictions = []
    for i in range(len(X_test)):
        k_nearest_labels = get_k_neighbors(X_train, y_train, X_test.iloc[i], k)
        # Get the most common label directly
        most_common = Counter(k_nearest_labels).most_common(1)[0][0]
        predictions.append(most_common)
    return np.array(predictions)

In [31]:

predictions = predict(X_train, y_train, X_test, k=3)
print(predictions)

['Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Gentoo' 'Gentoo' 'Adelie' 'Gentoo'
 'Gentoo' 'Adelie' 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Adelie' 'Adelie'
 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Adelie'
 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Gentoo'
 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie'
 'Gentoo' 'Adelie' 'Gentoo' 'Gentoo' 'Adelie' 'Gentoo' 'Adelie' 'Adelie'
 'Adelie' 'Adelie' 'Adelie' 'Gentoo' 'Gentoo' 'Gentoo' 'Adelie' 'Adelie'
 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Gentoo' 'Adelie' 'Adelie' 'Gentoo'
 'Adelie' 'Gentoo' 'Adelie' 'Gentoo' 'Gentoo']


# Logistic Regression and SVM

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']
imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [None]:
svc_clf = SVC(kernel="linear", C=5).fit(X_scaled, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2)

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
y_pred = log_reg.predict(X_test)


In [None]:

cm = confusion_matrix( y_test, y_pred )
cm

array([[32,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 24]])

In [None]:
accuracy_score(y_test, y_pred)
recall_score(y_test, y_pred, average='macro')
precision_score(y_test, y_pred, average='macro')
f1_score(y_test, y_pred, average='macro')


1.0

In [None]:
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_svm)

1.0

In [None]:
cm = confusion_matrix( y_test, y_pred_svm )
cm

array([[32,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 24]])

# Softmax Regression and SGD

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']
imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [None]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_scaled, y)

In [None]:
sgd_predictions = sgd_clf.predict(X_scaled)

In [None]:
cm = confusion_matrix( y, sgd_predictions )
cm

array([[148,   2,   2],
       [  0,  68,   0],
       [  0,   0, 124]])

In [None]:
accuracy_score(y, sgd_predictions)
recall_score(y, sgd_predictions, average='macro')
precision_score(y, sgd_predictions, average='macro')
f1_score(y, sgd_predictions, average='macro')

0.9880579710144928

In [None]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=42)
softmax_reg.fit(X_scaled, y)
y_pred = softmax_reg.predict(X_scaled)



In [None]:
accuracy_score(y, y_pred)

0.997093023255814

In [None]:
confusion_matrix(y, y_pred)

array([[152,   0,   0],
       [  0,  68,   0],
       [  1,   0, 123]])