# Simple classification

In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Palmer Penguins measurements, read directly from the palmerpenguins GitHub
# repository, so running this cell needs network access.
data_url =  "https://raw.githubusercontent.com/mcnakhaee/palmerpenguins/refs/heads/master/palmerpenguins/data/penguins.csv"
# The CSV's first column has no header, so pandas names it "Unnamed: 0"
# (visible in the displayed table).
penguins = pd.read_csv(data_url)
# Show the loaded frame; note that some rows (e.g. row 3) have missing measurements.
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


## Visually inspecting the data 
Which attributes could we use to predict the penguin species? Suppose our task is to separate the species by drawing a straight line through a scatter plot of two attributes.

In [27]:
def examine_penguins(penguins):
  """Draw a seaborn pair grid of the penguin measurements, coloured by species.

  The four measurement columns plus ``species`` are selected from
  ``penguins``; histograms go on the diagonal and scatter plots on the
  off-diagonal panels. Returns the ``sns.PairGrid`` so the caller can
  display or tweak it.
  """
  measurement_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
  # `species` is kept last: it is the hue variable of the grid.
  selected_cols = measurement_cols + ["species"]

  grid = sns.PairGrid(penguins[selected_cols], hue="species")
  grid.map_diag(sns.histplot)        # per-attribute distributions
  grid.map_offdiag(sns.scatterplot)  # pairwise attribute relationships
  grid.add_legend()
  return grid

In [42]:

# Don't touch this cell!
def plotSVMResults(X, y, model, step=0.01, max_grid_points=5_000_000):
  """Plot a fitted two-feature classifier's decision regions with the samples on top.

  Parameters
  ----------
  X : array-like of shape (n_samples, 2)
      The two features to plot; column 0 goes on the x-axis, column 1 on the y-axis.
  y : array-like of shape (n_samples,)
      Class label of every sample (strings or numbers).
  model : object with a ``predict`` method
      Called once on an (n_points, 2) array of grid coordinates.
  step : float, default 0.01
      Requested spacing of the prediction grid, in feature units.
  max_grid_points : int, default 5_000_000
      Upper bound on the number of grid points. If ``step`` would produce more,
      the spacing is enlarged just enough to stay near this bound.

  Returns
  -------
  (ax, fig) : the matplotlib Axes and Figure that were created.

  Raises
  ------
  ValueError
      If ``step`` is not positive.

  Notes
  -----
  Labels are converted to integer codes in sorted order (``np.unique``), so
  the colour of each class is the same on every run. Grid cells for which the
  model predicts a label that never occurs in ``y`` are left uncoloured.
  """
  if step <= 0:
    raise ValueError("step must be positive")

  X = np.asarray(X)
  y_arr = np.asarray(y).ravel()

  # Step 1: Create a mesh grid for plotting (one unit of margin on every side)
  x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

  # A fixed step on wide-ranging features (e.g. body mass in grams) would
  # create billions of grid points, so coarsen the grid when it gets too big.
  n_cells = ((x_max - x_min) / step) * ((y_max - y_min) / step)
  if n_cells > max_grid_points:
    step = step * np.sqrt(n_cells / max_grid_points)

  xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                       np.arange(y_min, y_max, step))

  # Step 2: Predict the class labels for the grid points
  Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)

  # Step 3: Change nominal labels to integer codes 0..n_classes-1.
  # np.unique sorts the labels (unlike iterating a set, whose order for
  # strings changes between Python processes) and return_inverse gives the
  # code of every sample directly.
  classes, y_num = np.unique(y_arr, return_inverse=True)
  n_classes = len(classes)

  # Codes are written into a fresh integer array, so a code can never be
  # mistaken for a not-yet-processed label and the label dtype does not matter.
  Z_num = np.full(Z.shape, -1, dtype=int)
  for code, label in enumerate(classes):
    Z_num[Z == label] = code

  fig, ax = plt.subplots()

  # Step 4: Plot the decision regions and the data points.
  # One contour band per class, centred on its code; the scatter uses the same
  # colour limits so every point has the colour of its own class region.
  levels = np.arange(n_classes + 1) - 0.5
  ax.contourf(xx, yy, Z_num, levels=levels, alpha=0.8)
  ax.scatter(X[:, 0], X[:, 1], c=y_num, vmin=-0.5, vmax=n_classes - 0.5,
             edgecolors='k', marker='o')
  ax.set_xlabel('Feature 1')
  ax.set_ylabel('Feature 2')
  ax.set_title('SVM Decision Boundary with LinearSVC')

  return (ax, fig)
  