In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import sqlite3

## Penguins database

In [15]:
penguins = sns.load_dataset("penguins").dropna()
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


In [16]:
# Create a connection to SQLite database
conn = sqlite3.connect("penguins.db")
cursor = conn.cursor()

# Create tables
cursor.execute('''
    CREATE TABLE IF NOT EXISTS island (
        island_id INTEGER PRIMARY KEY AUTOINCREMENT,
        island_name TEXT UNIQUE
    )
''')

cursor.execute('''
    CREATE TABLE IF NOT EXISTS penguins (
        penguin_id INTEGER PRIMARY KEY AUTOINCREMENT,
        species TEXT,
        island_id INTEGER,
        bill_length_mm REAL,
        bill_depth_mm REAL,
        flipper_length_mm INTEGER,
        body_mass_g INTEGER,
        sex TEXT,
        FOREIGN KEY (island_id) REFERENCES island (island_id)
    )
''')

<sqlite3.Cursor at 0x18503a4f4c0>

In [17]:
# Insert islands to db
islands = penguins[['island']].drop_duplicates().reset_index(drop=True)
islands.to_sql("island", conn, if_exists="replace", index=False)

islands['island_id'] = islands.index + 1
# Merge island_id into penguins dataset
penguins = penguins.merge(islands, left_on='island', right_on='island')
penguins = penguins.drop(columns=['island'])

# Insert data into penguins table
penguins.to_sql("penguins", conn, if_exists="replace", index=False)

# Commit and close connection
conn.commit()
conn.close()

## Features selection

#### Read data from db


In [47]:
conn = sqlite3.connect("penguins.db")
penguins_df = pd.read_sql("SELECT * FROM penguins", conn)
islands_df = pd.read_sql("SELECT * FROM island", conn)
df = penguins_df.merge(islands_df, left_on='island_id', right_on=islands_df.index)
df = df.drop(columns=['island_id'])

In [21]:
df

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,island
0,Adelie,39.1,18.7,181.0,3750.0,Male,Biscoe
1,Adelie,39.5,17.4,186.0,3800.0,Female,Biscoe
2,Adelie,40.3,18.0,195.0,3250.0,Female,Biscoe
3,Adelie,36.7,19.3,193.0,3450.0,Female,Biscoe
4,Adelie,39.3,20.6,190.0,3650.0,Male,Biscoe
...,...,...,...,...,...,...,...
205,Gentoo,47.2,13.7,214.0,4925.0,Female,Dream
206,Gentoo,46.8,14.3,215.0,4850.0,Female,Dream
207,Gentoo,50.4,15.7,222.0,5750.0,Male,Dream
208,Gentoo,45.2,14.8,212.0,5200.0,Female,Dream


#### Correlation among numeric features

In [24]:
## correlation matrix
numeric_df = df.select_dtypes(include=[np.number])
numeric_df.corr()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
bill_length_mm,1.0,-0.533375,0.873929,0.871643
bill_depth_mm,-0.533375,1.0,-0.625883,-0.475707
flipper_length_mm,0.873929,-0.625883,1.0,0.889811
body_mass_g,0.871643,-0.475707,0.889811,1.0


**Flipper Length, Bill Length, Body Mass (Highly correlated):**

- Bill Length - Flipper Length (0.874)
- Bill Length - Body Mass (0.872)
- Flipper Length - Body Mass (0.890)

-> Only keep one to reduce complexity, choose Flipper Length

In [25]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(df["island"], df["species"])

# Perform Chi-square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print results
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")


Chi-square statistic: 76.2384
P-value: 0.0000


**Low p-value**
Island is significantly related to species --> Keep island as a feature

In [48]:
df = df.drop(columns=['bill_length_mm', 'body_mass_g', 'sex'])

In [49]:
df.head(2)

Unnamed: 0,species,bill_depth_mm,flipper_length_mm,island
0,Adelie,18.7,181.0,Biscoe
1,Adelie,17.4,186.0,Biscoe


## Model Training

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

X = df.drop('species', axis=1)
y = df['species']

# Define which columns are numerical and categorical
num_feats = ['bill_depth_mm', 'flipper_length_mm']
cat_feats = ['island']

# Scaler
scaler = StandardScaler()
X_num = X[num_feats].copy()
X_num_scaled = scaler.fit_transform(X_num)

# # OneHotEncoder
# encoder = OneHotEncoder(handle_unknown='ignore')
# X_cat = X[cat_feats].copy()
# X_cat_encoded = encoder.fit_transform(X_cat).toarray()

# # Combine the preprocessed data
# X_processed = np.hstack((X_num_scaled, X_cat_encoded))


# Split
X_train, X_test, y_train, y_test = train_test_split(X_num_scaled, y, test_size=0.25, random_state=42)

# Create and train the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))



Model Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        19
      Gentoo       1.00      1.00      1.00        34

    accuracy                           1.00        53
   macro avg       1.00      1.00      1.00        53
weighted avg       1.00      1.00      1.00        53



In [51]:
# Save the model, encoder, and scaler
joblib.dump(clf, 'penguin_classifier.joblib')
joblib.dump(scaler, 'penguin_scaler.joblib')
# joblib.dump(encoder, 'penguin_encoder.joblib')

['penguin_scaler.joblib']