In [27]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='data/penguins_size.csv')

In [15]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows 0..n-1 so later index-based joins line up.
data = data.dropna(axis='index', how='any', ignore_index=True)

In [16]:
# Split the table into predictors and label: work on a copy so `data` stays
# intact, and pop the 'species' column out of it to become the target Series.
features = data.copy()
target = features.pop('species')

In [None]:
# Columns that are one-hot encoded before modelling.
categorical_columns = [
    'island',
    'sex',
]
# Numeric measurement columns (not referenced by the other visible cells).
numerical_columns = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

# Display the cleaned table for a quick visual check.
data

In [None]:
# Show the column labels of the cleaned table, to check them against the
# categorical/numerical column lists defined above.
data.columns

In [None]:
# One-hot encode the categorical columns and rebuild the model's feature matrix.
#
# Everything here is derived from `data` rather than from the previous value of
# `features`, so the cell can be re-run safely. The earlier version read the
# categorical columns from `features` and then dropped them from it, which made
# a second execution fail with a KeyError.
encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(data[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_array,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=data.index,  # same row labels as `data`, so the join below is one-to-one
)

# Predictors = every column except the label and the raw categoricals,
# plus the freshly encoded indicator columns.
features = data.drop(columns=['species', *categorical_columns]).join(encoded_df)

In [None]:
# Display the feature matrix after one-hot encoding for a visual check.
features

In [None]:
# Fit a logistic-regression classifier on the encoded feature matrix.
#
# max_iter is raised well above scikit-learn's default of 100: the features are
# passed in without any standardisation, and the default lbfgs solver can hit
# the iteration cap (ConvergenceWarning) before reaching its optimum on
# unscaled inputs. NOTE(review): scaling the numeric columns would be the more
# thorough remedy — confirm which is preferred.
lr = LogisticRegression(max_iter=10_000)
lr.fit(features, target)

In [None]:
# Predict a label for every row of `features`. This is the same matrix the
# model was fitted on, so the scores computed from these predictions are
# in-sample (training) scores.
predictions = lr.predict(X=features)

In [None]:
# Fraction of rows whose predicted label equals the true label.
accuracy_score(y_true=target, y_pred=predictions)

In [None]:
# Macro-averaged precision: precision is computed per class and the per-class
# values are averaged with equal weight.
# average='micro' is avoided on purpose: for single-label predictions it is
# always identical to accuracy, so it would only repeat the accuracy_score cell.
precision_score(target, predictions, average='macro')

In [None]:
# Macro-averaged recall: recall is computed per class and the per-class values
# are averaged with equal weight.
# average='micro' is avoided on purpose: for single-label predictions it is
# always identical to accuracy, so it would only repeat the accuracy_score cell.
recall_score(target, predictions, average='macro')

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
# `labels` pins the row/column order to lr.classes_, the exact sequence used
# for the tick labels below, so the annotations cannot drift out of step with
# the matrix.
conf_matrix = confusion_matrix(target, predictions, labels=lr.classes_)

# Plot the confusion matrix on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",  # cells hold integer counts
    cmap="Blues",
    xticklabels=lr.classes_,
    yticklabels=lr.classes_,
    ax=ax,
)
ax.set(xlabel="Predicted Label", ylabel="True Label", title="Confusion Matrix")
plt.show()