# In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Load the raw penguin measurements.
data = pd.read_csv("penguins_lter.csv")

# Drop the columns that are not used as model inputs.
data = data.drop(
    columns=[
        "Delta 15 N (o/oo)",
        "Body Mass (g)",
        "Island",
        "Sex",
        "Comments",
        "studyName",
        "Sample Number",
        "Individual ID",
        "Region",
        "Clutch Completion",
        "Stage",
        "Date Egg",
    ]
)

# Impute missing measurements with the column median.
# Coerce to numeric FIRST so that unparseable entries become NaN and are
# covered by the fill, and assign the result back: fillna() returns a new
# Series, so calling it without assignment leaves the NaNs in place.
num_cols = ["Culmen Length (mm)", "Culmen Depth (mm)", "Flipper Length (mm)", "Delta 13 C (o/oo)"]
for col in num_cols:
    data[col] = pd.to_numeric(data[col], errors="coerce")
    data[col] = data[col].fillna(data[col].median())

# Encode the target label: species names become integer class ids, and the
# fitted encoder keeps the id -> name mapping in le.classes_.
le = LabelEncoder()
data["Species"] = le.fit_transform(data["Species"])

# Separate the target (y) from the features (X).
y = data["Species"]
X = data.drop("Species", axis=1)

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=66,
)

# Create and train the model.
# The seed is fixed (same value as the train/test split) because the tree
# builder permutes features at every split; without it, ties between equally
# good splits can be broken differently from run to run, so the fitted tree,
# its metrics and the saved model would not be reproducible.
model = DecisionTreeClassifier(random_state=66)
model.fit(X_train, y_train)

# Get feature importances (one value per column of X, in column order).
feature_importances = model.feature_importances_

for feature, importance in zip(X.columns, feature_importances):
    print(f"Feature: {feature}, Importance: {importance}")


# Predict the class of every row in the held-out test split.
prediction = model.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print("Accuracy:", accuracy)

# Per-class report and confusion matrix; sep="\n" puts each table on the
# line right after its heading.
print("\nClassification Report:", classification_report(y_test, prediction), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, prediction), sep="\n")

# 5-fold cross-validation over the full feature matrix and target.
cv_scores = cross_val_score(model, X, y, cv=5)
print("Cross Validation Scores:", cv_scores)
print(f"Average CV Score: {cv_scores.mean()}")

# Visualizing the Decision Tree
# feature_names / class_names are passed as plain Python lists: some
# scikit-learn releases validate these parameters as `list` and reject a
# pandas Index or a NumPy array with InvalidParameterError.
plt.figure(figsize=(20, 15))
plot_tree(
    model,
    filled=True,
    feature_names=X.columns.tolist(),
    class_names=le.classes_.tolist(),
    fontsize=12,
)
plt.show()

# Save the Model
joblib.dump(model, "penguins_decision_tree.joblib")

# Show the encoded class ids present in the data next to the original labels
# held by the encoder.
print(data["Species"].unique())
print(le.classes_)