#### Change label to binary, Prevent Data leakage

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split


df = pd.read_csv("/Users/iseunghyeon/Desktop/4-1/MLops/aac_intakes_outcomes.csv")
df.head()

# Create binary classification target: Adopted (1), Others (0)
df['adopted'] = df['outcome_type'].apply(lambda x: 1 if x == 'Adoption' else 0)


# prevent Data leakage and remove unnecessary ID
drop_cols = ['outcome_type', 'outcome_subtype', 'animal_id_outcome', 'animal_id_intake',
             'outcome_datetime', 'outcome_monthyear', 'time_in_shelter']  

df = df.drop(columns=[col for col in drop_cols if col in df.columns])

# Separate features and labels
X = df.drop(columns='adopted')
y = df['adopted']

# 학습/테스트 데이터 분리 (예: 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


#### Check class ratio

In [36]:
# 클래스별 비율 확인
class_ratios = y.value_counts(normalize=True) * 100
print("\n📊 클래스별 비율 (%):\n", class_ratios.round(2))

import matplotlib.pyplot as plt

# 막대 그래프로 시각화
class_counts.plot(kind='bar', color='skyblue')
plt.title("Outcome Type 분포")
plt.xlabel("Outcome Type")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


📊 클래스별 비율 (%):
 adopted
0    57.83
1    42.17
Name: proportion, dtype: float64




#### AutoML : MLJAR
- decision tree
- Random Forest
- Xgboost
- Catboost

In [40]:
from supervised.automl import AutoML

automl = AutoML(
    mode="Compete",                  
    eval_metric="f1",               
    total_time_limit=1000,           
    results_path="MLJAR_AAC",       
    algorithms=[                  
        "Decision Tree",
        "Random Forest",
        "Xgboost",
        "CatBoost"
    ]
)

automl.fit(X_train, y_train)

AutoML directory: MLJAR_AAC
The task is binary_classification with evaluation metric f1
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree f1 0.808078 trained in 4.08 seconds
Disable stacking for split validation
* Step simple_algorithms will try to check up to 2 models
2_DecisionTree f1 0.810104 trained in 4.29 seconds
3_DecisionTree f1 0.810104 trained in 4.23 seconds
* Step default_algorithms will try to check up to 3 models
4_Default_Xgboost f1 0.869078 trained in 8.81 seconds
5_Default_CatBoost f1 0.878013 trained in 12.73 s



7_Xgboost_categorical_mix f1 0.869282 trained in 8.35 seconds
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: time_in_shelter_days_multiply_outcome_hour
Add Golden Feature: time_in_shelter_days_sum_outcome_hour
Add Golden Feature: time_in_shelter_days_ratio_outcome_hour
Add Golden Feature: outcome_hour_ratio_time_in_shelter_days
Add Golden Feature: time_in_shelter_days_multiply_intake_number
Add Golden Feature: time_in_shelter_days_multiply_outcome_number
Add Golden Feature: outcome_number_ratio_time_in_shelter_days
Add Golden Feature: intake_number_ratio_time_in_shelter_days
Add Golden Feature: time_in_shelter_days_ratio_intake_number
Add Golden Feature: time_in_shelter_days_ratio_outcome_number
Created 10 Golden Features in 3.79 seconds.
18_CatBoost_GoldenFeatures f1 0.877007 trained in 17.02 seconds
23_CatBoost_GoldenFeatures f1 0.877792 trained in 17.76 seconds
19_CatBoost_GoldenFeatures f1 0.878217 trained in 11.42 seconds
* Step kmeans_features



18_CatBoost_KMeansFeatures f1 0.876343 trained in 14.0 seconds




23_CatBoost_KMeansFeatures f1 0.880624 trained in 17.18 seconds




19_CatBoost_KMeansFeatures f1 0.880494 trained in 20.43 seconds
* Step insert_random_feature will try to check up to 1 model




log_loss_eps() got an unexpected keyword argument 'response_method'
Problem during computing permutation importance. Skipping ...
23_CatBoost_KMeansFeatures_RandomFeature f1 0.879201 trained in 13.75 seconds
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 15 models




34_CatBoost f1 0.878518 trained in 23.01 seconds




35_CatBoost f1 0.880087 trained in 12.97 seconds




36_CatBoost f1 0.877499 trained in 16.19 seconds
37_CatBoost f1 0.874749 trained in 20.42 seconds
38_CatBoost f1 0.875976 trained in 9.03 seconds
39_Xgboost f1 0.871665 trained in 7.42 seconds
40_Xgboost f1 0.870337 trained in 10.28 seconds
41_Xgboost f1 0.870118 trained in 10.93 seconds
42_Xgboost f1 0.872101 trained in 7.49 seconds
43_RandomForest f1 0.848625 trained in 18.54 seconds
44_RandomForest f1 0.846617 trained in 35.17 seconds
45_RandomForest f1 0.84058 trained in 16.26 seconds
46_DecisionTree f1 0.81103 trained in 4.54 seconds
47_DecisionTree f1 0.81103 trained in 4.45 seconds
48_DecisionTree f1 0.808139 trained in 4.36 seconds
* Step hill_climbing_2 will try to check up to 11 models




49_CatBoost f1 0.879811 trained in 12.73 seconds




50_CatBoost f1 0.879217 trained in 14.65 seconds




51_CatBoost f1 0.877685 trained in 15.75 seconds
52_CatBoost f1 0.876892 trained in 15.46 seconds
53_Xgboost f1 0.871841 trained in 9.74 seconds
54_Xgboost f1 0.871925 trained in 8.23 seconds
55_Xgboost f1 0.871795 trained in 12.23 seconds
56_Xgboost f1 0.870997 trained in 7.15 seconds
57_RandomForest f1 0.836995 trained in 30.22 seconds
58_RandomForest f1 0.84544 trained in 19.21 seconds
59_RandomForest f1 0.840677 trained in 39.42 seconds
* Step ensemble will try to check up to 1 model
Ensemble f1 0.884176 trained in 6.26 seconds
AutoML fit time: 1033.58 seconds
AutoML best model: Ensemble


In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 예측
preds_label = automl.predict(X_test)

# 평가 지표 계산
acc = accuracy_score(y_test, preds_label)
precision = precision_score(y_test, preds_label)
recall = recall_score(y_test, preds_label)
f1 = f1_score(y_test, preds_label)

print(f"Accuracy : {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

Accuracy : 0.8850
Precision: 0.8401
Recall   : 0.8983
F1-score : 0.8682




In [46]:
!pip freeze > requirements.txt