In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.special import expit
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split, cross_validate
from perpetual import PerpetualBooster

In [2]:
pd.set_option('display.max_rows', 1000)

In [3]:
!python --version

Python 3.9.20


In [4]:
from importlib.metadata import version

print(f"numpy: {version('numpy')}")
print(f"optuna: {version('optuna')}")
print(f"lightgbm: {version('lightgbm')}")
print(f"scikit-learn: {version('scikit-learn')}")
print(f"perpetual: {version('perpetual')}")

numpy: 1.26.4
optuna: 4.0.0
lightgbm: 4.5.0
scikit-learn: 1.3.2
perpetual: 0.4.9


In [5]:
df = sns.load_dataset("titanic")

In [6]:
df.drop(columns=["alive"], inplace=True)

In [7]:
X = df.drop(columns=["survived"])
y = df["survived"]

In [8]:
X.shape

(891, 13)

In [9]:
X.dtypes

pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alone              bool
dtype: object

In [10]:
X.nunique()

pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alone            2
dtype: int64

In [11]:
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,False
1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,False
2,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,True
3,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,False
4,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,True


In [12]:
X["sex"] = pd.get_dummies(X["sex"], drop_first=True, dtype=float).to_numpy()
X["adult_male"] = pd.get_dummies(X["adult_male"], drop_first=True, dtype=float).to_numpy()
# X["alive"] = pd.get_dummies(X["alive"], drop_first=True, dtype=float).to_numpy()
X["alone"] = pd.get_dummies(X["alone"], drop_first=True, dtype=float).to_numpy()
cols = ['pclass', 'sibsp', 'parch', 'embarked', 'class', 'who', 'deck', 'embark_town', 'age', 'fare']
X[cols] = X[cols].astype('category')
X.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,3,1.0,22.0,1,0,7.25,S,Third,man,1.0,,Southampton,0.0
1,1,0.0,38.0,1,0,71.2833,C,First,woman,0.0,C,Cherbourg,0.0
2,3,0.0,26.0,0,0,7.925,S,Third,woman,0.0,,Southampton,1.0
3,1,0.0,35.0,1,0,53.1,S,First,woman,0.0,C,Southampton,0.0
4,3,1.0,35.0,0,0,8.05,S,Third,man,1.0,,Southampton,1.0


In [13]:
seed = 42
n_estimators = 100
n_trials = 1

In [14]:
scoring = "neg_log_loss"
metric_function = log_loss
metric_name = "log_loss"
objective_type = "LogLoss"

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

print(f"X_train.shape: {X_train.shape}")
print(f"X_test.shape: {X_test.shape}")

X_train.shape: (712, 13)
X_test.shape: (179, 13)


In [16]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
331,1,1.0,45.5,0,0,28.5,S,First,man,1.0,C,Southampton,1.0
733,2,1.0,23.0,0,0,13.0,S,Second,man,1.0,,Southampton,1.0
382,3,1.0,32.0,0,0,7.925,S,Third,man,1.0,,Southampton,1.0
704,3,1.0,26.0,1,0,7.8542,S,Third,man,1.0,,Southampton,0.0
813,3,0.0,6.0,4,2,31.275,S,Third,child,0.0,,Southampton,0.0


In [17]:
set(X_train["who"])

{'child', 'man', 'woman'}

In [18]:
model_lgbm = LGBMClassifier(objective="binary")
model_lgbm.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000267 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 172
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838


In [19]:
model = PerpetualBooster(objective="LogLoss")
model.fit(X_train, y_train, budget=0.1)

Categorical features: [0, 2, 3, 4, 5, 6, 7, 8, 10, 11]
Mapping of categories: {'pclass': ['nan', '1', '2', '3'], 'age': ['nan', '0.42', '0.67', '0.75', '0.83', '0.92', '1.0', '10.0', '11.0', '12.0', '13.0', '14.0', '14.5', '15.0', '16.0', '17.0', '18.0', '19.0', '2.0', '20.0', '21.0', '22.0', '23.0', '24.0', '24.5', '25.0', '26.0', '27.0', '28.0', '28.5', '29.0', '3.0', '30.0', '31.0', '32.0', '32.5', '33.0', '34.0', '34.5', '35.0', '36.0', '36.5', '37.0', '38.0', '39.0', '4.0', '40.0', '40.5', '41.0', '42.0', '43.0', '44.0', '45.0', '45.5', '46.0', '47.0', '48.0', '49.0', '5.0', '50.0', '51.0', '52.0', '53.0', '54.0', '55.0', '55.5', '56.0', '57.0', '58.0', '59.0', '6.0', '60.0', '61.0', '62.0', '63.0', '64.0', '65.0', '7.0', '70.0', '70.5', '74.0', '8.0', '80.0', '9.0'], 'sibsp': ['nan', '0', '1', '2', '3', '4', '5', '8'], 'parch': ['nan', '0', '1', '2', '3', '4', '5', '6'], 'fare': ['nan', '0.0', '10.1708', '10.4625', '10.5', '10.5167', '106.425', '108.9', '11.1333', '11.2417', '11.

<perpetual.booster.PerpetualBooster at 0x27ff75747c0>

In [20]:
y_pred = np.round(expit(model.predict(X_test)))
print(accuracy_score(y_test, y_pred))

0.7150837988826816


In [21]:
y_pred = np.round(expit(model.predict(X_train)))
print(accuracy_score(y_train, y_pred))

0.9087078651685393


In [22]:
if metric_name == "log_loss":
    y_pred = expit(model.predict(X_test))
else:
    y_pred = np.round(expit(model.predict(X_test)))
print(f"Test {metric_name}: {round(metric_function(y_test, y_pred), 6)}")

Test log_loss: 0.687281


In [23]:
df_trees = model.trees_to_dataframe()

In [24]:
df_trees.head(10)

Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Left_Cats,Right_Cats
i64,i64,str,str,f64,str,str,str,f64,f64,list[i64],list[i64]
0,0,"""0-0""","""fare""",177.0,"""0-1""","""0-2""","""0-2""",236.18161,167.1236,"[156, 21, … 162]","[53, 103, … 205]"
0,1,"""0-1""","""Leaf""",,,,,-0.742395,103.04388,[],[]
0,2,"""0-2""","""Leaf""",,,,,1.186879,64.079704,[],[]
1,0,"""1-0""","""who""",1.0,"""1-1""","""1-2""","""1-2""",136.94548,136.9996,[2],"[1, 3]"
1,1,"""1-1""","""Leaf""",,,,,-0.729689,80.21069,[],[]
1,2,"""1-2""","""Leaf""",,,,,0.870098,56.788895,[],[]
2,0,"""2-0""","""fare""",173.0,"""2-1""","""2-2""","""2-2""",107.65649,112.9132,"[160, 197, … 45]","[173, 202, … 13]"
2,1,"""2-1""","""Leaf""",,,,,-0.718048,64.415764,[],[]
2,2,"""2-2""","""Leaf""",,,,,0.8346264,48.49744,[],[]
3,0,"""3-0""","""fare""",102.0,"""3-1""","""3-2""","""3-2""",31.42477,104.092964,"[125, 2, … 54]","[191, 131, … 157]"


In [26]:
model_lgbm.booster_.trees_to_dataframe().head(10)

Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S2,0-S1,,who,219.733002,1.0,==,right,,-0.504838,0.0,712
1,0,2,0-S2,0-S5,0-S6,0-S0,pclass,14.1855,0.0,==,right,,-0.597151,101.401,432
2,0,3,0-S5,0-L0,0-S11,0-S2,embarked,1.28327,0.0,==,right,,-0.524748,21.3599,91
3,0,4,0-L0,,,0-S5,,,,,,,-0.488909,6.807,29
4,0,4,0-S11,0-L6,0-L12,0-S5,alone,0.044977,0.0,<=,left,,-0.541511,14.5529,62
5,0,5,0-L6,,,0-S11,,,,,,,-0.549007,5.163931,22
6,0,5,0-L12,,,0-S11,,,,,,,-0.537388,9.388966,40
7,0,3,0-S6,0-L3,0-S8,0-S2,embarked,0.795522,0.0,==,right,,-0.616473,80.0409,341
8,0,4,0-L3,,,0-S6,,,,,,,-0.588731,9.154242,39
9,0,4,0-S8,0-L7,0-S9,0-S6,alone,0.531171,0.0,<=,left,,-0.620055,70.8867,302
