# XGBOOST

## REGRESSÃO

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California housing dataset and separate features from the target.
data = fetch_california_housing()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Alternative configuration kept for reference:
#   xgb.XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)
# Only n_estimators is set here; every other hyperparameter is left at its default.
model = xgb.XGBRegressor(n_estimators=7)
model.fit(X_train, y_train)

# Grab the underlying Booster, which gives access to the fitted trees.
booster = model.get_booster()

In [None]:
# Display the dataset's feature names together with how many there are.
data.feature_names, len(data.feature_names)

In [None]:
# Write the fitted booster to 'model.json' in the current working directory.
booster.save_model('model.json')

In [None]:
# Dump the trees as a list of strings; with_stats=False requests the dump
# WITHOUT the split statistics.
trees = booster.get_dump(with_stats=False)
# Number of entries in the dump (shown as the cell output).
len(trees)

### 1.1 Exibir as árvores

In [None]:
# Print every dumped tree under a "Tree <index>:" header, followed by a
# 50-character divider surrounded by blank lines.
for i, tree in enumerate(trees):
    print(f"Tree {i}:", tree, "\n" + "=" * 50 + "\n", sep="\n")


In [None]:
# Show the dump of the first tree only.
print(trees[0])

In [None]:
# Feature names of the loaded dataset, used to label the dumped splits.
feature_names = data.feature_names

# Dump the trees again, this time with with_stats=True.
trees = booster.get_dump(with_stats=True)

# Helper that swaps the generic f0, f1, ... placeholders for real feature names.
def replace_feature_names(tree, feature_names):
    """Return *tree* with each ``f<index>`` placeholder replaced by its name.

    Placeholders are matched as whole tokens in a single pass, so ``f1`` is
    never substituted inside ``f10``, and text inserted by one substitution
    is never rewritten by a later one.

    Args:
        tree: Tree dump text containing ``f<index>`` placeholders.
        feature_names: Sequence of names; position ``i`` names ``f{i}``.

    Returns:
        The dump text with known placeholders replaced. Placeholders whose
        index has no entry in *feature_names* are left unchanged.
    """
    import re  # local import: the enclosing cell does not import ``re``

    names_by_placeholder = {
        f"f{i}": name for i, name in enumerate(feature_names)
    }

    def _substitute(match):
        token = match.group(0)
        return names_by_placeholder.get(token, token)

    # \b on both sides restricts matches to complete ``f<digits>`` tokens.
    return re.sub(r"\bf\d+\b", _substitute, tree)

# Preview only the first dumped tree, with placeholders replaced by feature names.
for i, tree in enumerate(trees[:1]):
    print(f"Tree {i}:")
    print(
        replace_feature_names(tree, feature_names),
        "\n" + "=" * 50 + "\n",
        sep="\n",
    )


In [None]:
# Display the booster's trees via Booster.trees_to_dataframe() (shown as the cell output).
booster.trees_to_dataframe()

## CLASSIFICAÇÃO

In [None]:
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris dataset and separate features from the class labels.
data = load_iris()
X, y = data.data, data.target

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Alternative configuration kept for reference:
#   xgb.XGBClassifier(n_estimators=7, max_depth=3, learning_rate=0.1,
#                     use_label_encoder=False, eval_metric='mlogloss')
# Only n_estimators is set here; every other hyperparameter is left at its default.
model = xgb.XGBClassifier(n_estimators=7)
model.fit(X_train, y_train)



### 1.1 Exibir plot das trees

In [None]:
# Plot the tree with index 0 of the fitted classifier.
xgb.plot_tree(model, num_trees=0)

In [None]:
# Plot every tree of the fitted classifier.
# The tree count comes from the classifier's own booster. A `trees` list left
# over from an earlier cell may belong to a different model and therefore have
# a different length.
for i, _ in enumerate(model.get_booster().get_dump()):
    xgb.plot_tree(model, num_trees=i)

### 1.2 Pegando dados das Árvores

In [None]:
# Access the fitted trees through the classifier's underlying Booster
booster = model.get_booster()

# Get the trees as text (one string per tree), without split statistics
trees = booster.get_dump(with_stats=False)

In [None]:
# Reference notes on Booster.get_dump, kept as a bare string literal.
"""
    ** get_dump(fmap='', with_stats=False, dump_format='text')
    Returns the model dump as a list of strings. Unlike save_model(), the output format is
    primarily used for visualization or interpretation, hence it’s more human readable but cannot
    be loaded back to XGBoost.

    Parameters:

            fmap (str | PathLike) – Name of the file containing feature map names.

            with_stats (bool) – Controls whether the split statistics are output.

            dump_format (str) – Format of model dump. Can be ‘text’, ‘json’ or ‘dot’.

    Return type:
        List[str]
"""


# Dump the trees in JSON format (without split statistics) and print the first one.
trees = booster.get_dump(with_stats=False, dump_format='json')
print(trees[0])

In [None]:
# Dump the trees with dump_format='text' spelled out, then print the first one.
trees = booster.get_dump(with_stats=False, dump_format='text')
print(trees[0])

In [None]:
# Same dump without passing dump_format (its default is 'text'); print the first tree.
trees = booster.get_dump(with_stats=False)
print(trees[0])

### 1.3 Quantidade de Árvores

In [None]:
# Raw list of dumped trees, followed by its length.
print(trees)
print(len(trees))

print()
# Class labels seen by the fitted classifier, and how many there are.
classes = model.classes_
print(classes, len(classes))

# Feature names of the dataset, and how many there are.
feature_names = data.feature_names
print(feature_names, len(feature_names))

print()
# The dump holds one string per tree, so len(trees) is already the total tree
# count. It is printed directly as an integer: multiplying and then dividing by
# the number of classes cancels out and only turns the count into a float.
print('Quantidade de Árvores: ', len(trees))

### 1.4 Exibindo árvores

In [None]:
# Print every dumped tree (splits shown with feature indices) under a
# "Tree <index>:" header, followed by a 50-character divider.
for i, tree in enumerate(trees):
    print(f"Tree {i}:", tree, "\n" + "=" * 50 + "\n", sep="\n")



In [None]:
# Show the trees with feature names (same approach used in the regression section)
feature_names = data.feature_names  # Feature names of the Iris dataset


def replace_feature_names(tree, feature_names):
    """Return *tree* with each ``f<index>`` placeholder replaced by its name.

    Placeholders are matched as whole tokens in a single pass, so ``f1`` is
    never substituted inside ``f10``, and text inserted by one substitution
    is never rewritten by a later one.

    Args:
        tree: Tree dump text containing ``f<index>`` placeholders.
        feature_names: Sequence of names; position ``i`` names ``f{i}``.

    Returns:
        The dump text with known placeholders replaced. Placeholders whose
        index has no entry in *feature_names* are left unchanged.
    """
    import re  # local import: the enclosing cell does not import ``re``

    names_by_placeholder = {
        f"f{i}": name for i, name in enumerate(feature_names)
    }

    def _substitute(match):
        token = match.group(0)
        return names_by_placeholder.get(token, token)

    # \b on both sides restricts matches to complete ``f<digits>`` tokens.
    return re.sub(r"\bf\d+\b", _substitute, tree)

# Print every tree with placeholders replaced by feature names, each followed
# by a 50-character divider.
for i, tree in enumerate(trees):
    print(f"Tree {i}:")
    print(
        replace_feature_names(tree, feature_names),
        "\n" + "=" * 50 + "\n",
        sep="\n",
    )

### 1.5 My method

In [None]:
import re

# Decimal or scientific-notation number (e.g. 0.43, -2, 1.5e-05).
_NUMBER = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'
# "<node>:leaf=<value>"
_LEAF_RE = re.compile(rf'(\d+):leaf=({_NUMBER})')
# "<node>:[f<feature><<threshold>] yes=<id>,no=<id>,missing=<id>"
_SPLIT_RE = re.compile(
    rf'(\d+):\[f(\d+)<({_NUMBER})\] yes=(\d+),no=(\d+),missing=(\d+)'
)


def parse_tree(tree_str):
    """Parse one text tree dump into a list of node dictionaries.

    Each line of *tree_str* is stripped and matched against one of two shapes:

    * leaf line  ``<node>:leaf=<value>``  ->
      ``{'node': int, 'leaf': float}``
    * split line ``<node>:[f<idx><<thr>] yes=<a>,no=<b>,missing=<c>``  ->
      ``{'node', 'feature', 'threshold', 'yes', 'no', 'missing'}``

    Numeric values may be written in plain decimal or scientific notation
    (``1.5e-05``). Lines that match neither shape are skipped. This includes
    blank lines and split lines whose feature is not in ``f<index>`` form.

    Args:
        tree_str: Text dump of a single tree, one node per line.

    Returns:
        A list of dictionaries, one per recognized line, in input order.
    """
    tree = []

    for raw_line in tree_str.split('\n'):
        # Indentation only encodes depth in the dump; node ids carry the structure.
        line = raw_line.strip()

        if 'leaf' in line:
            leaf = _LEAF_RE.search(line)
            if leaf:
                tree.append({
                    'node': int(leaf.group(1)),
                    'leaf': float(leaf.group(2)),
                })
            continue

        split = _SPLIT_RE.search(line)
        if split:
            tree.append({
                'node': int(split.group(1)),
                'feature': int(split.group(2)),
                'threshold': float(split.group(3)),
                'yes': int(split.group(4)),
                'no': int(split.group(5)),
                'missing': int(split.group(6)),
            })
    return tree

# Example usage: parse one dumped tree and print it next to the raw dump.
i = 1
# NOTE(review): despite its name, tree_0 holds trees[i], i.e. the tree at index 1.
tree_0 = trees[i]
parsed_tree = parse_tree(tree_0)
# NOTE(review): "Fomated" looks like a typo for "Formatted"; the label is left as-is.
print("Fomated ", parsed_tree)
print("Orig ", trees[i])


# SKLEARN

In [None]:
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Generate the make_hastie_10_2 dataset with a fixed seed.
X, y = make_hastie_10_2(random_state=0)

# The first 2000 samples train the model; the remainder is held out for scoring.
X_train, y_train = X[:2000], y[:2000]
X_test, y_test = X[2000:], y[2000:]

# Gradient boosting with 100 depth-1 trees and a learning rate of 1.0.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf.fit(X_train, y_train)

# Score on the held-out samples (shown as the cell output).
clf.score(X_test, y_test)

### 1.1 Mostrando estimators_

In [None]:
# Print the fitted estimators held by the gradient-boosting classifier.
print(clf.estimators_)

In [None]:
from sklearn.tree import export_text

# For each entry of clf.estimators_, print the textual rules of its first tree.
for i, tree in enumerate(clf.estimators_):
    print(f"Tree {i}")
    print(export_text(tree[0]))
