In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [4]:
train_data = pd.read_csv('train_normalized.csv')
test_data = pd.read_csv('test_normalized.csv')

X_train = train_data.drop('satisfaction', axis=1)
y_train = train_data['satisfaction']

X_test = test_data.drop('satisfaction', axis=1)
y_test = test_data['satisfaction']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


Vamos inciar um GradientBoost simples, com valores padrões e a partir deles vamos tentar melhorar a acuracia

In [4]:
# Inicializar o modelo
model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9421773945180166
Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     14573
           1       0.93      0.93      0.93     11403

    accuracy                           0.94     25976
   macro avg       0.94      0.94      0.94     25976
weighted avg       0.94      0.94      0.94     25976



In [5]:
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.01, max_depth=3, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9002155836156452
Test Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.90      0.91     14573
           1       0.88      0.90      0.89     11403

    accuracy                           0.90     25976
   macro avg       0.90      0.90      0.90     25976
weighted avg       0.90      0.90      0.90     25976



o resultado foi muito pior, vamos ver se foi por causa do learningrate menor fazendo ele não ter tempo de aprender

In [6]:
model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9513781952571605
Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     14573
           1       0.95      0.94      0.94     11403

    accuracy                           0.95     25976
   macro avg       0.95      0.95      0.95     25976
weighted avg       0.95      0.95      0.95     25976



Chegamos em um resultado ainda melhor que o primeiro caso, vamos tentar aumentar mais uma vez o n_estimators antes de mexer na learning rate

In [7]:
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9548044348629504
Test Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     14573
           1       0.95      0.94      0.95     11403

    accuracy                           0.95     25976
   macro avg       0.95      0.95      0.95     25976
weighted avg       0.95      0.95      0.95     25976



O resultado ainda aumentou um pouco, vamos agora aumentar o profundidade maxima e diminuir a learning rate, para deixar o modelo mais complexo e ver como ele lida com isso no nosso dataset

In [8]:
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9546889436402833
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     14573
           1       0.95      0.94      0.95     11403

    accuracy                           0.95     25976
   macro avg       0.95      0.95      0.95     25976
weighted avg       0.95      0.95      0.95     25976



A acuracia se manteve basicamente a mesma, vamos tentar o mesmo modelo com learning rate de 0.1

In [10]:
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=4, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9587696335078534
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     14573
           1       0.96      0.95      0.95     11403

    accuracy                           0.96     25976
   macro avg       0.96      0.96      0.96     25976
weighted avg       0.96      0.96      0.96     25976



Conseguimos aumentar ainda mais a acuracia final, podemos perceber que aparentemente o modelo não esta sofrendo overfiting para esses parametros, vamos tentar aumentar mais uma vez a profundidade e ver como ele se comporta

In [11]:
model = GradientBoostingClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9606174930705267
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     14573
           1       0.96      0.95      0.95     11403

    accuracy                           0.96     25976
   macro avg       0.96      0.96      0.96     25976
weighted avg       0.96      0.96      0.96     25976



Conseguimos chegar em uma acuracia de 0.96, vamos tentar aumentar uma ultima vez o n_estimatores 

In [12]:
model = GradientBoostingClassifier(n_estimators=400, learning_rate=0.1, max_depth=5, random_state=42)

# Treinar o modelo
model.fit(X_train, y_train)

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.9608869725900832
Test Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97     14573
           1       0.96      0.95      0.96     11403

    accuracy                           0.96     25976
   macro avg       0.96      0.96      0.96     25976
weighted avg       0.96      0.96      0.96     25976



O resultado não se pagou. Pode ter acontecido overfiting, ja que o número de estimators está muito alto para uma learning rate elevada. Ficamos por enquanto com o melhor modelo sendo: n_estimatores=300, learning_rate=0.1 e max_depth=5, agora vamos fazer um grid search com novos parametros para ver se conseguimos melhores resultados

In [6]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

model = GradientBoostingClassifier(random_state=42)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

print(f"Melhores parâmetros: {grid_search.best_params_}")

best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy}")
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred))


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


KeyboardInterrupt: 