# Gradient Boosting Classifier
Creates an ensemble of weak prediction models (decision trees).

In [11]:
from pathlib import Path

# saving models
import pickle

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [12]:
# Read the cleaned IPO dataset; the "Underpriced" column supplies the labels.
ipos = pd.read_csv('../data/clean_bloomberg_with_sectors.csv')
ipo_labels = ipos["Underpriced"].tolist()

# Candidate features are the numeric/boolean columns, selected through the
# public select_dtypes API rather than the private DataFrame._get_numeric_data().
# The first and last of those columns are skipped by the [1:-1] slice.
ipo_features = ipos.select_dtypes(include=["number", "bool"]).columns.tolist()[1:-1]

# 'Offer To 1st Close' is explicitly excluded from the features
# (presumably because it reveals the outcome being predicted -- TODO confirm).
ipo_features.remove('Offer To 1st Close')

## Split data into Training and Testing sets
The data is split into 70% training data and 30% test data.

In [13]:
# Drop the last remaining feature column before building the feature matrix.
# NOTE(review): this reassignment is not idempotent -- re-running this cell
# without re-running the "Load Data" cell removes one more column each time.
ipo_features = ipo_features[:-1]
ipo_features_data = ipos[ipo_features]

# 70% / 30% split. A fixed random_state makes the split (and therefore every
# metric computed below) reproducible across "Restart & Run All".
train_data, test_data, train_labels, test_labels = train_test_split(
    ipo_features_data,
    ipo_labels,
    test_size=0.3,
    random_state=42,
)

## Define Hyperparameters
- `n_estimators:` How many decision trees should be created.
- `learning_rate:` Step size at each iteration of training.
- `loss:` Define a type of loss function.

In [14]:
# Hyperparameters, unpacked into the classifier constructor as keyword arguments.
params = dict(
    n_estimators=500,    # how many trees to create
    learning_rate=0.01,  # step size at each training iteration
    loss="log_loss",     # type of loss function
)

## Train model

In [15]:
# Build the classifier from the hyperparameters above and fit it on the
# training split; fit() returns the estimator itself, so the call can be chained.
reg = GradientBoostingClassifier(**params).fit(train_data, train_labels)
# Show the fitted estimator as the cell output.
reg

In [49]:
# save the model
# Make sure the target directory exists first, so the dump does not fail with
# FileNotFoundError when 'saved_models/' has not been created yet.
model_path = Path('saved_models/gradient_boosting_classifier.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump(reg, file)

In [16]:
# Predict a label for every row of the held-out test split.
ipo_labels_pred_gb = reg.predict(X=test_data)

### Accuracy

In [17]:
# Fraction of test samples whose predicted label equals the true label.
# The classifier's predict() already returns class labels, so the predictions
# are compared directly (no rounding step is needed, unlike regression output).
accuracy = metrics.accuracy_score(test_labels, ipo_labels_pred_gb)
print("Accuracy:", accuracy)

Accuracy: 0.7294117647058823
