In [1]:
!pip install lightgbm



In [2]:
import lightgbm as lgb

OSError: dlopen(/opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /opt/anaconda3/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [None]:
!pip install valohai-utils

In [None]:
print('Hello Valohai Notebooks!')

import valohai
import pandas as pd

# Define the inputs available for this step and their default locations.
# A default location can be overridden when you create a new execution (UI, API or CLI).
default_inputs = {
    'myinput': 's3://vmip-evaluation-boston-housing/train.csv'
}

# Register this notebook as the step "Preprocessing" with the inputs above.
# NOTE(review): valohai-utils appears to read this call statically when it
# generates valohai.yaml -- confirm before restructuring the call or the dict.
valohai.prepare(step="Preprocessing", default_inputs=default_inputs)
    
# Load the CSV from the path that Valohai resolved for the 'myinput' input.
train_dataset = pd.read_csv(valohai.inputs("myinput").path())

In [3]:
# %%capture
# %matplotlib inline

In [4]:
#Data exploration

import matplotlib.pyplot as plt

# Draw every MEDV-vs-feature scatter on ONE figure. Previously each
# `train_dataset.plot(...)` call opened its own figure, so `plt.savefig`
# only wrote the last one (ZN) to myplot.png.
# The earlier `plt.plot(CRIM, MEDV)` line plot is dropped: it joined the
# unsorted rows with line segments and repeated the CRIM scatter below.
features = ["CRIM", "RM", "LSTAT", "ZN"]
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
for ax, feature in zip(axes.flat, features):
    train_dataset.plot(kind="scatter", x=feature, y="MEDV", ax=ax,
                       title=f"MEDV vs {feature}")
fig.tight_layout()

# Save the combined figure as an execution output.
save_plot_path = valohai.outputs().path('myplot.png')
fig.savefig(save_plot_path)

plt.show()
plt.close(fig)

NameError: name 'train_dataset' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
df_train, df_test = train_test_split(
    train_dataset,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Separate the target column (MEDV) from the feature columns for both splits.
X_train, y_train = df_train.drop(columns=["MEDV"]), df_train["MEDV"]
X_test, y_test = df_test.drop(columns=["MEDV"]), df_test["MEDV"]

In [None]:
import lightgbm as lgb

#Prepare model
# Default hyper-parameters for the "Train" step; the values actually used are
# read back through valohai.parameters(...) below.
default_parameters = {
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 20
}

valohai.prepare(step="Train", default_parameters=default_parameters)

gbm = lgb.LGBMRegressor(num_leaves=valohai.parameters('num_leaves').value,
                        learning_rate=valohai.parameters('learning_rate').value,
                        n_estimators=valohai.parameters('n_estimators').value)

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() was removed in LightGBM 4.0 and raises TypeError there, while the
# callback form also works on older releases.
# NOTE(review): the hold-out split doubles as the early-stopping validation
# set, so metrics reported on it later are optimistic.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
from sklearn.metrics import mean_squared_error

print('Starting predicting...')
# Predict on the hold-out features with the model's recorded best iteration.
best_iter = gbm.best_iteration_
y_pred = gbm.predict(X_test, num_iteration=best_iter)
# RMSE is the square root of the mean squared error.
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = mse_test ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

In [None]:
# Show the fitted model's per-feature importances as a plain list.
importances = list(gbm.feature_importances_)
print(f'Feature importances: {importances}')

In [None]:
import numpy as np
# Custom evaluation metric, in the shape this notebook passes to `eval_metric`:
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
def rmsle(y_true, y_pred):
    """Root Mean Squared Logarithmic Error (RMSLE).

    Returns the tuple ('RMSLE', value, False); the trailing False means a
    lower value is better.
    """
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    score = np.sqrt(np.mean(log_diff ** 2))
    return 'RMSLE', score, False

print('Starting training with custom eval function...')
# train
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() was removed in LightGBM 4.0 and raises TypeError there.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
# Second custom evaluation metric, same shape as the first:
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
def rae(y_true, y_pred):
    """Relative Absolute Error (RAE).

    Total absolute error of the predictions divided by the total absolute
    deviation of `y_true` from its own mean. Returns ('RAE', value, False);
    the trailing False means a lower value is better.
    """
    abs_error = np.sum(np.abs(y_pred - y_true))
    abs_deviation = np.sum(np.abs(np.mean(y_true) - y_true))
    return 'RAE', abs_error / abs_deviation, False

print('Starting training with multiple custom eval functions...')
# train
# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() was removed in LightGBM 4.0 and raises TypeError there.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=[rmsle, rae],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
print('Starting predicting...')
# Predict on the hold-out features with the model's recorded best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# Each metric returns (name, value, is_higher_better); keep only the value.
rmsle_test, rae_test = (metric(y_test, y_pred)[1] for metric in (rmsle, rae))
print(f'The RMSLE of prediction is: {rmsle_test}')
print(f'The RAE of prediction is: {rae_test}')



In [None]:
# other scikit-learn modules
from sklearn.model_selection import GridSearchCV

# Candidate values searched exhaustively: 5 learning rates x 3 tree counts.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.2, 1],
    n_estimators=[20, 50, 100],
)
estimator = lgb.LGBMRegressor(num_leaves=31)

# 3-fold cross-validated search over the grid, fitted on the training split.
gbm = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3)
model = gbm.fit(X_train, y_train)

print(f'Best parameters found by grid search are: {gbm.best_params_}')

In [1]:
# train with best parameters:
# NOTE(review): these values are hard-coded rather than read from the grid
# search result (num_leaves=43 is not in the searched grid) -- confirm they
# are the intended final settings.
gbm = lgb.LGBMRegressor(num_leaves=43,
                        learning_rate=0.05,
                        n_estimators=20)

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of fit() was removed in LightGBM 4.0 and raises TypeError there.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

from sklearn.metrics import mean_squared_error, r2_score

# evaluate lightgbm ensemble for regression
# NOTE(review): the five imports below are not used in the visible cells; they
# are kept in case other code relies on these names.
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

print('Starting predicting...')
# Predict on the hold-out features with the model's recorded best iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)

# RMSE on the hold-out split, computed and reported once. The cell previously
# repeated this computation and print a second time.
rmse_test = mean_squared_error(y_test, y_pred) ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

# Coefficient of determination (R^2). This is a regression goodness-of-fit
# score, not a classification accuracy.
r_squared = r2_score(y_test, y_pred)

# The message keeps its original spelling so it matches the logged key.
print(f'R_sqaured is: {r_squared}')

import joblib

# Write the training feature matrix to the execution outputs as CSV.
features_csv_path = valohai.outputs().path('mydata.csv')
X_train.to_csv(features_csv_path)

# Record the R^2 score through the Valohai logger.
with valohai.logger() as metadata_logger:
    metadata_logger.log("R_sqaured", r_squared)

# Persist the fitted estimator next to the other outputs.
model_pkl_path = valohai.outputs().path('boston_model.pkl')
joblib.dump(gbm, model_pkl_path)

NameError: name 'lgb' is not defined