# Abalone Project - Model Regression
Maria Eugênia Fonseca\
2021/09/28

In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import mlflow
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
X_train = pd.read_csv("../data/processed/abalone_xtrain.csv")
y_train = pd.read_csv("../data/processed/abalone_ytrain.csv")

X_test = pd.read_csv("../data/processed/abalone_xtest.csv")
y_test = pd.read_csv("../data/processed/abalone_ytest.csv")

In [3]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.641892,0.647059,0.111607,0.30297,0.320741,0.262014,0.18286,1.0,0.0,0.0
1,0.655405,0.630252,0.107143,0.261386,0.247778,0.188282,0.215247,0.0,1.0,0.0
2,0.439189,0.394958,0.080357,0.09559,0.088519,0.078999,0.074738,0.0,1.0,0.0
3,0.736486,0.714286,0.125,0.455086,0.464815,0.338381,0.306428,0.0,0.0,1.0
4,0.858108,0.848739,0.1875,0.724752,0.681852,0.597103,0.562531,0.0,0.0,1.0


#### Start mlflow experiment:

In [4]:
EXPERIMENT_NAME = "abalone"

try:
    mlflow.create_experiment(EXPERIMENT_NAME)
    print("CREATING")
except:
    print("ALREADY EXISTS")

mlflow.set_experiment(EXPERIMENT_NAME)

ALREADY EXISTS


#### Function to evaluate regression and log metrics in mlflow:

In [5]:
def evaluate_regression_and_log(regression_model, X_train, y_train, X_test, y_test, params):    
    # metrics - train
    y_train_pred = regression_model.predict(X_train)
    rmse_train = mean_squared_error(y_train, y_train_pred, squared=False)
    r2_train = r2_score(y_train, y_train_pred)

    # metrics - test
    y_test_pred = regression_model.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_test_pred, squared=False)
    r2_test = r2_score(y_test, y_test_pred)
    
    # log metrics    
    mlflow.start_run()

    mlflow.log_metric("rmse_train", rmse_train)
    mlflow.log_metric("r2_train", r2_train)
    mlflow.log_metric("rmse_test", rmse_test)
    mlflow.log_metric("r2_test", r2_test)

    mlflow.sklearn.log_model(regression_model, "model")
    mlflow.log_param("model_name", type(regression_model).__name__)
    mlflow.log_params(params)
    
    mlflow.end_run()

In [6]:
params_pipe = {
    "boxcox_transformer": "false",
    "minmax_scaller": "true"}

#### First model - LinearRegression

In [7]:
lm = linear_model.LinearRegression()
lm = lm.fit(X_train, y_train)

evaluate_regression_and_log(lm, X_train, y_train, X_test, y_test, params_pipe)

#### Second model - GradientBoostingRegressor default

In [8]:
params = {
    "boxcox_transformer": "false",
    "minmax_scaller": "true",
    "hyperparameter_tuning": "false"}

In [9]:
gbr_base = GradientBoostingRegressor(random_state=123)
gbr_base = gbr_base.fit(X_train, y_train)

evaluate_regression_and_log(gbr_base, X_train, y_train, X_test, y_test, params)

  return f(*args, **kwargs)


In [10]:
gbr_base.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 123,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

#### Third model - GradientBoostingRegressor with hyperparameter tuning

In [11]:
params = {
    "boxcox_transformer": "false",
    "minmax_scaller": "true",
    "hyperparameter_tuning": "true"}

In [12]:
learning_rate = [0.03, 0.04, 0.05, 0.06]
n_estimators = [140, 150, 160]
subsample = [0.7, 0.75, 0.8, 0.85]
min_samples_leaf = [10, 12, 15]
max_depth = [4, 5, 6] 
max_features = ['auto', 'sqrt']
warm_start = [True]

# Create the grid
random_grid = {'learning_rate': learning_rate,
               'n_estimators': n_estimators,
               'subsample': subsample,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf,
               'max_depth': max_depth,
               'max_features': max_features,
               'warm_start': warm_start}

In [13]:
gbr = GradientBoostingRegressor(random_state=123)

# Random search of parameters, using 3 fold cross validation, 
gbr_tuning = GridSearchCV(estimator=gbr, param_grid=random_grid, cv=3, verbose=10, n_jobs=-1)

# Fit the random search model
gbr_tuning.fit(X_train, y_train)

Fitting 3 folds for each of 864 candidates, totalling 2592 fits


  return f(*args, **kwargs)


GridSearchCV(cv=3, estimator=GradientBoostingRegressor(random_state=123),
             n_jobs=-1,
             param_grid={'learning_rate': [0.03, 0.04, 0.05, 0.06],
                         'max_depth': [4, 5, 6],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [10, 12, 15],
                         'n_estimators': [140, 150, 160],
                         'subsample': [0.7, 0.75, 0.8, 0.85],
                         'warm_start': [True]},
             verbose=10)

In [14]:
gbr_tuning.best_params_

{'learning_rate': 0.03,
 'max_depth': 6,
 'max_features': 'sqrt',
 'min_samples_leaf': 12,
 'n_estimators': 160,
 'subsample': 0.7,
 'warm_start': True}

In [15]:
gbr_tuned = gbr_tuning.best_estimator_

evaluate_regression_and_log(gbr_tuned, X_train, y_train, X_test, y_test, params)

In [16]:
feat_imp_gbr = {'column_name': X_train.columns, 
            'importance': gbr_tuned.feature_importances_}

feat_imp_gbr = pd.DataFrame(feat_imp_gbr).sort_values(by=['importance'], ascending=False)

In [17]:
f, ax = plt.subplots(figsize=(10, 8))

sns.barplot(x=feat_imp_gbr.iloc['importance'], y=feat_imp_gbr.iloc['column_name'], color='#94d1e3')

ax.text(x=0, y=1, s='Feature importance', fontsize=18, weight='bold', ha='left', va='bottom', transform=ax.transAxes)

ax.set(xlabel="Importance", ylabel="")
plt.xticks(np.arange(0, max(feat_imp_gbr[:20]['importance']), 0.1))

ax.grid(False)
sns.despine(bottom = False, left = False)
sns.set_style("whitegrid") 

#plt.savefig('/content/drive/My Drive/Maria Eugênia Fonseca/Gráficos e mapas/top20_gbr_comodidades.png', quality=100)
plt.show()

AttributeError: module 'matplotlib' has no attribute 'subplots'

#### Forth model - Lasso default