# Abalone Project - Data preparation Pipeline
Maria Eugênia Fonseca\
2021/09/25

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from feature_engine.encoding import OneHotEncoder
from feature_engine.transformation import BoxCoxTransformer

import mlflow

In [2]:
# Column names for the raw file, assigned positionally because it is read without a header row.
col_names = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
    'rings',
]

# Load the raw abalone data; header=None tells pandas the first line is data, not column names.
data = pd.read_csv(
    "../data/raw/abalone_data.txt",
    names=col_names,
    header=None,
)

In [3]:
# Creating Age variable (age = rings + 1.5) and dropping the source column.
# A new frame is built with assign/drop instead of mutating the loaded frame
# with `inplace=True`; the resulting columns and their order are unchanged.
data = data.assign(age=data['rings'] + 1.5).drop(columns='rings')

In [4]:
# Keep only the observations whose recorded height is not zero.
has_nonzero_height = data['height'].ne(0)
data = data.loc[has_nonzero_height]

In [5]:
# Separate the target (age) from the predictors.
y = data['age']
X = data.drop(columns='age')

# Hold out 20% of the rows for testing; the split is stratified on the sex column
# and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=data['sex'],
)

### Config

In [6]:
# Columns handed to the one-hot encoder.
CATEGORICAL_VARS = ['sex']

# Numeric columns handed to the Box-Cox transformer.
NUMERICAL_BOXCOX_VARS = [
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
]

### Pipeline 1
- One hot encoding

In [7]:
# Pipeline 1: categorical encoding only.
abalone_dataprep_pipeline1 = Pipeline(
    steps=[
        # One-hot encode the categorical columns (drop_last=False).
        (
            'one_hot_encoder',
            OneHotEncoder(drop_last=False, variables=CATEGORICAL_VARS),
        ),
    ]
)

In [8]:
# Fit pipeline 1 on the training split and keep the transformed training features.
X_train_pipe1 = abalone_dataprep_pipeline1.fit_transform(X=X_train, y=y_train)

### Pipeline 2
- One hot encoding
- BoxCox Transformer

In [9]:
# Pipeline 2: categorical encoding followed by a Box-Cox transformation.
abalone_dataprep_pipeline2 = Pipeline(
    steps=[
        # One-hot encode the categorical columns (drop_last=False).
        (
            'one_hot_encoder',
            OneHotEncoder(drop_last=False, variables=CATEGORICAL_VARS),
        ),
        # Box-Cox transform the listed numeric columns.
        (
            'boxcox_transformer',
            BoxCoxTransformer(variables=NUMERICAL_BOXCOX_VARS),
        ),
    ]
)

In [10]:
# Fit pipeline 2 on the training split and keep the transformed training features.
X_train_pipe2 = abalone_dataprep_pipeline2.fit_transform(X=X_train, y=y_train)

### Pipeline 3
- One hot encoding
- BoxCox Transformer
- Scaler

In [11]:
# Pipeline 3: categorical encoding, Box-Cox transformation, then min-max scaling.
abalone_dataprep_pipeline3 = Pipeline(
    steps=[
        # One-hot encode the categorical columns (drop_last=False).
        (
            'one_hot_encoder',
            OneHotEncoder(drop_last=False, variables=CATEGORICAL_VARS),
        ),
        # Box-Cox transform the listed numeric columns.
        (
            'boxcox_transformer',
            BoxCoxTransformer(variables=NUMERICAL_BOXCOX_VARS),
        ),
        # Scaler applied to the output of the previous steps.
        ('minmax_scaller', MinMaxScaler()),
    ]
)

In [12]:
# Fit pipeline 3 on the training split and keep the transformed training features.
X_train_pipe3 = abalone_dataprep_pipeline3.fit_transform(X=X_train, y=y_train)

### Pipeline 4
- One hot encoding
- Scaler

In [13]:
# Pipeline 4: categorical encoding followed by min-max scaling (no Box-Cox step).
abalone_dataprep_pipeline4 = Pipeline(
    steps=[
        # One-hot encode the categorical columns (drop_last=False).
        (
            'one_hot_encoder',
            OneHotEncoder(drop_last=False, variables=CATEGORICAL_VARS),
        ),
        # Scaler applied to the output of the encoding step.
        ('minmax_scaller', MinMaxScaler()),
    ]
)

In [14]:
# Fit pipeline 4 on the training split and keep the transformed training features.
X_train_pipe4 = abalone_dataprep_pipeline4.fit_transform(X=X_train, y=y_train)

### Linear regression

In [15]:
EXPERIMENT_NAME = "abalone"

# Create the experiment only when it does not exist yet. An explicit lookup replaces
# the previous bare `except:`, which printed "ALREADY EXISTS" for *any* failure of
# create_experiment and so could hide unrelated errors. The name is taken from
# EXPERIMENT_NAME instead of repeating the literal.
if mlflow.get_experiment_by_name(EXPERIMENT_NAME) is None:
    mlflow.create_experiment(EXPERIMENT_NAME)
    print("CREATING")
else:
    print("ALREADY EXISTS")

# Make this experiment the active one for the runs started below.
mlflow.set_experiment(EXPERIMENT_NAME)

ALREADY EXISTS


In [16]:
def evaluate_regression_and_log(regression_model, X_train, y_train, params):
    """Fit a regression model, compute training metrics and log them to MLflow.

    Parameters
    ----------
    regression_model
        Estimator exposing ``fit`` and ``predict``; it is fitted on the data
        passed in.
    X_train
        Training features handed to ``fit`` and ``predict``.
    y_train
        Training target values.
    params : dict
        Additional run parameters, forwarded to ``mlflow.log_params``.

    Returns
    -------
    None
        The function works through side effects: one MLflow run is recorded
        containing the metrics ``rmse_train`` and ``r2_train``, the fitted model
        under the artifact path ``"model"``, the parameter ``model_name`` (the
        estimator's class name) and every entry of ``params``.
    """
    regression_model = regression_model.fit(X_train, y_train)

    # metrics - train
    y_train_pred = regression_model.predict(X_train)
    # RMSE is computed as sqrt(MSE) rather than through `squared=False`, which is
    # deprecated and later removed in recent scikit-learn releases. For a
    # single-output target the value is the same.
    rmse_train = mean_squared_error(y_train, y_train_pred) ** 0.5
    r2_train = r2_score(y_train, y_train_pred)

    # log
    # The context manager ends the run even if a logging call raises, so a failure
    # here cannot leave an active run behind that would break the next call.
    with mlflow.start_run():
        mlflow.log_metric("rmse_train", rmse_train)
        mlflow.log_metric("r2_train", r2_train)

        mlflow.sklearn.log_model(regression_model, "model")
        mlflow.log_param("model_name", type(regression_model).__name__)
        mlflow.log_params(params)

In [17]:
# One LinearRegression estimator; this same instance is passed to (and re-fitted by)
# each evaluate_regression_and_log call in the cells below.
lm = LinearRegression()

#### Pipeline 1

In [18]:
# Run parameters describing pipeline 1: no Box-Cox step, no min-max scaling.
params_pipe1 = dict(
    boxcox_transformer="false",
    minmax_scaller="false",
)

In [19]:
# Fit the linear model on the pipeline 1 features and log the run.
evaluate_regression_and_log(
    regression_model=lm,
    X_train=X_train_pipe1,
    y_train=y_train,
    params=params_pipe1,
)

#### Pipeline 2

In [20]:
# Run parameters describing pipeline 2: Box-Cox step applied, no min-max scaling.
params_pipe2 = dict(
    boxcox_transformer="true",
    minmax_scaller="false",
)

In [21]:
# Fit the linear model on the pipeline 2 features and log the run.
evaluate_regression_and_log(
    regression_model=lm,
    X_train=X_train_pipe2,
    y_train=y_train,
    params=params_pipe2,
)

#### Pipeline 3

In [22]:
# Run parameters describing pipeline 3: Box-Cox step and min-max scaling both applied.
params_pipe3 = dict(
    boxcox_transformer="true",
    minmax_scaller="true",
)

In [23]:
# Fit the linear model on the pipeline 3 features and log the run.
evaluate_regression_and_log(
    regression_model=lm,
    X_train=X_train_pipe3,
    y_train=y_train,
    params=params_pipe3,
)

#### Pipeline 4

In [24]:
# Run parameters describing pipeline 4: no Box-Cox step, min-max scaling applied.
params_pipe4 = dict(
    boxcox_transformer="false",
    minmax_scaller="true",
)

In [25]:
# Fit the linear model on the pipeline 4 features and log the run.
evaluate_regression_and_log(
    regression_model=lm,
    X_train=X_train_pipe4,
    y_train=y_train,
    params=params_pipe4,
)

Applying the Box-Cox transformer and/or the min-max scaler made little difference to the training metrics. We therefore keep only the min-max scaler (Pipeline 4) in the next steps.

In [26]:
# Re-fit pipeline 4 on the training split and wrap the transformed features in a DataFrame.
# NOTE(review): the frame is built without explicit column names, so it presumably gets
# default integer labels if the scaler returns a plain array — confirm this is intended.
X_train_pipe4 = pd.DataFrame(
    data=abalone_dataprep_pipeline4.fit_transform(X=X_train, y=y_train)
)

In [27]:
# Transform the held-out split with pipeline 4 (transform only, no fitting on test data)
# and wrap the result in a DataFrame.
X_test_pipe4 = pd.DataFrame(
    data=abalone_dataprep_pipeline4.transform(X_test)
)

In [28]:
# Write the processed train/test features and targets to CSV, without the index column.
for _frame, _destination in (
    (X_train_pipe4, "../data/processed/abalone_xtrain.csv"),
    (y_train, "../data/processed/abalone_ytrain.csv"),
    (X_test_pipe4, "../data/processed/abalone_xtest.csv"),
    (y_test, "../data/processed/abalone_ytest.csv"),
):
    _frame.to_csv(_destination, index=False)