# Abalone Project - Data preparation Pipeline for classification
Maria Eugênia Fonseca\
2021/10/05

In [12]:
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from feature_engine.encoding import OneHotEncoder
from feature_engine.transformation import BoxCoxTransformer

In [2]:
# The raw file is read with header=None, so the column names are supplied
# explicitly, in file order.
col_names = [
    'sex',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
    'rings',
]

data = pd.read_csv("../data/raw/abalone_data.txt", names=col_names, header=None)

In [3]:
# Build the classification target `age` from the ring count (rings + 1.5)
# and bucket it into three ordered classes with right-closed intervals:
#   (0, 8] -> "young", (8, 14] -> "middle age", (14, max] -> "old".
age_numeric = data['rings'] + 1.5

# Series.max() skips missing values; the builtin max() iterates the Series
# element by element and its result depends on where a NaN happens to sit.
age_bin_edges = [0, 8, 14, age_numeric.max()]
data['age'] = pd.cut(age_numeric, bins=age_bin_edges, labels=["young", "middle age", "old"])

# `rings` is now encoded in the target, so it is removed from the frame.
# Reassignment is used instead of inplace=True.
data = data.drop(columns='rings')

In [4]:
# Number of observations in each age class (most frequent first).
age_counts = data['age'].value_counts()
age_counts

middle age    3036
old            693
young          448
Name: age, dtype: int64

In [5]:
# Keep only the observations whose recorded height is different from zero.
has_nonzero_height = data['height'].ne(0)
data = data.loc[has_nonzero_height]

In [6]:
# Separate the predictors from the `age` target.
y = data['age']
X = data.drop(columns='age')

# Hold out 20% of the rows for testing. The split is stratified on the target
# so both partitions keep the same class proportions, and the seed is fixed
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

### Config

In [7]:
# Columns handled by the one-hot encoding step of the pipeline.
CATEGORICAL_VARS = ['sex']

# Numerical measurement columns.
# NOTE(review): the name suggests these are meant for a Box-Cox transform,
# but this list is not referenced by the pipeline cell in this notebook.
NUMERICAL_BOXCOX_VARS = [
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
]

### Pipeline
- One-hot encoding
- Scaler

In [8]:
abalone_dataprep_pipeline = Pipeline(steps=[
    # == CATEGORICAL ENCODING
    ('one_hot_encoder', OneHotEncoder(
    variables=CATEGORICAL_VARS, drop_last=False)),
    
    # === SCALLER
    ('minmax_scaller', MinMaxScaler())
    
])

In [9]:
# Fit the preparation pipeline on the training split and transform it.
train_prepared = abalone_dataprep_pipeline.fit_transform(X_train, y_train)

# NOTE(review): no columns/index are passed here, so if the final scaler
# returns a plain array the frame gets default integer column labels and a
# fresh 0..n-1 index - confirm downstream code does not expect names.
X_train_pipe = pd.DataFrame(train_prepared)

In [10]:
# Apply the pipeline to the test split with transform only (no refit), then
# wrap the result in a DataFrame.
test_prepared = abalone_dataprep_pipeline.transform(X_test)
X_test_pipe = pd.DataFrame(test_prepared)

In [11]:
# Persist the prepared features and the targets of both splits. The index is
# not written, so rows in each X/y pair are matched by position.
processed_outputs = {
    "../data/processed/abalone_xtrain_class.csv": X_train_pipe,
    "../data/processed/abalone_ytrain_class.csv": y_train,
    "../data/processed/abalone_xtest_class.csv": X_test_pipe,
    "../data/processed/abalone_ytest_class.csv": y_test,
}

for csv_path, table in processed_outputs.items():
    table.to_csv(csv_path, index=False)