# Pipeline to automate data preprocessing and model training

## Load data

In [79]:
import seaborn as sns

In [92]:
# Load seaborn's 'mpg' dataset with the car name as the row index, and cast
# 'cylinders' to str in the same chain so it is stored as text, not as a number.
df_base = (
    sns.load_dataset('mpg', index_col='name')
    .astype({'cylinders': str})
)
df_base

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
ford mustang gl,27.0,4,140.0,86.0,2790,15.6,82,usa
vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe
dodge rampage,32.0,4,135.0,84.0,2295,11.6,82,usa
ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa


In [93]:
# Count the missing values in each column to see which features need cleaning.
df_base.isna().sum(axis=0)

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [94]:
# Boolean row mask: True for every row that has at least one missing value.
mask = df_base.isna().any(axis=1)

In [95]:
# Inspect the incomplete rows selected by the mask.
df_base.loc[mask]

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ford pinto,25.0,4,98.0,,2046,19.0,71,usa
ford maverick,21.0,6,200.0,,2875,17.0,74,usa
renault lecar deluxe,40.9,4,85.0,,1835,17.3,80,europe
ford mustang cobra,23.6,4,140.0,,2905,14.3,80,usa
renault 18i,34.5,4,100.0,,2320,15.8,81,europe
amc concord dl,23.0,4,151.0,,3035,20.5,82,usa


In [96]:
# Show the frame once more before the incomplete rows are removed.
df_base

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
ford mustang gl,27.0,4,140.0,86.0,2790,15.6,82,usa
vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe
dodge rampage,32.0,4,135.0,84.0,2295,11.6,82,usa
ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa


In [97]:
# Keep only complete rows: any row containing a missing value is discarded.
df_base = df_base.dropna(axis=0, how='any')

In [98]:
# Display the frame after the rows with missing values were dropped.
df_base

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,usa
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,usa
plymouth satellite,18.0,8,318.0,150.0,3436,11.0,70,usa
amc rebel sst,16.0,8,304.0,150.0,3433,12.0,70,usa
ford torino,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
ford mustang gl,27.0,4,140.0,86.0,2790,15.6,82,usa
vw pickup,44.0,4,97.0,52.0,2130,24.6,82,europe
dodge rampage,32.0,4,135.0,84.0,2295,11.6,82,usa
ford ranger,28.0,4,120.0,79.0,2625,18.6,82,usa


In [99]:
# Sanity check: every column should now report zero missing values.
df_base.isna().sum(axis=0)

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

## Feature selection

In [100]:
from pathlib import Path

# Persist the cleaned dataset. The target folder is created first because
# DataFrame.to_csv raises OSError when the parent directory does not exist
# (e.g. on a fresh checkout); exist_ok=True makes re-running the cell harmless.
Path('../../data').mkdir(parents=True, exist_ok=True)
df_base.to_csv('../../data/cars.csv')

In [101]:
# Separate the column to predict from the explanatory features.
target = 'mpg'

X = df_base.drop(target, axis='columns')  # every column except the target
y = df_base[target]                       # the target column on its own

## Train test split

In [102]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Pipeline

### Data preprocessing

In [103]:
# Per-column dtypes of the feature frame (kept because the numerical-feature
# cell filters on this Series).
features = X.dtypes

# Categorical features are "everything that is not numeric". Comparing dtypes
# to the literal 'object' would miss text columns stored with pandas' dedicated
# string dtype or as 'category', which would then be routed to the scaler.
features_categorical = X.select_dtypes(exclude='number').columns
features_categorical

Index(['cylinders', 'origin'], dtype='object')

In [104]:
# Numerical features are selected by numeric dtype rather than by
# "dtype != 'object'", so string- or category-typed columns can never end up
# in the list that is passed to the scaler.
features_numerical = X.select_dtypes(include='number').columns
features_numerical

Index(['displacement', 'horsepower', 'weight', 'acceleration', 'model_year'], dtype='object')

In [105]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Column-wise preprocessing: one-hot encode the categorical features and
# min-max scale the numerical ones.
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': a category that was absent from the training
        # rows (e.g. a rare cylinder count that landed only in the test split or
        # shows up in production) is encoded as all zeros instead of making
        # transform()/predict() raise.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), features_categorical),
        ('scaler', MinMaxScaler(), features_numerical)
    ])

### Model

In [106]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Fixed seed so that repeated runs of the search select and fit the same model;
# 42 matches the seed used for the train/test split.
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid: 3 * 4 * 4 = 48 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 7, 9, 11],
    'learning_rate': [0.001, 0.01, 0.1, 1]
}

# Exhaustive search over the grid using GridSearchCV's default
# cross-validation and scoring settings.
cv = GridSearchCV(model, param_grid)

## All together

In [107]:
# Confirm that the training features contain no missing values before fitting.
X_train.isna().sum(axis=0)

cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
dtype: int64

In [108]:
# Preview the training target (mpg values indexed by car name).
y_train

name
datsun 610                22.0
ford maverick             21.0
datsun 510 hatchback      37.0
amc gremlin               21.0
chevrolet nova            15.0
                          ... 
amc matador (sw)          15.0
amc gremlin               18.0
oldsmobile starfire sx    23.8
ford escort 2h            29.9
chevrolet impala          11.0
Name: mpg, Length: 274, dtype: float64

In [109]:
# Fit the preprocessor on the training features and keep the transformed matrix.
# NOTE(review): X_train_pre is not referenced by any later cell visible here —
# this looks like a standalone check that preprocessing runs; confirm before removing.
X_train_pre = preprocessor.fit_transform(X_train)

In [111]:
from sklearn.pipeline import Pipeline

# Chain the preprocessing step and the grid-searched regressor into a single
# estimator, so raw feature frames can be passed straight to fit/predict.
# NOTE(review): the preprocessor is fitted on all of X_train before the grid
# search splits its folds, so scaling/encoding statistics are shared across
# folds — confirm this is acceptable.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cv),
])

# Fit on the training split; as the last expression, the fitted pipeline is
# also displayed as the cell output.
pipeline.fit(X_train, y_train)

## Export model to use in production

In [113]:
import pickle
from pathlib import Path

path = '../../artifacts/pipeline_mpg.pkl'

# open(..., 'wb') fails with FileNotFoundError when the artifacts folder does
# not exist yet (e.g. on a fresh checkout), so create it first; exist_ok=True
# makes re-running the cell harmless.
Path(path).parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted pipeline (preprocessing + grid-searched model) to disk.
with open(path, 'wb') as file:
    pickle.dump(pipeline, file)