In [4]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

%matplotlib inline
path = '/Users/cm/Desktop/ml/2022/datasets/'

# Load data

In [5]:
def load_data(filename):
    """ Load dataset from filename
    Args:
        filename (string): Path to data
        
    Returns:
        df (DataFrame): Dataset from file
    """
    
    df = pd.read_csv(filename)
    df.drop(df.columns[0:2], axis=1, inplace=True)
    
    return df

filename = 'autodf.csv'
df = load_data(path+filename)
df.head(3)

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0


In [6]:
df.dtypes

normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [7]:
df['num-of-cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [8]:
df['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

# Split into Train and Test Sub-sets

In [9]:
y = df.loc[:,'highway-mpg']
df.drop(['city-mpg', 'highway-mpg'],axis=1,inplace=True)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=20)

In [11]:
# search for NaNs
for i in X_train.columns:
    nas = X_train[i].isna().sum()
    if nas>0:
        print("{0:<20}  {1}".format(i,nas))

normalized-losses     31
num-of-doors          1
bore                  4
stroke                4
horsepower            1
peak-rpm              1
price                 2


# Select columns by type
https://pbpython.com/pandas_dtypes.html

In [12]:
X_train.dtypes

normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
price                float64
dtype: object

In [13]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html
X_train.select_dtypes(include='number').columns

Index(['normalized-losses', 'wheel-base', 'length', 'width', 'height',
       'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'price'],
      dtype='object')

In [14]:
X_train.select_dtypes(exclude='number').columns

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
       'fuel-system'],
      dtype='object')

# Build custom transformer
- You may want to add some features that make sense to the data and coudl help.

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# define column indexes
horsepower_idx, engine_size_idx, length_idx, width_idx, height_idx, curb_weight_idx = 0,1,2,3,4,5

# clase definida por nosotros
class CombineAttributes(BaseEstimator, TransformerMixin):
    """ Custom transformer. Adds horsepower to weight ratio
        and optionaly the volume of the car.
        
    Args:
        add_volume (bool): Add volume to the dataset

    Returns:
        array with original and added features.
    """
    # constructor
    def __init__(self, add_volume=False):
        self.add_volume = add_volume

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        hp_to_weight_ratio = X[:,horsepower_idx]/X[:,curb_weight_idx]
        
        if self.add_volume:
            volume = X[:,length_idx]*X[:,width_idx]*X[:,height_idx]
            return np.c_[X, hp_to_weight_ratio, volume]
        
        else:
            return np.c_[X, hp_to_weight_ratio]

In [16]:
# test our custom transformer
# ---------------------------

# create instance
# objeto (GALLETA)   # clase (MOLDE)
combine_attributes = CombineAttributes(add_volume=True)

# transform numeric data
df_num_combined = combine_attributes.transform(X_train.select_dtypes(include='number').values)

# review output
df_num_combined[0,:]

array([           nan, 9.59000000e+01, 1.73200000e+02, 6.63000000e+01,
       5.02000000e+01, 2.81800000e+03, 1.56000000e+02, 3.59000000e+00,
       3.86000000e+00, 7.00000000e+00, 1.45000000e+02, 5.00000000e+03,
       1.27640000e+04,            nan, 5.76454632e+05])

# Build Pipeline
- Advantages
    - Automated workflow
    - Production Ready
    - Using code with standards
    - Easier to debug 

### Numeric

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
                            ('imputer', SimpleImputer(strategy="median")),
                            ('attribs_adder', CombineAttributes()),
                            ('std_scaler', StandardScaler()),
                        ])

In [19]:
# test numeric pipeline
# ---------------------
df_num_tr = num_pipeline.fit_transform(df.select_dtypes(include='number'))
# review output from pipeline
df_num_tr[0,:]

array([-0.17650339, -1.6907718 , -0.42652147, -0.84478235, -2.0204173 ,
       -0.01456628,  0.07444893,  0.52048915, -1.84134525, -0.28834891,
        0.17330945, -0.26498274,  0.0438547 , -0.24379062])

### Categorical
Check for categorical imputation -> https://scikit-learn.org/stable/modules/impute.html

In [20]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
                        ('imputer', SimpleImputer(strategy="most_frequent")),
                        ('oh_enc', OneHotEncoder()),
                        ])

In [21]:
# test categorical pipeline
# --------------------------
df_cat_tr = cat_pipeline.fit_transform(df.select_dtypes(exclude='number'))

# review output from pipeline
df_cat_tr.toarray()[0,:]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0.])

### Full Pipeline
- Merge both sub pipelines in a single pipeline

In [22]:
from sklearn.compose import ColumnTransformer

num_attribs = df.select_dtypes(include='number').columns
cat_attribs = df.select_dtypes(exclude='number').columns

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [None]:
#fit
# econder(data_train) -> ['a','b','c']
#                         [1,2,3]
#                         [1,0,0] [0,1,0] [0,0,1]
# enoder('d') ??? 

In [23]:
# test full pipeline
# -------------------

full_pipeline.fit(df)
X_train_processed = full_pipeline.transform(X_train)
#X_train_processed = full_pipeline.fit_transform(X_train)

print(X_train_processed.shape)

X_train_processed[0,:]

(143, 74)


array([-0.17650339, -0.47553718, -0.06900603,  0.18327172, -1.44607126,
        0.50525559,  0.70033658,  0.96461101,  1.93021627, -0.79310108,
        1.03552704, -0.26498274, -0.04914926, -0.52433745,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  

# Use pipeline and model

In [85]:
# Train a model
# -----------------------
# use a Random Forest regressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10, 12, 20], 'max_features': [2, 3, 4, 8]},
  ]

# create instance
forest_reg = RandomForestRegressor(random_state=42)

# search for best solution from combinations
grid_search = GridSearchCV(forest_reg, param_grid, cv=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train_processed, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42),
             param_grid=[{'max_features': [2, 4, 6, 8],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4, 8],
                          'n_estimators': [3, 10, 12, 20]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [86]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 10}

In [87]:
best_model = grid_search.best_estimator_
best_model

RandomForestRegressor(max_features=8, n_estimators=10, random_state=42)

In [89]:
# Test model and pipeline
# -----------------------
from sklearn.metrics import mean_squared_error

# NO ES fit_transform() 
X_test_processed = full_pipeline.transform(X_test)
final_predictions = best_model.predict(X_test_processed)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

3.5424613276738137

# Persist a model

In [90]:
full_pipeline_with_predictor = Pipeline([
        ("preparation", full_pipeline),
        ("rf_reg", RandomForestRegressor(max_features=8, n_estimators=10))
    ])


full_pipeline_with_predictor.fit(df, y)

full_pipeline_with_predictor.predict(X_test)

array([25.8, 41.9, 32.8, 27.2, 37.8, 35.5, 37.9, 24. , 23.4, 27.3, 41.2,
       29.6, 27.2, 37. , 31. , 38.3, 27.2, 37. , 31.7, 27.7, 31.9, 33.4,
       25. , 24.8, 23.8, 28.8, 37.5, 32.2, 24. , 34.4, 30.1, 37.9, 45.1,
       31.4, 45.4, 32.2, 31.9, 24.1, 30.4, 31.9, 42.4, 37. , 36.1, 29.9,
       38.9, 23.9, 27.6, 31.7, 24. , 24.5, 30. , 25. , 49.3, 45. , 39.8,
       20. , 33. , 24.2, 30.4, 29.6, 40.1, 26.7])

In [93]:
my_model = full_pipeline_with_predictor

import joblib
# persist
joblib.dump(my_model, "my_pipe_predictor.pkl")

['my_pipe_predictor.pkl']

In [94]:
!ls

Untitled.ipynb                     week_1_getting_started_colab.ipynb
Untitled1.ipynb                    week_2_1_data_exploration.ipynb
[34mdata[m[m                               week_2_2_creating_models.ipynb
hist.png                           week_3_1_pipelines.ipynb
my_pipe_predictor.pkl


In [95]:
# load from disk
my_model_loaded = joblib.load("my_pipe_predictor.pkl")

In [96]:
X_test[:2]

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,price
5,,audi,gas,std,two,sedan,fwd,front,99.8,177.3,...,ohc,five,136,mpfi,3.19,3.4,8.5,110.0,5500.0,15250.0
44,,isuzu,gas,std,two,sedan,fwd,front,94.5,155.9,...,ohc,four,90,2bbl,3.03,3.11,9.6,70.0,5400.0,


In [97]:
my_model_loaded.predict(X_test[:2])

array([25.8, 41.9])