In [11]:
import pandas as pd
import numpy as np

# load data
train_data = pd.read_csv('../data/train.csv')

# print(train_data.head())

X_train = train_data.drop('median_house_value', axis=1)
y_train = train_data['median_house_value'].copy()

# create a validation set
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

num_features = X_train.select_dtypes(include=np.number).columns
cat_features = X_train.select_dtypes(include='object').columns

In [8]:
# import pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

num_pipeline = Pipeline([
    ('num_imputer', SimpleImputer(strategy='mean')),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('cat_imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])

# import ColumnTransformer
from sklearn.compose import ColumnTransformer

pre_processing_pipeline = ColumnTransformer([
    ('num_pipe', num_pipeline, num_features),
    ('cat_pipe', cat_pipeline, cat_features)
])

pre_processing_pipeline

In [10]:
# import model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

model_pipeline = Pipeline([
    ('pre_processing', pre_processing_pipeline),
    ('model', LinearRegression())
])

model_pipeline

In [13]:
model = model_pipeline.fit(X_train, y_train)

# validation
from sklearn.metrics import root_mean_squared_error

y_pred = model.predict(X_val)
rmse = np.sqrt(root_mean_squared_error(y_val, y_pred))

print(rmse)


264.7944008882846


In [16]:
# save model
import joblib

joblib.dump(model, '../models/model_with_pipeline.pkl')

['../models/model_with_pipeline.pkl']