In [1]:
import pandas as pd
import numpy as np
import preprocessing as pp
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

In [2]:
# Read the pre-split train and test CSVs from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ('housing_train.csv', 'housing_test.csv')
)

In [3]:
# Name of the column the model is trained to predict.
target = 'median_house_value'

# Separate each frame into its feature matrix (every column except the
# target) and the target series itself.
X_train, y_train = df_train.drop(columns=[target]), df_train[target]
X_test, y_test = df_test.drop(columns=[target]), df_test[target]

In [4]:
# Partition the feature columns by dtype: numeric columns go through the
# numerical branch, everything else through the categorical branch.
numerical_cols = X_train.select_dtypes(include=[np.number]).columns
categorical_cols = X_train.select_dtypes(exclude=[np.number]).columns

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time; min_frequency=0.12 is passed to the encoder as a fraction of samples.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency=0.12)),
])

# Numerical branch: project-specific steps from the local `preprocessing`
# module, followed by mean imputation and standardisation.
# NOTE(review): New_features presumably derives extra columns and
# Trim_outliers presumably clips/removes extreme values using `factor` --
# confirm against preprocessing.py.
numerical_transformer = Pipeline([
    ('new_features', pp.New_features()),
    ('outliers', pp.Trim_outliers(na=False, factor=1000)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Route each column group to its branch; the numerical block comes first in
# the combined output, followed by the categorical block.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [5]:
# Seed for the random forest. Without a fixed random_state the bootstrap
# samples and per-split feature sub-sampling change on every run, so the
# reported metrics would not be reproducible under Restart & Run All.
RANDOM_STATE = 42

# 300-tree random forest regressor with deterministic randomness.
model = RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE)

# Full pipeline: preprocessing is fitted together with the model, so the
# imputation/scaling/encoding statistics are learned from the training data
# passed to fit() only.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [6]:
# Fit preprocessing + model on the training split, then predict the
# held-out test split.
my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_test)

# Score the test predictions: mean absolute error and the coefficient of
# determination.
score = mean_absolute_error(y_test, preds)
r2score = r2_score(y_test, preds)

print('MAE:', score)
print('R2:', r2score)

MAE: 29521.891392118865
R2: 0.8432022070449338
