In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('housing_train.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.16,33.84,36.0,2444.0,432.0,1199.0,424.0,4.1538,218800.0,<1H OCEAN
1,-121.83,37.34,26.0,1848.0,339.0,1952.0,327.0,4.087,182500.0,<1H OCEAN
2,-118.01,34.12,32.0,1937.0,332.0,922.0,340.0,3.94,278400.0,INLAND
3,-116.31,33.73,19.0,12467.0,2508.0,4086.0,1761.0,3.2846,131900.0,INLAND
4,-118.17,33.92,36.0,2447.0,503.0,1532.0,498.0,4.3667,171800.0,<1H OCEAN


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
target = 'median_house_value'
y_train = df[target]
X_train = df.drop(columns=target)

In [7]:
# numerical data
numerical_cols = X_train.select_dtypes(include=np.number).columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
    #('scaler', StandardScaler())
])

In [8]:
# Preprocessing for categorical data
categorical_cols = X_train.select_dtypes(exclude=np.number).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    #('scaler', StandardScaler())
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [10]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [11]:
model = RandomForestRegressor(n_estimators = 300)

In [12]:
df_test = pd.read_csv('housing_test.csv')

target = 'median_house_value'
y_test = df_test[target]
X_test = df_test.drop(columns=target)

### Score basic preprocessing

In [13]:
import numpy as np

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 31003.16097383721


In [14]:
my_pipeline.score(X_test,y_test)

0.8287250281479022

### Score predicting the mean

In [16]:
y_mean = np.repeat(y_test.mean(), len(y_test))

In [17]:
score = mean_absolute_error(y_test, y_mean)
print('MAE:', score)

MAE: 90767.21115510073


In [13]:
# 0.8251377067228349 Baseline 
# 0.7902212066730406 Engineering 35284.10970930233