In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the housing data from a CSV located relative to the current working directory.
df = pd.read_csv('../data_exploration/housing.csv')

In [3]:
# Preview the first rows to check the available columns and value formats.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Build the feature-engineered frame in a single chain: add two per-person
# ratios and a combined coordinate column, then remove the raw columns they
# were derived from. `df` itself is not modified.
df2 = (
    df
    .assign(
        house_by_pop=df["households"] / df["population"],
        rooms_by_pop=df["total_rooms"] / df["population"],
        ll=df["longitude"] + df["latitude"],
    )
    .drop(columns=['households', 'population', 'longitude', 'latitude', 'total_rooms'])
)

In [6]:
# Compare the dimensions of the raw frame and the feature-engineered frame.
df.shape, df2.shape

((20640, 10), (20640, 8))

In [7]:
# Separate the prediction target from the predictors.
# NOTE: the split is built from the raw frame `df`; the engineered frame `df2`
# is not used in this cell.
target = 'median_house_value'
X, y = df.drop(columns=target), df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=0,
)

In [8]:
# Numerical features: every column of X with a numeric dtype.
numerical_cols = X.select_dtypes(include=np.number).columns

# Missing numeric values are replaced by the median; no scaler step is applied.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

In [9]:
# Categorical features: every column of X whose dtype is not numeric.
categorical_cols = X.select_dtypes(exclude=np.number).columns

# Fill gaps with the most frequent category, then one-hot encode.
# handle_unknown='ignore' stops the encoder from raising on categories that
# were not seen during fit. No scaler step is applied.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Route each column group through its own transformer:
# numeric columns -> numerical_transformer, non-numeric -> categorical_transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [11]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [12]:
# Random forest with 100 trees. The seed is fixed (same value as the
# train/test split) so that re-running the notebook reproduces the same fit,
# and score differences between runs reflect feature changes rather than
# random variation in the forest.
model = RandomForestRegressor(n_estimators=100, random_state=0)

In [13]:
# Chain preprocessing and the regressor into one estimator so that both are
# fitted on the same training data.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing steps and the model on the training split.
my_pipeline.fit(X_train, y_train)

# Apply the fitted pipeline to the held-out test split.
preds = my_pipeline.predict(X_test)

# Report the mean absolute error of the test-split predictions.
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 31470.70502180233


In [14]:
# Pipeline's built-in score on the test split.
# NOTE(review): for a regressor this should be R^2 (scikit-learn's default
# regressor metric) -- confirm against the scikit-learn docs.
my_pipeline.score(X_test, y_test)

0.8251377067228349

In [15]:
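# my_pipeline.score(X_test, y_test) values recorded from separate runs:
# 0.8251377067228349 Baseline (features taken from the raw frame df)
# 0.7893809891823669 Engineering (presumably the engineered frame df2; that run is not shown in this notebook)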