In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
import preprocessing as pp

In [4]:
df = pd.read_csv('housing_train.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.15,34.14,45.0,543.0,191.0,454.0,181.0,2.3,55000.0,<1H OCEAN
1,-118.14,33.87,44.0,1661.0,315.0,985.0,319.0,4.3942,219500.0,<1H OCEAN
2,-122.31,37.6,34.0,3225.0,726.0,1958.0,656.0,3.6811,273000.0,NEAR BAY
3,-122.13,37.72,25.0,1134.0,153.0,340.0,171.0,6.5095,371200.0,NEAR BAY
4,-117.94,33.82,24.0,4735.0,955.0,2600.0,868.0,5.0764,228600.0,<1H OCEAN


In [5]:
target = 'median_house_value'
y = df[target]
X = df.drop(columns=target)

In [48]:
categorical_cols = X.select_dtypes(exclude=np.number).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency = 0.12))
])

numerical_cols = X.select_dtypes(include=np.number).columns
numerical_transformer = Pipeline(steps=[
    ('new_features', pp.New_features()),
    ('imputer', SimpleImputer()),
    ('scaler',  StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [7]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import cross_val_score

In [18]:
model = RandomForestRegressor()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [20]:
-(cross_val_score(estimator=my_pipeline, X=X, y=y, scoring='neg_mean_absolute_error', cv=5).mean())

28911.880557678618

In [21]:
model = LinearRegression()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

-(cross_val_score(estimator=my_pipeline, X=X, y=y, scoring='neg_mean_absolute_error', cv=5).mean())

44014.30262741281

In [8]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', pp.ModSwitcher())
                          ])

In [17]:
param_grid = [{'preprocessor__num__imputer__strategy':['mean','median'],
              'model__estimator': [RandomForestRegressor()],
              'model__estimator__n_estimators': [100, 200, 250, 300]},
              
              {'preprocessor__num__imputer__strategy':['mean','median'],
              'model__estimator': [DecisionTreeRegressor()],
              'model__estimator__max_depth': [5,7,10,20,50,100,150]},
              
              {'preprocessor__num__imputer__strategy':['mean','median'],
              'model__estimator': [LinearRegression()]}
             ]

In [18]:
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)

In [19]:
gs.fit(X,y)

In [21]:
gs.best_params_

{'model__estimator': RandomForestRegressor(n_estimators=250),
 'model__estimator__n_estimators': 250,
 'preprocessor__num__imputer__strategy': 'median'}

In [26]:
-(gs.best_score_)

28736.772877327294

In [28]:
rs = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, scoring='neg_mean_absolute_error', cv=5)

In [29]:
rs.fit(X,y)

In [32]:
rs.best_params_

{'preprocessor__num__imputer__strategy': 'median',
 'model__estimator__n_estimators': 200,
 'model__estimator': RandomForestRegressor(n_estimators=200)}

In [34]:
-rs.best_score_

28736.3302692848

In [35]:
df_test = pd.read_csv('housing_test.csv')
df_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.17,34.67,5.0,8352.0,1555.0,3723.0,1389.0,4.5659,140300.0,INLAND
1,-117.24,34.04,4.0,4289.0,682.0,1981.0,705.0,5.3366,165100.0,INLAND
2,-118.4,33.85,29.0,2085.0,533.0,919.0,489.0,5.6017,430000.0,<1H OCEAN
3,-118.28,33.8,38.0,1471.0,329.0,1207.0,335.0,4.0,165500.0,<1H OCEAN
4,-122.48,37.76,50.0,2236.0,484.0,1171.0,467.0,4.0977,322100.0,NEAR BAY


In [36]:
target = 'median_house_value'
y_test = df_test[target]
X_test = df_test.drop(columns=target)

In [37]:
preds = rs.predict(X_test)

In [38]:
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 30227.157176598837


In [49]:
X_preprocessed = preprocessor.fit_transform(X)
model = RandomForestRegressor(n_estimators=200)
model.fit(X_preprocessed,y)

In [51]:
#plt.barh(preprocessor.get_feature_names_out(), model.feature_importances_)

MAE rf: 26890.958650123197 <br>
MAE dt: 36581.88456829316 <br>
MAE lr: 41446.41761813622 <br>