In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
import preprocessing as pp

In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import cross_val_score

In [5]:
df = pd.read_csv('housing_train.csv')
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-118.16,33.84,36.0,2444.0,432.0,1199.0,424.0,4.1538,218800.0,<1H OCEAN
1,-121.83,37.34,26.0,1848.0,339.0,1952.0,327.0,4.087,182500.0,<1H OCEAN
2,-118.01,34.12,32.0,1937.0,332.0,922.0,340.0,3.94,278400.0,INLAND
3,-116.31,33.73,19.0,12467.0,2508.0,4086.0,1761.0,3.2846,131900.0,INLAND
4,-118.17,33.92,36.0,2447.0,503.0,1532.0,498.0,4.3667,171800.0,<1H OCEAN


In [6]:
target = 'median_house_value'
y = df[target]
X = df.drop(columns=target)

In [22]:
categorical_cols = X.select_dtypes(exclude=np.number).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency = 0.12))
])

numerical_cols = X.select_dtypes(include=np.number).columns
numerical_transformer = Pipeline(steps=[
    ('new_features', pp.New_features()),
    ('outliers', pp.Trim_outliers(na=True, factor=1000)),
    ('imputer', SimpleImputer()),
    ('scaler',  StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [23]:
model = RandomForestRegressor()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

In [24]:
-(cross_val_score(estimator=my_pipeline, X=X, y=y, scoring='neg_mean_absolute_error', cv=5).mean())

30459.500584497546

Dropping rows : 28911 <br>
No rows dropping : 30584 <br>
Dropping target : 30450

In [25]:
model = LinearRegression()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

-(cross_val_score(estimator=my_pipeline, X=X, y=y, scoring='neg_mean_absolute_error', cv=5).mean())

48391.12843180025

In [26]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', pp.ModSwitcher())
                          ])

In [27]:
param_grid = [{'preprocessor__num__imputer__strategy':['mean','median'],
               'preprocessor__num__outliers__na':[True,False],
               'preprocessor__num__outliers__factor':[3,6,9,13,1000],
              'model__estimator': [RandomForestRegressor()],
              'model__estimator__n_estimators': [100, 200, 250, 300]},
              
              {'preprocessor__num__imputer__strategy':['mean','median'],
               'preprocessor__num__outliers__na':[True,False],
               'preprocessor__num__outliers__factor':[3,6,9,13,1000],
              'model__estimator': [DecisionTreeRegressor()],
              'model__estimator__max_depth': [5,7,10,20,50,100,150]},
              
              {'preprocessor__num__imputer__strategy':['mean','median'],
               'preprocessor__num__outliers__na':[True,False],
               'preprocessor__num__outliers__factor':[3,6,9,13,1000],
              'model__estimator': [LinearRegression()]}
             ]

In [28]:
gs = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)

In [29]:
gs.fit(X,y)

In [30]:
gs.best_params_

{'model__estimator': RandomForestRegressor(n_estimators=300),
 'model__estimator__n_estimators': 300,
 'preprocessor__num__imputer__strategy': 'mean',
 'preprocessor__num__outliers__factor': 1000,
 'preprocessor__num__outliers__na': True}

In [31]:
-(gs.best_score_)

30257.98733006048

In [32]:
rs = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, scoring='neg_mean_absolute_error', cv=5)

In [33]:
rs.fit(X,y)

In [34]:
rs.best_params_

{'preprocessor__num__outliers__na': False,
 'preprocessor__num__outliers__factor': 1000,
 'preprocessor__num__imputer__strategy': 'mean',
 'model__estimator__n_estimators': 100,
 'model__estimator': RandomForestRegressor()}

In [35]:
-rs.best_score_

30387.094891237397

In [36]:
df_test = pd.read_csv('housing_test.csv')
df_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.47,37.74,52.0,3688.0,640.0,1605.0,567.0,4.9537,365600.0,NEAR BAY
1,-117.9,34.06,35.0,1313.0,194.0,599.0,209.0,7.5,287200.0,<1H OCEAN
2,-118.17,34.04,38.0,385.0,102.0,402.0,95.0,1.625,129700.0,<1H OCEAN
3,-117.89,33.77,35.0,1799.0,343.0,1239.0,368.0,3.9219,189600.0,<1H OCEAN
4,-116.9,32.9,19.0,3090.0,552.0,1621.0,520.0,4.0806,189200.0,<1H OCEAN


In [38]:
target = 'median_house_value'
y_test = df_test[target]
X_test = df_test.drop(columns=target)

In [39]:
preds = rs.predict(X_test)

In [40]:
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 29641.96653343023


In [32]:
pipeline.fit(X,y,model__estimator=RandomForestRegressor(n_estimators=200))

In [33]:
preds = pipeline.predict(X_test)

In [34]:
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 29718.2707122093


In [51]:
#plt.barh(preprocessor.get_feature_names_out(), model.feature_importances_)

When dropping all outliers: <br>

MAE rf: 26890.958650123197 <br>
MAE dt: 36581.88456829316 <br>
MAE lr: 41446.41761813622 <br>