In [42]:
import pandas as pd
# Load the housing table from the relative path 'housing.csv'.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [43]:
# Summarize columns, dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [44]:
# Replace missing total_bedrooms values with the column mean.
# NOTE(review): the mean is computed over the full frame before the train/test
# split performed later, so held-out rows contribute to it — consider imputing
# inside the pipeline instead.
df['total_bedrooms'] = df['total_bedrooms'].fillna(
    value=df['total_bedrooms'].mean()
)


In [45]:
# Re-check non-null counts after the total_bedrooms fill.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
import joblib
from sklearn.pipeline import Pipeline

In [47]:
# Target is median_house_value; every other column is used as a feature.
y = df['median_house_value']
x = df.drop(columns=['median_house_value'])

In [48]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split (and every metric computed from it) reproducible across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


In [49]:
# Transformers for the two column groups: scaling for numeric columns,
# one-hot encoding for categorical columns.
sc = StandardScaler()
# handle_unknown='ignore' encodes a category that was absent from the training
# split as all zeros instead of raising an error at transform/predict time.
oe = OneHotEncoder(handle_unknown='ignore')
# Column groups are derived from the dtypes of the training frame.
numerical_features = x_train.select_dtypes(include=['float']).columns
cat_features = x_train.select_dtypes(include=['object']).columns

In [50]:
# Route each column group to its transformer:
# 'num' -> scaler on the numeric columns, 'cat' -> encoder on the categorical ones.
preprocessor = ColumnTransformer([
    ('num', sc, numerical_features),
    ('cat', oe, cat_features),
])

In [51]:
# Chain preprocessing and the regressor so both are fitted, applied and
# persisted as a single object.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Fixed seed so the fitted forest (and the reported error) is reproducible.
    ('regressor', RandomForestRegressor(random_state=42))
])

In [52]:
# Fit the preprocessing steps and the regressor on the training split.
model.fit(X=x_train, y=y_train)


In [53]:
# Predict house values for the held-out rows.
predictions = model.predict(X=x_test)

In [54]:
# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

In [55]:
import numpy as np
# The printed value is the square root of the MSE, i.e. the RMSE (same units
# as the target), so label it accordingly.
print('RMSE', np.sqrt(mse))

MSE 48358.60852148911


In [56]:
# Persist the fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(value=model, filename='house_pricing.joblib')

['house_pricing.joblib']

In [57]:
# Single-row frame describing one house, using the same feature column names
# the model was trained on (no target column).
new_house = pd.DataFrame([
    {
        'longitude': -122.23,
        'latitude': 37.88,
        'housing_median_age': 1,
        'total_rooms': 6,
        'total_bedrooms': 3,
        'population': 6,
        'households': 126,
        'median_income': 7.0,
        'ocean_proximity': 'INLAND',
    },
])
        

In [58]:
# Reload the persisted pipeline from disk.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
loaded_model = joblib.load(filename='house_pricing.joblib')

In [59]:
# Predict with the pipeline reloaded from disk (not the in-memory `model`),
# so this cell actually exercises the persisted artifact.
predictions = loaded_model.predict(new_house)

In [60]:
# new_house has a single row, so the first prediction is its estimated value.
print('Price:', predictions[0])

Price: 309845.0
