In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from joblib import load

In [2]:
# Raise pandas' display limits to 999 rows / 999 columns so the frames shown
# below are not elided until they exceed those sizes.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
# Load the housing dataset from a path relative to this notebook.
# The file is a plain comma-separated table with a header row, which is what
# read_csv assumes by default, so no extra parsing arguments are needed.
csv_in = '../ex08/data/housing.csv'
df = pd.read_csv(csv_in)

# Quick sanity check of what was loaded: dimensions, dtypes / null counts, and
# the first few rows.
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
df.info()
display(df.head())

(20640, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
None


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
# Discard every row that has a missing value in at least one column.
# Per the info() report above, total_bedrooms is the only column with nulls
# (20433 non-null out of 20640 entries).
# Note that this rebinds `df`: from here on it refers to the cleaned frame.
df = df.dropna(axis='index', how='any')
display(df)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [5]:
# Separate the regression target (median_house_value) from the predictors.
# y is the target column; X is every other column of the cleaned frame.
y = df['median_house_value']
X = df.drop(columns=['median_house_value'])

# Show shape and a preview of each piece.
print('X:',X.shape)
display(X.head())
print('y:',y.shape)
print(y.head())

X: (20433, 9)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,NEAR BAY


y: (20433,)
0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: median_house_value, dtype: float64


In [6]:
# Encode the ocean_proximity categories as integer codes.
# NOTE(review): the codes are arbitrary labels, not a measured ordering; they
# are kept unchanged because the saved model loaded further down presumably
# expects exactly this encoding -- confirm against the training notebook.
ocean_proximity_codes = {
    'ISLAND': 1,
    'NEAR BAY': 2,
    'NEAR OCEAN': 3,
    '<1H OCEAN': 4,
    'INLAND': 5,
}

# Series.replace() on an object column only yields integers through a silent
# object -> int downcast, which recent pandas versions deprecate (FutureWarning).
# Without that downcast the column would stay non-numeric and the later
# get_dummies() call would one-hot encode it, changing the feature layout.
# Mapping explicitly and casting makes the integer dtype deliberate:
#   * values that are already codes pass through unchanged, so re-running this
#     cell is harmless;
#   * an unexpected category makes astype() raise instead of slipping through
#     as text.
X['ocean_proximity'] = (
    X['ocean_proximity']
    .map(lambda value: ocean_proximity_codes.get(value, value))
    .astype('int64')
)
display(X.head())

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,2
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,2
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,2
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,2


In [7]:
# Run the features through get_dummies (dropping the first level of each
# encoded column). ocean_proximity was already converted to integer codes, and
# the printed shape matches X's (20433, 9), i.e. no dummy columns are created
# here; X_dumm is the name the following cells use for the model inputs.
X_dumm = pd.get_dummies(data=X, drop_first=True)
print('X_dumm:', X_dumm.shape)
display(X_dumm.head())

X_dumm: (20433, 9)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,2
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,2
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,2
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,2
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,2


In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_dumm,
    y,
    test_size=0.2,
    random_state=9,
)

In [9]:
# Confirm the size of the hold-out features and targets (one per line).
print(X_test.shape, y_test.shape, sep='\n')

(4087, 9)
(4087,)


In [10]:
# Load a previously saved regressor from 'rfr_<tag>.joblib' in the current
# working directory and print its configuration.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
tag = 'house'
model_file = 'rfr_{}.joblib'.format(tag)
rfr = load(filename=model_file)
print(rfr)

RandomForestRegressor(n_estimators=1000, random_state=9)


In [11]:
# Predict house values for the hold-out rows with the loaded model.
y_test_pred = rfr.predict(X=X_test)

In [12]:
# Score the hold-out predictions: mean squared error and its square root
# (RMSE, in the same units as the target).
# NOTE(review): this is only an unbiased estimate if the saved model was not
# fitted on any of these hold-out rows -- confirm against the training notebook.
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse = np.sqrt(mse)
print('MSE, RMSE for test data:', mse, rmse)

MSE, RMSE for test data: 2548273192.929617 50480.42385845841


In [13]:
# Build a one-row frame describing a single hand-written district, with the
# values listed in the same order as the columns of X_dumm.
# Note: this rebinds X_test, which until now held the hold-out split; cells
# above that use X_test will see this single row if they are re-run afterwards.
sample_values = [-122.2, 37.8, 30.0, 1400, 200, 700, 200, 7, 3]
X_test = pd.DataFrame([sample_values], columns=X_dumm.columns)
print('X for prediction:')
display(X_test)

X for prediction:


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
0,-122.2,37.8,30.0,1400,200,700,200,7,3


In [14]:
# Predict the house value for the single hand-built row and print it.
# y_test_pred is rebound here and no longer holds the hold-out predictions.
y_test_pred = rfr.predict(X=X_test)
print('Predicted y:', y_test_pred, sep='\n')

Predicted y:
[378262.383]
