In [1]:
# Import Libraries
import numpy as np
import pandas as pd

In [2]:
# Load the housing data from the CSV file in the working directory
csv_path = "housing.csv"
dataset = pd.read_csv(csv_path)
dataset.head(n=5)  # preview the first five observations

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [3]:
# Flag every column that holds at least one missing (NaN) value
missing_mask = dataset.isna()
missing_mask.any(axis=0)

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
ocean_proximity       False
median_house_value    False
dtype: bool

In [5]:
# Split the frame: every column except the last is a feature, the last column is the label
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]

features = feature_frame.values
label = target_series.values[:, np.newaxis]  # column vector, one row per observation
print("Sample feature vector: ", features[1])

Sample feature vector:  [-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 'NEAR BAY']


In [6]:
# Replace missing total_bedrooms entries (feature column 4) with the column mean
from sklearn.impute import SimpleImputer

bedrooms_cols = [4]  # list index keeps the slice 2-D, as the imputer requires
imputerNaN = SimpleImputer(strategy="mean")
imputerNaN.fit(features[:, bedrooms_cols])
features[:, bedrooms_cols] = imputerNaN.transform(features[:, bedrooms_cols])

In [7]:
# Encode the categorical ocean_proximity feature (column 8): integer codes first, then one-hot.
# NOTE: `features` is rebound from an ndarray to a list of lists at the end of this cell,
# so the cell cannot be re-run on its own; re-run the cells above it first.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

print("Features requiring encoding: ", set(features[:,8]))
# Map each category string to an integer code (kept so the printed sample below stays the same).
features[:,8] = LabelEncoder().fit_transform(features[:,8])
print("Sample feature vector: ", features[1])
print("No. of features: ", len(features[1]))

# The list in the transformer tuple ([8]) selects the column to one-hot encode;
# remainder="passthrough" keeps every other column, placed after the encoded ones.
# sparse_threshold=0 forces a dense ndarray: with the default threshold a sufficiently
# sparse result is returned as a SciPy sparse matrix, which has no .tolist().
ct = ColumnTransformer(
    [("Name", OneHotEncoder(), [8])],
    remainder="passthrough",
    sparse_threshold=0,
)
features = ct.fit_transform(features).tolist()

print("Sample feature vector after encoding: ", features[1])
print("No. of features after encoding: ", len(features[1]))

Features requiring encoding:  {'NEAR BAY', 'NEAR OCEAN', 'INLAND', 'ISLAND', '<1H OCEAN'}
Sample feature vector:  [-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 3]
No. of features:  9
Sample feature vector after encoding:  [0.0, 0.0, 0.0, 1.0, 0.0, -122.22, 37.86, 21, 7099, 1106.0, 2401, 1138, 8.3014]
No. of features after encoding:  13


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split as tts

# Tree-based models do not need feature scaling, so the encoded features are used as-is
# under fresh names (other model types may need a scaled copy instead).
X = features
y = label

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = tts(X, y, random_state=5, test_size=0.2)

# Random forest regression
depth = 9
no_of_estim = 10
model_random = RandomForestRegressor(
    max_depth=depth,
    n_estimators=no_of_estim,
    random_state=20,
)
model_random.fit(X_train, y_train.ravel())  # flatten the (n, 1) label column to 1-D

# Predict on the held-out rows, then report the score on both splits
y_pred = model_random.predict(X_test)
train_score = model_random.score(X_train, y_train)
test_score = r2_score(y_test, y_pred)
print("Model Score for Training data: {}".format(train_score))
print("Model Score for Testing data: {}".format(test_score))

Model Score for Training data: 0.8206121576672761
Model Score for Testing data: 0.7754096029715711


In [11]:
# Export the trained model as a pickle file.
# Run this after the training cell so the file reflects the current model_random.
import pickle

# The context manager closes (and flushes) the file even if pickling raises
with open('pickle_model.pickle', 'wb') as model_file:
    pickle.dump(model_random, model_file)

In [16]:
# Use the first held-out row and the model's prediction for it as a reference pair
test_input, test_output = np.array(X_test[0]), y_pred[0]

print("Test input: ", test_input)
print("Expected output: ", test_output)

# Persist the input row in NumPy's .npy format
np.save('input.npy', test_input)
print("Dumped input as np array in input.npy")


Test input:  [ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  1.0000e+00 -1.1707e+02
  3.2570e+01  1.4000e+01  1.5270e+03  3.5700e+02  1.2240e+03  3.6300e+02
  2.7361e+00]
Expected output:  133242.04011201978
Dumped input as np array in input.npy
