In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Import Dataset
dataset = pd.read_csv("housing.csv")
dataset.head() # Print first 5 observations from dataset using head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280,565,259,3.8462,NEAR BAY,342200


In [3]:
# Check in which column contains nan values
dataset.isnull().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms         True
population            False
households            False
median_income         False
ocean_proximity       False
median_house_value    False
dtype: bool

In [4]:
# Separate features and labels
features = dataset.iloc[:,:-1].values
label = dataset.iloc[:,-1].values.reshape(-1,1)
print(features[1])

[-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 'NEAR BAY']


In [5]:
# Perform Imputation with strategy=mean
from sklearn.preprocessing import Imputer
imputerNaN = Imputer(missing_values="NaN",strategy="mean",axis=0)
features[:,[4]] = imputerNaN.fit_transform(features[:,[4]])
print(features[1])
print(len(features[1]))

[-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 'NEAR BAY']
9




In [6]:
# Perform Label Encoding and Onehot Encding on categorical values present in the features
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
print(features[1])
print(set(features[:,8]))
features[:,8] = LabelEncoder().fit_transform(features[:,8])
print(features[1])
print(len(features[1]))
features = OneHotEncoder(categorical_features=[8]).fit_transform(features).toarray()
print(features[1])
print(len(features[1]))

[-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 'NEAR BAY']
{'NEAR BAY', 'INLAND', '<1H OCEAN', 'ISLAND', 'NEAR OCEAN'}
[-122.22 37.86 21 7099 1106.0 2401 1138 8.3014 3]
9
[ 0.0000e+00  0.0000e+00  0.0000e+00  1.0000e+00  0.0000e+00 -1.2222e+02
  3.7860e+01  2.1000e+01  7.0990e+03  1.1060e+03  2.4010e+03  1.1380e+03
  8.3014e+00]
13


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
X,y=features,label # Purpose of this copying variables is that trees doesn't requires scaling while others "may be"
# Split into training set and testing set in every model building cause of "random_state" present in the "train_test_split"
from sklearn.model_selection import train_test_split as tts
X_train,X_test,y_train,y_test = tts(X,y,test_size=0.2,random_state=5)

In [8]:
# Random Forest Tree Regression
from sklearn.ensemble import RandomForestRegressor
depth = 9
no_of_estim = 10
model_random = RandomForestRegressor(n_estimators=no_of_estim, random_state=20, max_depth=depth)
model_random.fit(X_train,y_train.ravel())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=9,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=20, verbose=0,
                      warm_start=False)

In [9]:
# Perform prediction and model score
y_pred = model_random.predict(X_test)
from sklearn.metrics import r2_score
print("Model Score for Training data: {}".format(model_random.score(X_train,y_train)))
print("Model Score for Testing data: {}".format(r2_score(y_test,y_pred)))
#print("Root Mean Squared Error is {}".format(np.sqrt(mean_squared_error(y_test,y_pred))))

Model Score for Training data: 0.8206121576672762
Model Score for Testing data: 0.7754096029715711


In [13]:
# Export pickle model
import pickle
pickle.dump(model_random, open('pickle_model.pickle', 'wb'))

In [14]:
# Reload pickle model
model_loaded = pickle.load(open('pickle_model.pickle', 'rb'))

depth = model_loaded.max_depth
no_of_estim = model_loaded.n_estimators

In [15]:
from sklearn.tree import export_graphviz
from subprocess import call
fl = open("decision_tree_stat.txt", 'w')
fl.write(str(no_of_estim) + '\n')
fl.write(str(depth+1))
fl.close()
for i in range(no_of_estim):
    estimator = model_loaded.estimators_[i]
    # Export as dot file
    export_graphviz(estimator, out_file='tree.dot',
                    rounded = True, proportion = False, 
                    precision = 2, filled = True)
    filename = "tree" + str(i) + ".txt"
    print(filename)
    call(['dot', '-Tplain', 'tree.dot', '-o', filename])

tree0.txt
tree1.txt
tree2.txt
tree3.txt
tree4.txt
tree5.txt
tree6.txt
tree7.txt
tree8.txt
tree9.txt


In [54]:
import math
query = open("decision_tree_query.txt", 'w')
scaled_x_test = []
print(X_test[0])
for i in range(len(X_test[0])):
    query.write(str(math.floor(X_test[0][i]*1000)) + '\n')
query.close()

ans = open("decision_tree_sklearn_ans.txt", 'w')
ans.write(str(y_test[0]) + '\n')
ans.write(str(y_pred[0]) + '\n')
ans.close()

[ 0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00  1.0000e+00 -1.1707e+02
  3.2570e+01  1.4000e+01  1.5270e+03  3.5700e+02  1.2240e+03  3.6300e+02
  2.7361e+00]


In [55]:
# Convert to text using system command (requires Graphviz)
#from subprocess import call
#call(['dot', '-Tplain', 'tree.dot', '-o', 'tree.txt'])
print(y_pred)

[133242.04011202 188795.91438441  96736.72755365 ... 280260.40607462
 224256.7400748   87405.09663664]
