In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text

%matplotlib inline


In [None]:
# Location of the raw housing CSV in the alexeygrigorev/datasets GitHub repository
data_url = (
    "https://raw.githubusercontent.com/"
    "alexeygrigorev/datasets/master/housing.csv"
)


In [None]:
# One-time download: uncomment the next line to fetch housing.csv into the working directory
# !wget $data_url

In [None]:
# Load the housing data. Use the local copy when it exists (e.g. downloaded
# earlier with wget); otherwise read straight from data_url so that a fresh
# "Restart & Run All" does not stop with FileNotFoundError.
local_csv = 'housing.csv'
df = pd.read_csv(local_csv if os.path.exists(local_csv) else data_url)
df.head()

In [None]:
# Keep only the rows whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
# Series.isin performs the "is one of these values" test in a single call,
# replacing two element-wise == masks combined with the bitwise | operator
# (pandas masks need | / & rather than Python's `or` / `and`).
df_subset = df.loc[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]

In [None]:
# Missing values per column, counted on the full (unfiltered) frame
df.isna().sum()

In [None]:
# Replace every missing value in the filtered frame with 0
df_subset_filled = df_subset.fillna(value=0)

In [None]:
# Two-stage split with a fixed seed: first hold out 20% of the rows as the test
# set, then take 25% of the remaining rows as the validation set.
df_full_train, df_test = train_test_split(
    df_subset_filled,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Give each split a fresh 0..n-1 index, discarding the row labels carried over
# from the original frame.
df_train, df_val, df_test = (
    frame.reset_index(drop=True) for frame in (df_train, df_val, df_test)
)
df_train.head()

In [None]:
# Raw (untransformed) target column of each split
y_train_orig = df_train['median_house_value']
y_val_orig = df_val['median_house_value']
y_test_orig = df_test['median_house_value']


In [None]:
# Log-transform the target (median_house_value) of every split with
# np.log1p, i.e. log(1 + y).
y_train, y_val, y_test = (
    np.log1p(target) for target in (y_train_orig, y_val_orig, y_test_orig)
)

In [None]:
# Remove the target column from each feature frame so it is not fed to the model
df_train = df_train.drop(columns='median_house_value')
df_val = df_val.drop(columns='median_house_value')
df_test = df_test.drop(columns='median_house_value')

In [None]:
# Preview the first rows of the training features
df_train.head()

In [None]:
# Convert every training row into a {column: value} dict, the record format
# that DictVectorizer takes as input.
from sklearn.feature_extraction import DictVectorizer

train_dicts = df_train.to_dict('records')


In [None]:
# Fit the vectorizer on the training records and encode them in one step.
# sparse=True requests a sparse matrix as the encoded output.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [None]:
# Column names of the encoded matrix, as a plain Python list
feature_names = dv.get_feature_names_out().tolist()
feature_names

In [None]:
# Encode the validation rows with the vectorizer already fitted on the
# training data (transform only, no refit).
val_dicts = df_val.to_dict('records')
X_val = dv.transform(val_dicts)

In [None]:
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Depth-1 regression tree (a single split) fitted on the log-transformed target;
# fit() returns the estimator itself, so construction and fitting are chained.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dt

In [None]:
from sklearn import tree
from sklearn.tree import export_text

In [None]:
# Text rendering of the fitted tree's rules.
# Observation: ocean proximity is the feature used for splitting the data.
print(tree.export_text(dt, feature_names=feature_names))

In [None]:
# Draw the fitted tree, labelling the split with the DictVectorizer feature
# names instead of anonymous x[i] indices. The trailing ';' suppresses the
# text repr of the annotation objects that plot_tree returns.
tree.plot_tree(dt, feature_names=feature_names);

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest: 10 trees, fixed seed, single worker.
# fit() returns the estimator, so it is chained and then displayed.
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=1).fit(X_train, y_train)
rf

In [None]:
# Predictions of the fitted forest on the validation features
y_pred = rf.predict(X_val)

In [None]:
# calculate RMSE score
from sklearn.metrics import mean_squared_error


In [None]:
# Validation RMSE (the targets are log1p-transformed, so this is an error in
# log space). The square root is taken explicitly because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and is
# rejected by releases that removed it.
np.sqrt(mean_squared_error(y_val, y_pred))

In [None]:
# Sweep the number of trees from 10 to 200 in steps of 10, refitting a forest
# for each value and recording its validation RMSE (rounded to 3 decimals).
scores = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): that keyword
    # was deprecated in scikit-learn 1.4 and fails on releases that removed it.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, round(rmse, 3)))
    

In [None]:
# Tabulate the (n_estimators, rmse) pairs collected during the sweep
df_scores = pd.DataFrame.from_records(scores, columns=['n_estimators', 'rmse'])

In [None]:
# Show validation RMSE for every forest size tried
df_scores

In [None]:
# 