In [108]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn import tree

In [90]:
df = pd.read_csv("housing.csv")

In [91]:
df= df[(df["ocean_proximity"]=='<1H OCEAN') | (df["ocean_proximity"]=='INLAND') ]

In [92]:
df  = df.fillna(0)

In [93]:
df["median_house_value"] = np.log1p(df["median_house_value"])

In [94]:
df_fulltrain, df_test = train_test_split(df, test_size=.2, random_state=1)
df_train, df_val = train_test_split(df_fulltrain, test_size=.25, random_state=1)

In [95]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [96]:
y_train = df_train["median_house_value"].values
y_val = df_val["median_house_value"].values
y_test = df_test["median_house_value"].values

In [97]:
del df_train["median_house_value"]
del df_val["median_house_value"]
del df_test["median_house_value"]

In [98]:
train_dicts = df_train.to_dict(orient='records')

In [99]:
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [103]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [114]:
print(tree.export_text(dt, feature_names=dv.get_feature_names_out()))

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



In [121]:
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [106]:
#y_pred = dt.predict_proba(X_val)[:, 1]

AttributeError: 'DecisionTreeRegressor' object has no attribute 'predict_proba'

In [None]:
#y_pred

In [None]:
#roc_auc_score(y_val, y_pred)

In [122]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [130]:
rf = RandomForestRegressor(n_estimators=10, random_state=1)
rf.fit(X_train, y_train)

In [147]:
y_pred = rf.predict(X_val)

In [148]:
y_pred

array([12.23749287, 12.04892266, 11.62622693, ..., 11.74127738,
       11.15238041, 11.69322207])

In [149]:
mean_squared_error(y_val, y_pred, squared=False)

0.23365273265963643

In [138]:
est = np.arange(10,210, 10)

In [153]:
res = [] 

for max_depth in [10, 15, 20, 25]:
    for e in est:
        rmse_list = []
        rf = RandomForestRegressor(n_estimators=e, random_state=1, n_jobs=-1, max_depth=max_depth)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse_list.append(mean_squared_error(y_val, y_pred, squared=False))
    res.append((max_depth, np.average(rmse_list)))
    
    
    

In [154]:
res

[(10, 0.2447304368581782),
 (15, 0.23457174272618955),
 (20, 0.23395591092211357),
 (25, 0.23376420484314395)]

In [156]:
# hw 6 question 5
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1, max_depth=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

0.24485321712663938

In [169]:
#rf.feature_importances_
rf.feature_importances_

array([0.01502087, 0.03030503, 0.10195626, 0.08627638, 0.3355917 ,
       0.29247297, 0.07380644, 0.02715851, 0.01589402, 0.02151783])

In [170]:
d = {}
for c, f in zip(df_train.columns, rf.feature_importances_):
    d[c]=f

In [171]:
d

{'longitude': 0.015020868392390856,
 'latitude': 0.030305029123174957,
 'housing_median_age': 0.10195626000240125,
 'total_rooms': 0.08627637578575545,
 'total_bedrooms': 0.33559170042109715,
 'population': 0.29247296545343104,
 'households': 0.07380644294558006,
 'median_income': 0.027158512272116596,
 'ocean_proximity': 0.015894018592137876}

In [172]:
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/6d/d1/3e954de1d492129710e8625349a7b86eb287a4f413c5b5c15522f89a6c04/xgboost-2.0.0-py3-none-macosx_12_0_arm64.whl.metadata
  Downloading xgboost-2.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.0-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.0.0


In [204]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

In [205]:
import xgboost as xgb

In [206]:
features = dv.get_feature_names_out()


In [207]:
l_features = list(features)

In [208]:
#regex = re.compile(r"\[|\]|<", re.IGNORECASE)
l_features = [x.replace("<", "LT") for x in l_features]

In [209]:
watchlist = [(dtrain, "train"), (dval, "val")]

In [210]:
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=l_features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=l_features)

In [211]:
xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

[0]	train-rmse:0.52449	val-rmse:0.52045
[1]	train-rmse:0.48736	val-rmse:0.48443
[2]	train-rmse:0.45433	val-rmse:0.45293
[3]	train-rmse:0.42533	val-rmse:0.42550
[4]	train-rmse:0.39987	val-rmse:0.40144
[5]	train-rmse:0.37822	val-rmse:0.38151
[6]	train-rmse:0.35887	val-rmse:0.36393
[7]	train-rmse:0.34177	val-rmse:0.34838
[8]	train-rmse:0.32701	val-rmse:0.33492
[9]	train-rmse:0.31411	val-rmse:0.32333
[10]	train-rmse:0.30326	val-rmse:0.31427
[11]	train-rmse:0.29355	val-rmse:0.30615
[12]	train-rmse:0.28519	val-rmse:0.29922
[13]	train-rmse:0.27760	val-rmse:0.29269
[14]	train-rmse:0.27116	val-rmse:0.28796
[15]	train-rmse:0.26538	val-rmse:0.28380
[16]	train-rmse:0.26012	val-rmse:0.27970
[17]	train-rmse:0.25583	val-rmse:0.27661
[18]	train-rmse:0.25203	val-rmse:0.27388
[19]	train-rmse:0.24818	val-rmse:0.27123
[20]	train-rmse:0.24512	val-rmse:0.26882
[21]	train-rmse:0.24149	val-rmse:0.26669
[22]	train-rmse:0.23895	val-rmse:0.26478
[23]	train-rmse:0.23594	val-rmse:0.26331
[24]	train-rmse:0.23305	va

<xgboost.core.Booster at 0x2d91f4d50>

In [None]:
# .3 ETA = val-rmse:0.22862

In [None]:
# .1 ETA = val-rmse:0.23209