In [2]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [3]:
import pandas as pd
import numpy as np

TARGET_COLUMN = "median_house_value"

In [4]:
init_prices_df = pd.read_csv("housing.csv")

init_prices_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


## Prepare data

In [5]:
# Leave only required rows

filter = init_prices_df["ocean_proximity"].isin(["<1H OCEAN", "INLAND"])
prepared_init_prices_df = init_prices_df[filter]
prepared_init_prices_df.reset_index(inplace=True, drop=True)

prepared_init_prices_df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15682,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
15683,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
15684,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
15685,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [6]:
# Filling missed values with zeros

prepared_init_prices_df["total_bedrooms"].fillna(0, inplace=True)

prepared_init_prices_df.count()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prepared_init_prices_df["total_bedrooms"].fillna(0, inplace=True)


longitude             15687
latitude              15687
housing_median_age    15687
total_rooms           15687
total_bedrooms        15687
population            15687
households            15687
median_income         15687
median_house_value    15687
ocean_proximity       15687
dtype: int64

In [7]:
# Apply logariphm function to target cell to avoid tail distribution

prepared_init_prices_df[TARGET_COLUMN] = np.log1p(prepared_init_prices_df[TARGET_COLUMN])

prepared_init_prices_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prepared_init_prices_df[TARGET_COLUMN] = np.log1p(prepared_init_prices_df[TARGET_COLUMN])


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,12.973866,<1H OCEAN
1,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,12.287657,<1H OCEAN
2,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,12.419574,<1H OCEAN
3,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,12.554971,<1H OCEAN
4,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,12.287196,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...
15682,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,11.265758,INLAND
15683,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,11.252872,INLAND
15684,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,11.432810,INLAND
15685,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,11.346883,INLAND


In [8]:
from sklearn.feature_extraction import DictVectorizer
from copy import deepcopy

dv = DictVectorizer(sparse=True)

def get_features_and_target(object_to_split: "pd.Dataframe"):
    object_to_split = deepcopy(object_to_split)
    
    object_to_split.reset_index(inplace=True, drop=True)
    target = object_to_split[TARGET_COLUMN].values
    object_to_split.drop(TARGET_COLUMN, axis="columns", inplace=True)
    
    features_dict = object_to_split.to_dict(orient="records")
    features_dict = dv.fit_transform(features_dict)

    return features_dict, target
    

In [9]:
# Split dataframe
from sklearn.model_selection import train_test_split

train_df, rest_df = train_test_split(prepared_init_prices_df, test_size=0.6, random_state=1, shuffle=True)
validation_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=1, shuffle=True)

In [10]:
# Get features and targets

train_X, train_y = get_features_and_target(train_df)
validation_X, validation_y = get_features_and_target(validation_df)
test_X, test_y = get_features_and_target(test_df)

## Question1 - basic decision tree usage

In [11]:
from sklearn.tree import DecisionTreeRegressor, export_text

decision_tree_model = DecisionTreeRegressor(max_depth=1)
decision_tree_model.fit(train_X, train_y)

rules = export_text(decision_tree_model, feature_names=list(dv.get_feature_names_out()))

print(rules)


|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.29]



### Question 1 ANSWER - ocean_proximity

## Question2 - Train a basic random forest model

In [12]:
# Train model

from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

forest.fit(train_X, train_y)


In [13]:
from sklearn.metrics import mean_squared_error

# Evaluate model

predictions = forest.predict(validation_X)

score = mean_squared_error(predictions, validation_y)

score


0.06065335896305197

### Question 2 ANSWER - 0.06065335896305197

## Question 3 - Tune n_estimators

In [14]:
# calculate best n_estimator

n_estimators = range(10, 210, 10)
# n_estimators = [est for est in range(10, 210, 10)]

# scores = {}

# for n in n_estimators:
#     experiment_forest = RandomForestRegressor(n_estimators=n, random_state=1)
#     experiment_forest.fit(train_X, train_y)
#     predicted_y = experiment_forest.predict(validation_X)
#     scores[n] = mean_squared_error(predicted_y, validation_y)

# scores


### Question 3 - ANSWER - After 160

## Question 4 - calculate the best max_depth for each n_estimator

In [15]:
# calculate rmses

# max_depths = [10, 15, 20, 25]

# scores = {
#     "max_depth": [],
#     "number_of_estimators": [],
#     "mse": [],
# }

# for depth in max_depths:
#     for n in n_estimators:
#         print(f"{depth} - {n}")
        
#         experiment_forest = RandomForestRegressor(max_depth=depth, n_estimators=n, random_state=1)
#         experiment_forest.fit(train_X, train_y)
#         predicted_y = experiment_forest.predict(validation_X)
        
#         scores["max_depth"].append(depth)
#         scores["number_of_estimators"].append(n)
#         scores["mse"].append(mean_squared_error(predicted_y, validation_y))

# print("FINISH")

# scores
        

In [16]:
# scores_df = pd.DataFrame.from_dict(scores)
# scores_df.groupby("max_depth").mean()

### Question 4 ANSWER - 20 	105.0 	0.054687

## Question 5 - Get feature importance from tree based model

In [17]:
regressor = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1
)

regressor.fit(validation_X, validation_y)

for feature, score in zip(train_df.columns, regressor.feature_importances_):
    print(f"feature - {feature}, score - {score}")

feature - longitude, score - 0.018247962485030033
feature - latitude, score - 0.03674406551708488
feature - housing_median_age, score - 0.0858280474545381
feature - total_rooms, score - 0.08143999199402668
feature - total_bedrooms, score - 0.353855148431285
feature - population, score - 0.14363069314725915
feature - households, score - 0.21208114154097396
feature - median_income, score - 0.03110582273820377
feature - median_house_value, score - 0.01650429629115556
feature - ocean_proximity, score - 0.020562830400442874


### Question 5 ANSWER - total_bedrooms, score - 0.353855148431285

## Question 6 - Get feature importance from tree based model

In [34]:
import xgboost as xgb

features = dv.get_feature_names_out()
list(features)
xgtrain = xgb.DMatrix(train_X, label=train_y)
xgvalid = xgb.DMatrix(validation_X, label=validation_y)

In [35]:
watchlist = [(xgtrain, "train"), (xgvalid, "vaild")]

In [42]:
%%capture output

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

print("START")

model = xgb.train(xgb_params, xgtrain, num_boost_round=100, evals=watchlist)

print("FINISH")

In [44]:
model

<xgboost.core.Booster at 0x7fa8041e4250>

### Question 5 ANSWER - best RMSE for eta - 0.1