In [42]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
from sklearn.preprocessing import QuantileTransformer, PowerTransformer

from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.linear_model import LinearRegression, TweedieRegressor, LassoLars

from importlib import reload

import warnings
warnings.filterwarnings("ignore")

import wrangle as wr
sns.set(rc={'figure.facecolor':'fbf3e4','axes.facecolor':'fbf3e4'})
#sns.set(font_scale=1.5)
pd.options.display.float_format = '{:,.3f}'.format

In [23]:
from sklearn.metrics import explained_variance_score

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [25]:
from sklearn.feature_selection import SelectKBest, RFE, f_regression, SequentialFeatureSelector

Evaluation:
- as an absolute score - RMSE
- as a percentage - Adjasted R2

Linear Regressions

### Functions

In [26]:
def regression_errors(y_actual, y_predicted):
    '''
    this function accepts 
    y: actual results/array
    yhat: predictions/array
    k: feature size/integer
    calculates regression scores based on the baseline being median
    returns RMSE and adjacted R2
    '''
    # root mean squared error score
    RMSE = mean_squared_error(y_actual, y_predicted) ** .5
    # adjucted R^2 score
    ADJR2 = explained_variance_score(y_actual, y_predicted)
    return round(RMSE), round(ADJR2, 2)

In [27]:
def scale_zillow_quantile(train, validate, test):
    '''
    accepts train, validate, test data sets
    scales the data in each of them
    returns transformed data sets
    '''
    #count_columns = ['bedroomcnt', 'bathroomcnt']
    
    #col = train.columns[1:-1]
    col = ['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']
    
    # create scalers
    #min_max_scaler = MinMaxScaler()    
    qt = QuantileTransformer(output_distribution='normal')
    qt.fit(train[col])
    train[col] = qt.transform(train[col])
    validate[col] = qt.transform(validate[col])
    test[col] = qt.transform(test[col])
    
    return train, validate, test

In [28]:
def select_kbest(X, y, k):
    '''
    the function accepts the X_train data set, y_train array and k-number of features to select
    runs the SelectKBest algorithm and returns the list of features to be selected for the modeling
    !KBest doesn't depend on the model
    '''
    kbest = SelectKBest(f_regression, k=k)
    kbest.fit(X, y)
    return X.columns[kbest.get_support()].tolist()

In [29]:
def standard_scale_zillow(train, validate, test):
    '''
    accepts train, validate, test data sets
    scales the data in each of them
    returns transformed data sets
    '''

    col = ['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']
    
    # create scalers
    scaler = StandardScaler()    
    #qt = QuantileTransformer(output_distribution='normal')
    scaler.fit(train[col])
    train[col] = scaler.transform(train[col])
    validate[col] = scaler.transform(validate[col])
    test[col] = scaler.transform(test[col])
    
    return train, validate, test

In [30]:
def run_model(X_train, X_validate, scaling):
    
    '''
    general function to run models with X_train and X_validate that were scaled
    '''

    for f in features:
        for key in models:
            # create a model
            model = models[key]
            # fit the model
            model.fit(X_train[features[f]], y_train)
            # predictions of the train set
            y_hat_train = model.predict(X_train[features[f]])
            # predictions of the validate set
            y_hat_validate = model.predict(X_validate[features[f]])
            # add train set predictions to the data frame
            predictions_train[key] = y_hat_train
            # add validate set predictions to the data frame
            predictions_validate[key] = y_hat_validate

            # calculate scores train set
            RMSE, R2 = regression_errors(y_train, y_hat_train)
            # calculate scores validation set
            RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
            diff = np.abs(RMSE - RMSE_val)
            
            # add the score results to the scores Data Frame
            scores.loc[len(scores.index)] = [key, f, scaling, RMSE, R2, RMSE_val, R2_val, diff]

In [31]:
def run_model_standard():
    # runs regression models on the X_train scaled with StandardScaler()
    X1, X2, _ = wr.standard_scale_zillow(X_train, X_validate, X_test)
    run_model(X1, X2, 'standard')

In [32]:
def run_model_quantile():
    XQ1, XQ2, _ = wr.scale_zillow_quantile(X_train, X_validate, X_test)
    run_model(XQ1, XQ2, 'quantile')

In [83]:
def run_rfe():
    '''
    The function accepts the X_train data set, y_train array and k-number of features to select
    runs the RFE algorithm and returns the list of features to be selected for the modeling
    !RFE depends on the model.
    This function uses Linear regression
    '''
    # scale the data
    X1, X2, _ = wr.standard_scale_zillow(X_train, X_validate, X_test)
    
    for key in models:
        # create a model
        model = models[key]
        
        # create a RFE feature selector
        rfe = RFE(model, n_features_to_select=4)
        rfe.fit(X1, y_train)
        
        # get the optimal features for every particular model
        f = X1.columns[rfe.get_support()].tolist()
        
        # fit the model with RFE features
        model.fit(X1[f], y_train)
        # predictions of the train set
        y_hat_train = model.predict(X1[f])
        # predictions of the validate set
        y_hat_validate = model.predict(X2[f])
        # add train set predictions to the data frame
        col_name = str(key)+'_rfe'
        predictions_train[col_name] = y_hat_train
        # add validate set predictions to the data frame
        predictions_validate[col_name] = y_hat_validate

        # calculate scores train set
        RMSE, R2 = regression_errors(y_train, y_hat_train)
        # calculate scores validation set
        RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
        diff = np.abs(RMSE - RMSE_val)

        # add the score results to the scores Data Frame
        scores.loc[len(scores.index)] = [key, 'rfe', 'standard', RMSE, R2, RMSE_val, R2_val, diff]

In [84]:
def run_polynomial():
    # scale the data
    X1, X2, _ = wr.standard_scale_zillow(X_train, X_validate, X_test)
    
    # only bedroom / bathroom polynomial
    for f in features:
        # features[f] gives an access to the list of features in the dictionary
        
        # create a Polynomial feature transformer
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        poly.fit(X1[features[f]])
        # create a df with transformed features of the train set
        X1_poly = pd.DataFrame(
            poly.transform(X1[features[f]]),
            columns=poly.get_feature_names(X1[features[f]].columns),
            index=X1.index)
        X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)
        # create a df with transformed features for the validate set
        X2_poly = pd.DataFrame(
            poly.transform(X2[features[f]]),
            columns=poly.get_feature_names(X2[features[f]].columns),
            index=X2.index)
        X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)
        
        feature_name = str(f)+'_poly'
        
        for key in models:
            # create a model
            model = models[key]
            # fit the model
            model.fit(X1_poly, y_train)
            # predictions of the train set
            y_hat_train = model.predict(X1_poly)
            # predictions of the validate set
            y_hat_validate = model.predict(X2_poly)
            # add train set predictions to the data frame
            predictions_train[key] = y_hat_train
            # add validate set predictions to the data frame
            predictions_validate[key] = y_hat_validate

            # calculate scores train set
            RMSE, R2 = regression_errors(y_train, y_hat_train)
            # calculate scores validation set
            RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
            diff = np.abs(RMSE - RMSE_val)
            
            # add the score results to the scores Data Frame
            scores.loc[len(scores.index)] = [key, feature_name, 'standard', RMSE, R2, RMSE_val, R2_val, diff]

In [77]:
def run_single():
    for f in single_corr:
        model = LinearRegression()
        model.fit(X1[[f]], y_train)
        # predictions of the train set
        y_hat_train = model.predict(X1[[f]])
        # predictions of the validate set
        y_hat_validate = model.predict(X2[[f]])
        # add train set predictions to the data frame
        predictions_train[key] = y_hat_train
        # add validate set predictions to the data frame
        predictions_validate[key] = y_hat_validate

        # calculate scores train set
        RMSE, R2 = regression_errors(y_train, y_hat_train)
        # calculate scores validation set
        RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
        diff = np.abs(RMSE - RMSE_val)

        # add the score results to the scores Data Frame
        scores.loc[len(scores.index)] = ['Single Linear Regression', f, 'standard', RMSE, R2, RMSE_val, R2_val, diff]

In [102]:
reload(wr)

<module 'wrangle' from '/Users/nadina/codeup-data-science/zillow-project/wrangle.py'>

In [34]:
# get the zillow data
df = wr.get_zillow()

wr.dummies(df)
X_train, X_validate, X_test, y_train, y_validate, y_test = wr.full_split_zillow(df)

In [35]:

# create 3 data sets that keep the values of the counties
la = df[(df.Orange == 0) & (df.Ventura == 0)] # LA county
ventura = df[df.Ventura == 1] # Ventura county
orange = df[df.Orange == 1] # Orange county

In [36]:
baseline = y_train.median()
baseline

362435.0

In [37]:
predictions_train = pd.DataFrame(y_train)
predictions_validate = pd.DataFrame(y_validate)
predictions_train['baseline'] = baseline
predictions_validate['baseline'] = baseline

In [38]:
X1, X2, X3 = wr.standard_scale_zillow(X_train, X_validate, X_test)

In [39]:
XQ1, XQ2, XQ3 = scale_zillow_quantile(X_train, X_validate, X_test)

In [40]:
seed = 2912

In [80]:
models = {
    'Linear Regression': LinearRegression(),
    'Generalized Linear Model': TweedieRegressor(power=2, alpha = 0.5),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=seed),
    'Decision Tree Regression': DecisionTreeRegressor(max_depth=4, random_state=seed),
    'Random Forest Regression':RandomForestRegressor(max_depth=4, random_state=seed),
    'LassoLars Regression':LassoLars(alpha=0.1)
    }

In [81]:
for key in models:
    print(models[key])

LinearRegression()
TweedieRegressor(alpha=0.5, power=2)
GradientBoostingRegressor(random_state=2912)
DecisionTreeRegressor(max_depth=4, random_state=2912)
RandomForestRegressor(max_depth=4, random_state=2912)
LassoLars(alpha=0.1)


In [87]:
scores = pd.DataFrame(columns=['model_name', 'features', 'scaling',
                               'RMSE_train', 'R2_train', 'RMSE_validate', 'R2_validate', 'RMSE_difference'])

In [46]:
# select features with K-Best algorithm 
select_kbest(X_train, y_train, 3)

['bedrooms', 'bathrooms', 'sq_feet']

In [47]:
f1 = ['bedrooms', 'bathrooms', 'sq_feet']
f2 = ['bedrooms', 'bathrooms']
f3 = ['bedrooms','bathrooms','sq_feet', 'pools']
f4 = ['bathrooms','sq_feet', 'pools']
f5 = ['bedrooms','bathrooms','sq_feet','house_age','pools','Orange','Ventura']
f6 = select_kbest(X_train, y_train, 4)
f7 = X_train.columns.tolist()

# create a dictionary with features
features = {
    'f1':f1,
    'f2':f2,
    'f3':f3,
    'f4':f4,
    'f5':f5,
    'f6':f6,
    'f7':f7
}

In [48]:
run_model_standard()

In [49]:
scores.sort_values(by='RMSE_train')

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,269976,0.41,269407,0.4,569
26,Gradient Boosting Regression,f5,standard,271862,0.4,270708,0.39,1154
32,Gradient Boosting Regression,f6,standard,277936,0.38,276324,0.37,1612
14,Gradient Boosting Regression,f3,standard,281581,0.36,280589,0.35,992
2,Gradient Boosting Regression,f1,standard,282210,0.36,281158,0.35,1052
40,Random Forest Regression,f7,standard,282946,0.35,279782,0.35,3164
28,Random Forest Regression,f5,standard,284177,0.35,281244,0.35,2933
34,Random Forest Regression,f6,standard,284685,0.35,281324,0.35,3361
20,Gradient Boosting Regression,f4,standard,285209,0.34,283665,0.33,1544
39,Decision Tree Regression,f7,standard,285674,0.34,282026,0.34,3648


In [50]:
run_model_quantile()

In [52]:
scores.sort_values(by='RMSE_train').head(20)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Boosting Regression,f7,standard,269976,0.41,269407,0.4,569
80,Gradient Boosting Regression,f7,quantile,269976,0.41,269407,0.4,569
26,Gradient Boosting Regression,f5,standard,271862,0.4,270708,0.39,1154
68,Gradient Boosting Regression,f5,quantile,271862,0.4,270709,0.39,1153
32,Gradient Boosting Regression,f6,standard,277936,0.38,276324,0.37,1612
74,Gradient Boosting Regression,f6,quantile,277936,0.38,276321,0.37,1615
14,Gradient Boosting Regression,f3,standard,281581,0.36,280589,0.35,992
56,Gradient Boosting Regression,f3,quantile,281581,0.36,280589,0.35,992
44,Gradient Boosting Regression,f1,quantile,282210,0.36,281158,0.35,1052
2,Gradient Boosting Regression,f1,standard,282210,0.36,281158,0.35,1052


In [89]:
X_train.columns.tolist()[:-3]

['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']

In [53]:
rfe()

In [54]:
scores[scores.features == 'rfe']

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
84,Linear Regression,rfe,standard,297363,0.29,292245,0.29,5118
85,Generalized Linear Model,rfe,standard,300300,0.27,294633,0.28,5667
86,Gradient Boosting Regression,rfe,standard,276467,0.38,274858,0.38,1609
87,Decision Tree Regression,rfe,standard,285684,0.34,281886,0.34,3798
88,Random Forest Regression,rfe,standard,283188,0.35,279839,0.35,3349
89,LassoLars Regression,rfe,standard,297363,0.29,292244,0.29,5119


In [139]:
polynomial()

In [144]:
scores.sort_values(by='RMSE_train')

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
38,Gradient Booster,poly,standard,267275,0.42,268978,0.4,1703
26,Gradient Booster,poly,standard,268026,0.42,269271,0.4,1245
32,Gradient Booster,poly,standard,269318,0.42,270113,0.4,795
14,Gradient Booster,poly,standard,269612,0.41,270309,0.4,697
2,Gradient Booster,poly,standard,269874,0.41,270087,0.4,213
8,Gradient Booster,poly,standard,270423,0.41,270071,0.4,352
20,Gradient Booster,poly,standard,271260,0.41,270856,0.39,404
36,Linear Regression,poly,standard,277834,0.38,273213,0.38,4621
41,LassoLars,poly,standard,278870,0.37,273891,0.38,4979
24,Linear Regression,poly,standard,278981,0.37,274127,0.38,4854


In [57]:
scores[scores.model_name == 'Generalized Linear Model']

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
1,Generalized Linear Model,f1,standard,301060,0.27,295396,0.28,5664
7,Generalized Linear Model,f2,standard,329812,0.12,324689,0.13,5123
13,Generalized Linear Model,f3,standard,300335,0.27,294626,0.28,5709
19,Generalized Linear Model,f4,standard,301021,0.27,295307,0.28,5714
25,Generalized Linear Model,f5,standard,299538,0.28,293900,0.29,5638
31,Generalized Linear Model,f6,standard,301062,0.27,295396,0.28,5666
37,Generalized Linear Model,f7,standard,299409,0.28,293760,0.29,5649
43,Generalized Linear Model,f1,quantile,301360,0.27,295695,0.28,5665
49,Generalized Linear Model,f2,quantile,327920,0.13,322781,0.14,5139
55,Generalized Linear Model,f3,quantile,300634,0.27,294930,0.28,5704


In [64]:
mean_squared_error(y_train, yhat_train) ** .5, explained_variance_score(y_train, yhat_train)

(283143.73749297264, 0.35151216724266465)

In [69]:
regression_errors(y_train, yhat_train)

(283144, 0.35)

In [57]:
regression_errors(y_train, predictions_train.baseline, 3)

(351606, 0.05)

In [67]:
single_corr = X1.iloc[:, :-3].columns.tolist()

In [68]:
single_corr

['bedrooms', 'bathrooms', 'sq_feet', 'lot_sqft', 'house_age']

In [78]:
run_single()

In [79]:
scores[scores.model_name == 'Single Linear Regression']

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
90,Single Linear Regression,bedrooms,standard,342329,0.06,336890,0.06,5439
91,Single Linear Regression,bathrooms,standard,331662,0.11,327084,0.12,4578
92,Single Linear Regression,sq_feet,standard,301680,0.27,296552,0.27,5128
93,Single Linear Regression,lot_sqft,standard,348182,0.02,343313,0.03,4869
94,Single Linear Regression,house_age,standard,342565,0.05,337964,0.06,4601


In [85]:
def run_all_models():
    run_model_standard()
    run_model_quantile()
    run_rfe()
    run_polynomial()
    run_single()

In [88]:
scores

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference


In [89]:
run_all_models()

In [90]:
scores.shape

(137, 8)

In [215]:
def select_best_model(scores):
    # select top 20 models based on the RMSE score of the train set
    top_20 = scores.sort_values(by='RMSE_train').head(20)
    # select top 5 models based on the RMSE score of the validate set
    top_5 = top_20.sort_values(by=['RMSE_validate']).head(5)
    # display top 5 models
    display(top_5)
    # select the best model with the smallest difference in the RMSE scores
    best_model = top_5.sort_values(by='RMSE_difference').head(1)
    return best_model

In [217]:
best = select_best_model(scores)

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
128,Gradient Boosting Regression,f7_poly,standard,267275,0.42,268978,0.4,1703
116,Gradient Boosting Regression,f5_poly,standard,268026,0.42,269271,0.4,1245
80,Gradient Boosting Regression,f7,quantile,269976,0.41,269407,0.4,569
38,Gradient Boosting Regression,f7,standard,269976,0.41,269407,0.4,569
98,Gradient Boosting Regression,f2_poly,standard,270423,0.41,270071,0.4,352


In [218]:
best

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
98,Gradient Boosting Regression,f2_poly,standard,270423,0.41,270071,0.4,352


In [219]:
top_20 = scores.sort_values(by='RMSE_train').head(20)

In [213]:
top_5 = top_20.sort_values(by=['RMSE_validate']).head(5)

In [214]:
top_5

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
128,Gradient Boosting Regression,f7_poly,standard,267275,0.42,268978,0.4,1703
116,Gradient Boosting Regression,f5_poly,standard,268026,0.42,269271,0.4,1245
80,Gradient Boosting Regression,f7,quantile,269976,0.41,269407,0.4,569
38,Gradient Boosting Regression,f7,standard,269976,0.41,269407,0.4,569
98,Gradient Boosting Regression,f2_poly,standard,270423,0.41,270071,0.4,352


In [151]:
best_model = top_5.sort_values(by='RMSE_difference').head(1)

In [152]:
best_model

Unnamed: 0,model_name,features,scaling,RMSE_train,R2_train,RMSE_validate,R2_validate,RMSE_difference
98,Gradient Boosting Regression,f2_poly,standard,270423,0.41,270071,0.4,352


In [145]:
str(f2)

"['bedrooms', 'bathrooms']"

In [221]:
def run_best_model():
    '''
    the function runs the best model on the train, test and validate data sets 
    and returns scores in the data frame
    '''
    # create a data frame for test set results
    predictions_test = pd.DataFrame(y_test)
    predictions_test['baseline'] = baseline

    f = f2
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly.fit(X1[f])

    # create a df with transformed features of the train set
    X1_poly = pd.DataFrame(
                poly.transform(X1[f]),
                columns=poly.get_feature_names(X1[f].columns),
                index=X1.index)
    X1_poly = pd.concat([X1_poly, X1.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create a df with transformed features for the validate set
    X2_poly = pd.DataFrame(
                poly.transform(X2[f]),
                columns=poly.get_feature_names(X2[f].columns),
                index=X2.index)
    X2_poly = pd.concat([X2_poly, X2.iloc[:, 2:]], axis=1)

    # create. df with transformed features for the test set
    X3_poly = pd.DataFrame(
                poly.transform(X3[f]),
                columns=poly.get_feature_names(X3[f].columns),
                index=X3.index)
    X3_poly = pd.concat([X3_poly, X3.iloc[:, 2:]], axis=1)

    # create a Gradient Boosting Regression model
    model = GradientBoostingRegressor()
    # fit the model
    model.fit(X1_poly, y_train)
    # predictions of the train set
    y_hat_train = model.predict(X1_poly)
    # predictions of the validate set
    y_hat_validate = model.predict(X2_poly)
    # add train set predictions to the data frame
    y_hat_test = model.predict(X3_poly)
    predictions_test['predictions'] = y_hat_test

    # calculate scores train set
    RMSE_train, R2_train = regression_errors(y_train, y_hat_train)
    # calculate scores validation set
    RMSE_val, R2_val = regression_errors(y_validate, y_hat_validate)
    # calculate scores test set
    RMSE_test, R2_test = regression_errors(y_test, y_hat_test)
    RMSE_bl, _ = regression_errors(y_test, predictions_test.baseline)
    
    # save final score into a dictionary
    res = {
        'Features': str(f),
        'RMSE Train Set': RMSE_train,
        'RMSE Validation Set':RMSE_val,
        'RMSE Test Set':RMSE_test,
        'R2 Train Set':R2_train,
        'R2 Validation Set':R2_val,
        'R2 Test':R2_test,
        'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
    }

    # add the score results to the scores Data Frame
    final_test = pd.DataFrame({'Gradient Bosting Regression': list(res.keys()), 'Scores': list(res.values())})

    return final_test

In [222]:
run_best_model()

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['bedrooms', 'bathrooms']"
1,RMSE Train Set,270423
2,RMSE Validation Set,270075
3,RMSE Test Set,273441
4,R2 Train Set,0.410
5,R2 Validation Set,0.400
6,R2 Test,0.380
7,Beats a basline by:,23.5%


In [164]:
RMSE_test

273443

In [168]:
RMSE_bl

357599

In [170]:
RMSE_bl

0.7646637714311282

In [None]:
R2_test

In [166]:
RMSE_val - RMSE_test

-3366

In [169]:
(RMSE_bl - RMSE_test)

84156

In [173]:
round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)

23.5

In [153]:
RMSE_bl/RMSE_test

1.3077738890656485

In [209]:
# add the score results to the scores Data Frame
final_test = pd.DataFrame({'Gradient Bosting Regression': list(res.keys()), 'Scores': list(res.values())})

In [210]:
final_test

Unnamed: 0,Gradient Bosting Regression,Scores
0,Features,"['bedrooms', 'bathrooms']"
1,RMSE Train Set,270423
2,RMSE Validation Set,270077
3,RMSE Test Set,273443
4,R2 Train Set,0.410
5,R2 Validation Set,0.400
6,R2 Test,0.380
7,Beats a basline by:,23.5%


In [208]:
res = {
    'Features': str(f),
    'RMSE Train Set': RMSE_train,
    'RMSE Validation Set':RMSE_val,
    'RMSE Test Set':RMSE_test,
    'R2 Train Set':R2_train,
    'R2 Validation Set':R2_val,
    'R2 Test':R2_test,
    'Beats a basline by:':str(f'{round((RMSE_bl - RMSE_test) / RMSE_bl * 100, 1)}%')
}

In [111]:
predictions_test

Unnamed: 0,home_value,baseline
43192,1292346,362435.000
715,677817,362435.000
35134,63565,362435.000
25346,1522890,362435.000
33799,319803,362435.000
...,...,...
5434,556697,362435.000
43165,348165,362435.000
7890,912304,362435.000
23816,262387,362435.000
