In [4]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
training_df = pd.read_csv("data/train.csv")

In [19]:
def evaluate_model(model, test_data, test_labels):
    y_true = test_labels
    y_pred = model.predict(test_data)
    mse = mean_squared_error(y_true, y_pred)
    return {
        "model": str(model),
        "rmse": round(math.sqrt(mse), 0),
        "mean_squared_error": round(mse, 0),
        "r2_score": 100000*r2_score(y_true, y_pred) // 100 / 10,
        "feature_importances": model.feature_importances_ if "feature_importances_" in dir(model) else None
    }

In [35]:
y_train = training_df["SalePrice"]

In [22]:
def train_and_evaluate_model(model, X_train):
    #model.fit(X_train, y_train)
    #print(evaluate_model(model, X_train, y_train))
    print(model)
    scores = cross_validate(model, X_train, y_train, return_train_score=True, scoring=['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error'])
    for key, values in scores.items():
        if key not in ['fit_time', 'score_time']:
            values = [round(v, 2) for v in values]
        print(key, values)

In [26]:
def train_and_evaluate_models(models, X_train):
    for model in models:
        train_and_evaluate_model(model, X_train)
        print("\n")

In [27]:
def calculate_features_and_train_and_evaluate_models(calculate_features, models):
    X_train = calculate_features(training_df)
    train_and_evaluate_models(models, X_train)

In [28]:
BASIC_MODELS = [
    linear_model.LinearRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]

# Very simple models

In [9]:
def calculate_features1(df: pd.DataFrame) -> pd.DataFrame:
    return df[["OverallQual", "OverallCond"]]

In [10]:
X_train = calculate_features1(training_df)
y_train = training_df["SalePrice"]

In [23]:
train_and_evaluate_model(linear_model.LinearRegression(), X_train)

LinearRegression()
fit_time [0.00397825 0.00298905 0.00199389 0.00299144 0.00199652]
score_time [0.00498748 0.00199771 0.00399113 0.00199461 0.00199485]
test_r2 [0.66, 0.64, 0.6, 0.61, 0.61]
train_r2 [0.62, 0.62, 0.63, 0.63, 0.63]
test_neg_mean_squared_error [-1869947388.22, -2371552540.23, -3045971941.0, -1967592757.16, -2638370572.3]
train_neg_mean_squared_error [-2483581473.94, -2362056918.3, -2193403000.48, -2462508420.21, -2292364532.38]
test_neg_root_mean_squared_error [-43242.89, -48698.59, -55190.32, -44357.56, -51365.07]
train_neg_root_mean_squared_error [-49835.54, -48601.0, -46833.78, -49623.67, -47878.64]


In [24]:
train_and_evaluate_model(RandomForestRegressor(), X_train)

RandomForestRegressor()
fit_time [0.14564228 0.13561916 0.1266923  0.1217072  0.123667  ]
score_time [0.00997353 0.00897598 0.00897503 0.0079782  0.00897622]
test_r2 [0.71, 0.66, 0.66, 0.66, 0.64]
train_r2 [0.7, 0.71, 0.7, 0.71, 0.71]
test_neg_mean_squared_error [-1623357633.63, -2264181229.37, -2611108465.7, -1726407137.26, -2401094967.63]
train_neg_mean_squared_error [-1945489613.19, -1837406234.09, -1803024389.42, -1937881628.73, -1774043339.46]
test_neg_root_mean_squared_error [-40290.91, -47583.41, -51099.01, -41550.06, -49000.97]
train_neg_root_mean_squared_error [-44107.7, -42864.98, -42462.03, -44021.38, -42119.39]


In [25]:
train_and_evaluate_model(GradientBoostingRegressor(), X_train)

GradientBoostingRegressor()
fit_time [0.07580781 0.04191327 0.0389266  0.04384899 0.04587436]
score_time [0.00398779 0.00199223 0.00299501 0.00399017 0.00299978]
test_r2 [0.7, 0.68, 0.65, 0.66, 0.64]
train_r2 [0.7, 0.71, 0.7, 0.71, 0.71]
test_neg_mean_squared_error [-1675516371.89, -2133446985.98, -2638065279.51, -1730202108.05, -2401596672.73]
train_neg_mean_squared_error [-1937310081.36, -1832714195.1, -1812390909.36, -1928879451.06, -1769197043.77]
test_neg_root_mean_squared_error [-40933.07, -46189.25, -51362.1, -41595.7, -49006.09]
train_neg_root_mean_squared_error [-44014.88, -42810.21, -42572.18, -43919.01, -42061.82]


# Add more features

In [29]:
def calculate_features2(df: pd.DataFrame) -> pd.DataFrame:
    return df[[
        "LotArea",
        "OverallQual",
        "YearBuilt",
        "GrLivArea",
        "GarageCars",
        "OpenPorchSF",
    ]]

In [30]:
calculate_features_and_train_and_evaluate_models(calculate_features2, BASIC_MODELS)

LinearRegression()
fit_time [0.0069809  0.00398088 0.00598478 0.00299144 0.00199389]
score_time [0.00698066 0.00298858 0.00501323 0.00299215 0.00199413]
test_r2 [0.8, 0.77, 0.76, 0.75, 0.69]
train_r2 [0.75, 0.76, 0.76, 0.76, 0.78]
test_neg_mean_squared_error [-1081535146.52, -1516818236.42, -1799936654.85, -1257777686.06, -2094910913.4]
train_neg_mean_squared_error [-1618704024.54, -1515621866.63, -1450586247.02, -1575816882.85, -1372433080.54]
test_neg_root_mean_squared_error [-32886.7, -38946.35, -42425.66, -35465.16, -45770.2]
train_neg_root_mean_squared_error [-40233.12, -38930.99, -38086.56, -39696.56, -37046.36]


RandomForestRegressor()
fit_time [0.43087959 0.40591335 0.39372253 0.413692   0.36888576]
score_time [0.01193523 0.01895118 0.01394153 0.01396728 0.0119946 ]
test_r2 [0.84, 0.79, 0.85, 0.85, 0.79]
train_r2 [0.98, 0.98, 0.97, 0.97, 0.97]
test_neg_mean_squared_error [-863694211.82, -1392514966.74, -1154121163.03, -772178720.97, -1394579839.71]
train_neg_mean_squared_error

# Check all features

In [50]:
def calculate_features3(df: pd.DataFrame) -> pd.DataFrame:
    return pd.get_dummies(df.drop(columns=[
        'SalePrice',
        'Id',

        # TODO - columns with missing values:
        'LotFrontage',
        'Alley',
        'BsmtQual',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'FireplaceQu',
        'GarageType',
        'LotFrontage',
        'GarageYrBlt',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'PoolQC',
        'Fence',
        'MiscFeature',
]))

In [51]:
calculate_features_and_train_and_evaluate_models(calculate_features3, [RandomForestRegressor()])

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').