In [3]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
training_df = pd.read_csv("data/train.csv")

In [19]:
def evaluate_model(model, test_data, test_labels):
    y_true = test_labels
    y_pred = model.predict(test_data)
    mse = mean_squared_error(y_true, y_pred)
    return {
        "model": str(model),
        "rmse": round(math.sqrt(mse), 0),
        "mean_squared_error": round(mse, 0),
        "r2_score": 100000*r2_score(y_true, y_pred) // 100 / 10,
        "feature_importances": model.feature_importances_ if "feature_importances_" in dir(model) else None
    }

In [35]:
y_train = training_df["SalePrice"]

In [36]:
def train_and_evaluate_model(model, X_train):
    model.fit(X_train, y_train)
    print(evaluate_model(model, X_train, y_train))

In [37]:
def train_and_evaluate_models(models, X_train):
    for model in models:
        train_and_evaluate_model(model, X_train)

In [38]:
def calculate_features_and_train_and_evaluate_models(calculate_features, models):
    X_train = calculate_features(training_df)
    train_and_evaluate_models(models, X_train)

In [32]:
BASIC_MODELS = [
    linear_model.LinearRegression(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]

# Very simple models

In [5]:
def calculate_features1(df: pd.DataFrame) -> pd.DataFrame:
    return df[["OverallQual", "OverallCond"]]

In [6]:
X_train = calculate_features1(training_df)
y_train = training_df["SalePrice"]

In [18]:
train_and_evaluate_model(linear_model.LinearRegression(), X_train, y_train)

{'model': 'LinearRegression()', 'rmse': 48588.0, 'mean_squared_error': 2360766387.0, 'r2_score': 62.5}


In [20]:
train_and_evaluate_model(RandomForestRegressor(), X_train, y_train)

{'model': 'RandomForestRegressor()', 'rmse': 43331.0, 'mean_squared_error': 1877589833.0, 'r2_score': 70.2, 'feature_importances': array([0.96249835, 0.03750165])}


In [21]:
train_and_evaluate_model(GradientBoostingRegressor(), X_train, y_train)

{'model': 'GradientBoostingRegressor()', 'rmse': 43286.0, 'mean_squared_error': 1873636058.0, 'r2_score': 70.2, 'feature_importances': array([0.9716332, 0.0283668])}


# Add more features

In [28]:
def calculate_features2(df: pd.DataFrame) -> pd.DataFrame:
    return df[[
        "LotArea",
        "OverallQual",
        "YearBuilt",
        "GrLivArea",
        "GarageCars",
        "OpenPorchSF",
    ]]

In [39]:
calculate_features_and_train_and_evaluate_models(calculate_features2, BASIC_MODELS)

{'model': 'LinearRegression()', 'rmse': 38869.0, 'mean_squared_error': 1510781144.0, 'r2_score': 76.0, 'feature_importances': None}
{'model': 'RandomForestRegressor()', 'rmse': 12797.0, 'mean_squared_error': 163756390.0, 'r2_score': 97.4, 'feature_importances': array([0.06188405, 0.61766763, 0.06969749, 0.18995948, 0.03429265,
       0.0264987 ])}
{'model': 'GradientBoostingRegressor()', 'rmse': 22904.0, 'mean_squared_error': 524590089.0, 'r2_score': 91.6, 'feature_importances': array([0.0451137 , 0.6132553 , 0.07384152, 0.19016209, 0.06148466,
       0.01614273])}


# Check all features

In [50]:
def calculate_features3(df: pd.DataFrame) -> pd.DataFrame:
    return pd.get_dummies(df.drop(columns=[
        'SalePrice',
        'Id',

        # TODO - columns with missing values:
        'LotFrontage',
        'Alley',
        'BsmtQual',
        'BsmtExposure',
        'BsmtFinType1',
        'BsmtFinType2',
        'FireplaceQu',
        'GarageType',
        'LotFrontage',
        'GarageYrBlt',
        'GarageFinish',
        'GarageQual',
        'GarageCond',
        'PoolQC',
        'Fence',
        'MiscFeature',
]))

In [51]:
calculate_features_and_train_and_evaluate_models(calculate_features3, [RandomForestRegressor()])

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').