In [10]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Dataset

In [98]:
train_df = pd.read_csv('data/train.csv')

test_df = pd.read_csv('data/test.csv')


In [64]:
def print_scores(y_test, y_pred):
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# Cleaning NaNs and Standardizing values

In [97]:
from sklearn.preprocessing import MinMaxScaler
def get_categorical_numerical(ddf):
    df = ddf.copy()
    g = df.columns.to_series().groupby(df.dtypes).groups
    types = {k.name: v for k, v in g.items()}
    categorical = types['object'].values
    types.pop('object')
    numerical = [val.values for key, val in types.items()]
    numerical = [elem for sublist in numerical for elem in sublist]
    return categorical, numerical

def clean_df(ddf):
    df = ddf.copy()
    df = df.dropna(axis = 1, thresh= df.shape[0] // 4)
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                     'percent_missing': percent_missing})
    missing_value_df = missing_value_df[missing_value_df.percent_missing > 0]
    df[missing_value_df.column_name.values].describe()
    df.MasVnrArea.fillna(method='ffill', inplace=True)
    df.MasVnrArea.fillna(method='bfill', inplace=True)
    df.LotFrontage.fillna(df.LotFrontage.median(), inplace=True)
    df.GarageYrBlt.fillna(df.GarageYrBlt.median(), inplace=True)
    round(df[missing_value_df.column_name.values].isnull().sum() / len(df), 3)
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    categorical, numerical = get_categorical_numerical(df)
    numeric_features = df.loc[:,numerical[numerical != 'SalePrice']]
    numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()
    df.update(numeric_features_standardized)
    for column in categorical:
        one_hot = pd.get_dummies(df[column])
        df = df.drop(column,axis = 1)
        df = df.join(one_hot, lsuffix='_left', rsuffix='_right')
    return df

In [99]:
test_df = clean_df(test_df)
train_df = clean_df(train_df)

  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(
  return merge(


In [94]:
X = train_df.copy()
print(train_df.columns)
X.drop('SalePrice', axis = 1)
X.drop('Id', axis = 1)
X = X.values
y = train_df['SalePrice'].values    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'ConLw', 'New', 'Oth', 'WD', 'Abnorml', 'AdjLand', 'Alloca', 'Family',
       'Normal', 'Partial'],
      dtype='object', length=277)


# Sklearn

GridSearch

In [36]:
parameters = {'criterion':('squared_error', 'absolute_error', 'poisson'), 'max_depth': (100,500, 1000, None),
              'max_features':('auto', 'sqrt', 'log2')}

model = RandomForestRegressor()
rgr = GridSearchCV(model, parameters)
rgr.fit(X_train[:300], y_train[:300])
print(f"best params: {rgr.best_params_}")

best params: {'criterion': 'squared_error', 'max_depth': 1000, 'max_features': 'auto'}


best params = default params

Training

In [100]:
rgr = RandomForestRegressor()
rgr.fit(X_train, y_train)
rgr.score(X_test, y_test)
y_pred = rgr.predict(X_test)
print_scores(y_test, y_pred)

Mean Absolute Error: 0.003091636275975631
Mean Squared Error: 0.00027628940567323403
Root Mean Squared Error: 0.016621955530960672


# XGBoost

In [134]:
import xgboost as xgb
parameters = {'tree_method':['gpu_hist'], 'colsample_bytree':(0.2, 0.4), 'alpha': (0.5, 0.9),
              'reg_lambda':(0.3, 0.6, 0.9)}
xgbr = xgb.XGBRegressor()
rgr = GridSearchCV(xgbr, parameters)

rgr.fit(X_train, y_train)
print(f"best params: {rgr.best_params_}")

best params: {'alpha': 0.5, 'colsample_bytree': 0.4, 'reg_lambda': 0.3, 'tree_method': 'gpu_hist'}


max_depth = 4, n_estimators = 14400, subsample = 0.2, eta = 0.01, 

In [155]:
rgr = xgb.XGBRegressor(
                 colsample_bytree=1.,
                 eta=0.01,
                 max_depth=4,
                 min_child_weight=1.5,
                 n_estimators=14400,                                                                  
                 alpha=0.,
                 reg_lambda=0.4,
                 subsample=0.2)
rgr.fit(X_train, y_train)
y_pred = rgr.predict(X_test)
print_scores(y_test, y_pred)

Mean Absolute Error: 0.003632161234013532
Mean Squared Error: 0.00013078424912496358
Root Mean Squared Error: 0.011436094137639982
