In [37]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Prepairing dataset

In [38]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('RMSE Error:', mean_squared_error(np.log(preds), np.log(gt), squared=False))

In [39]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = convertToNumeric(train, test)

y_train = train['SalePrice'].values
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

imp = SimpleImputer(strategy='mean')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

# PCA+Scaler

In [40]:
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# RandomForest

In [41]:
parameters = {
    'criterion':('squared_error', 'absolute_error', 'poisson'), 
    'max_depth': (100,500,1000,1500, None),
    'max_features':('auto', 'sqrt', 'log2'),
    'n_estimators':(10,50,100),
    'min_samples_split':(2,5,9)}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'criterion': 'absolute_error', 'max_depth': 1500, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 10}


In [42]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.21110398726965046


# XGBoost

In [43]:
parameters = {
    "learning_rate": (0.1, 0.2, 0.3),
    "max_depth": [ 2, 3, 4, 5],
    "min_child_weight": [ 2, 4, 7],
    "gamma":[ 0.0, 0.1, 0.2],
    "n_estimators":(10,50,100),
    "colsample_bytree":[ 0.3, 0.4, 0.9]}

model = XGBRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'colsample_bytree': 0.9, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 7, 'n_estimators': 50}


In [44]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.2097148095085528


# LGBMRegressor

In [45]:
model = LGBMRegressor()

parameters = {
    'num_leaves': [i for i in range(3,30,5)],
    'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [50, 100],}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 50, 'num_leaves': 8}


In [46]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.24948807996010364


# CatBoost

In [52]:
def getCategorialFeatures(train_ds):
    cat_features = []
    iter = 0
    for feature in train_ds.columns[:-1]:
        if (train_ds[feature].dtype == 'object'):
            cat_features.append(iter)
        iter +=1 
    return cat_features

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
cat_features = getCategorialFeatures(train)

y_train = train['SalePrice'].values
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [53]:
model = CatBoostRegressor(cat_features=cat_features)

parameters = {
        'depth': [2],
        'loss_function': ['RMSE']}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Learning rate set to 0.032678
0:	learn: 80304.2290986	total: 6.25ms	remaining: 6.24s
1:	learn: 79152.8758505	total: 12.7ms	remaining: 6.34s
2:	learn: 77951.0570158	total: 18.1ms	remaining: 6.02s
3:	learn: 76829.1487289	total: 23.6ms	remaining: 5.88s
4:	learn: 75931.3901089	total: 29.5ms	remaining: 5.87s
5:	learn: 75030.7842369	total: 34.8ms	remaining: 5.77s
6:	learn: 74224.6844930	total: 40.9ms	remaining: 5.8s
7:	learn: 73264.5085158	total: 47.8ms	remaining: 5.93s
8:	learn: 72441.7373566	total: 56.7ms	remaining: 6.25s
9:	learn: 71735.7766226	total: 67.6ms	remaining: 6.69s
10:	learn: 70889.3046679	total: 73.3ms	remaining: 6.59s
11:	learn: 70096.5594174	total: 79.3ms	remaining: 6.53s
12:	learn: 69344.2181796	total: 87.1ms	remaining: 6.61s
13:	learn: 68404.5568097	total: 93.1ms	remaining: 6.55s
14:	learn: 67593.3675326	total: 101ms	remaining: 6.64s
15:	learn: 66831.7014236	total: 107ms	remaining: 6.58s
16:	learn: 66077.2225611	total: 115ms	remaining: 6.64s
17:	learn: 65307.5255299	total: 

In [54]:
model = CatBoostRegressor(**gs_model.best_params_,cat_features=cat_features)
model.fit(X_train, y_train, plot=True)
check_error(model.predict(X_test), y_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.042748
0:	learn: 77245.1606749	total: 9.32ms	remaining: 9.31s
1:	learn: 75575.1944707	total: 19.6ms	remaining: 9.79s
2:	learn: 73901.3644552	total: 34.9ms	remaining: 11.6s
3:	learn: 72376.2873162	total: 46ms	remaining: 11.5s
4:	learn: 70965.0699812	total: 53.1ms	remaining: 10.6s
5:	learn: 69543.8575924	total: 63.3ms	remaining: 10.5s
6:	learn: 68195.6638398	total: 72.6ms	remaining: 10.3s
7:	learn: 66824.1876059	total: 85.6ms	remaining: 10.6s
8:	learn: 65488.1380143	total: 92.9ms	remaining: 10.2s
9:	learn: 64279.9805333	total: 103ms	remaining: 10.2s
10:	learn: 63186.7754908	total: 110ms	remaining: 9.86s
11:	learn: 62068.3666028	total: 119ms	remaining: 9.83s
12:	learn: 60930.6437650	total: 126ms	remaining: 9.57s
13:	learn: 59865.3498047	total: 135ms	remaining: 9.51s
14:	learn: 58835.3493514	total: 142ms	remaining: 9.3s
15:	learn: 57833.2607714	total: 151ms	remaining: 9.28s
16:	learn: 56811.5727769	total: 158ms	remaining: 9.12s
17:	learn: 55981.0901951	total: 169ms	r