In [59]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Prepairing dataset

In [60]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('RMSE Error:', mean_squared_error(np.log(preds), np.log(gt), squared=False))

In [61]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = convertToNumeric(train, test)

y_train = train['SalePrice'].values
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

imp = SimpleImputer(strategy='mean')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

# PCA+Scaler

In [49]:
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# RandomForest

In [62]:
parameters = {
    'criterion':(['squared_error']), 
    'max_depth': (1000, 1500, 2000, None),
    'max_features':('auto', 'sqrt', 'log2'),
    'n_estimators':(5, 10, 20, 50),
    'min_samples_split':(2, 5, 9)}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'criterion': 'squared_error', 'max_depth': 1500, 'max_features': 'sqrt', 'min_samples_split': 9, 'n_estimators': 5}


In [64]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.15506982002279504


# XGBoost

In [78]:
parameters = {
    "learning_rate": (0.05, 0.1, 0.15),
    "max_depth": [4, 5, 6],
    "min_child_weight": [7, 9, 12],
    "n_estimators":(30,50, 70)}

model = XGBRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'min_child_weight': 12, 'n_estimators': 70}


In [68]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.13154656698300676


# LGBMRegressor

In [69]:
model = LGBMRegressor()

parameters = {
    'num_leaves': [i for i in range(3,15,2)],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [50, 100],}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'num_leaves': 3}


In [72]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

RMSE Error: 0.1422443940135451


# CatBoost

In [21]:
def getCategorialFeatures(train_ds):
    cat_features = []
    iter = 0
    for feature in train_ds.columns[:-1]:
        if (train_ds[feature].dtype == 'object'):
            cat_features.append(iter)
        iter +=1 
    return cat_features

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
cat_features = getCategorialFeatures(train)

y_train = train['SalePrice'].values
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

In [23]:
model = CatBoostRegressor(loss_function='RMSE',cat_features=cat_features)
model.fit(X_train, y_train, plot=True)
check_error(model.predict(X_test), y_test)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.042748
0:	learn: 76930.4091113	total: 83.2ms	remaining: 1m 23s
1:	learn: 74859.8884342	total: 170ms	remaining: 1m 24s
2:	learn: 73006.7045395	total: 232ms	remaining: 1m 17s
3:	learn: 71005.1089006	total: 326ms	remaining: 1m 21s
4:	learn: 69301.2682476	total: 383ms	remaining: 1m 16s
5:	learn: 67572.8459105	total: 442ms	remaining: 1m 13s
6:	learn: 65919.7517106	total: 519ms	remaining: 1m 13s
7:	learn: 64315.7210541	total: 596ms	remaining: 1m 13s
8:	learn: 62720.0306505	total: 663ms	remaining: 1m 13s
9:	learn: 61306.0702768	total: 744ms	remaining: 1m 13s
10:	learn: 59899.0116968	total: 828ms	remaining: 1m 14s
11:	learn: 58540.3382105	total: 908ms	remaining: 1m 14s
12:	learn: 57218.6861287	total: 1s	remaining: 1m 15s
13:	learn: 55974.9360650	total: 1.06s	remaining: 1m 15s
14:	learn: 54755.6045223	total: 1.13s	remaining: 1m 14s
15:	learn: 53554.8727241	total: 1.2s	remaining: 1m 13s
16:	learn: 52401.9591590	total: 1.26s	remaining: 1m 12s
17:	learn: 51280.5153216	total: