In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Prepairing dataset

In [3]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def fillNaN(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    new_train = new_train.fillna(new_train.median())
    new_test = new_test.fillna(new_test.median())
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('Absolute Error:', metrics.mean_absolute_error(preds, gt))

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = convertToNumeric(train, test)

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, train_size=0.2, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

# PCA+Scaler

In [5]:
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# RandomForest

In [6]:
parameters = {
    'criterion':('squared_error', 'absolute_error', 'poisson'), 
    'max_depth': (100,500,1000,1500, None),
    'max_features':('auto', 'sqrt', 'log2'),
    'n_estimators':(10,50,100),
    'min_samples_split':(2,5,9)}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:10], y_train[:10])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'criterion': 'absolute_error', 'max_depth': 1000, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 10}


In [7]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.19222440787726564


# XGBoost

In [8]:
parameters = {
    "learning_rate": (0.1, 0.2, 0.3),
    "max_depth": [ 2, 3, 4, 5],
    "min_child_weight": [ 2, 4, 7],
    "gamma":[ 0.0, 0.1, 0.2],
    "n_estimators":(10,30,50,70,100),
    "colsample_bytree":[ 0.3, 0.4, 0.9]}

model = XGBRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:10], y_train[:10])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.2, 'max_depth': 2, 'min_child_weight': 4, 'n_estimators': 30}


In [11]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.16683775744308715


# LGBMRegressor

In [9]:
model = LGBMRegressor()

parameters = {
    'num_leaves': [i for i in range(3,30,5)],
    'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [50, 100],}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:10], y_train[:10])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'num_leaves': 3}


In [10]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.18210141796190962


# CatBoost

In [22]:
def getCategorialFeatures(train_ds):
    cat_features = []
    iter = 0
    for feature in train_ds.columns[:-1]:
        if (train_ds[feature].dtype == 'object'):
            cat_features.append(iter)
        iter +=1 
    return cat_features

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = fillNaN(train, test) 
cat_features = getCategorialFeatures(train)

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1,train_size=0.2, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

  new_train = new_train.fillna(new_train.median())
  new_test = new_test.fillna(new_test.median())


In [25]:
model = CatBoostRegressor(cat_features=cat_features)

parameters = {
        'depth': [2],
        'loss_function': ['RMSE']}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:10], y_train[:10])
print(f"Best parameters: {gs_model.best_params_}")

Learning rate set to 0.019093
0:	learn: 0.3163875	total: 8.97ms	remaining: 8.96s
1:	learn: 0.3148674	total: 15ms	remaining: 7.49s
2:	learn: 0.3128143	total: 25.7ms	remaining: 8.54s
3:	learn: 0.3110307	total: 30.9ms	remaining: 7.71s
4:	learn: 0.3090818	total: 42.1ms	remaining: 8.38s
5:	learn: 0.3081447	total: 47.2ms	remaining: 7.82s
6:	learn: 0.3061656	total: 57.2ms	remaining: 8.11s
7:	learn: 0.3048259	total: 62.1ms	remaining: 7.7s
8:	learn: 0.3024870	total: 73ms	remaining: 8.04s
9:	learn: 0.3009728	total: 79.2ms	remaining: 7.84s
10:	learn: 0.2996607	total: 90.3ms	remaining: 8.12s
11:	learn: 0.2981938	total: 95.9ms	remaining: 7.89s
12:	learn: 0.2963886	total: 108ms	remaining: 8.2s
13:	learn: 0.2949067	total: 116ms	remaining: 8.19s
14:	learn: 0.2932185	total: 124ms	remaining: 8.17s
15:	learn: 0.2919886	total: 136ms	remaining: 8.38s
16:	learn: 0.2903353	total: 143ms	remaining: 8.26s
17:	learn: 0.2883343	total: 154ms	remaining: 8.4s
18:	learn: 0.2866965	total: 169ms	remaining: 8.74s
19:	le

In [26]:
model = CatBoostRegressor(**gs_model.best_params_,cat_features=cat_features)
model.fit(X_train, y_train, plot=True)
check_error(model.predict(X_test), y_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.033706
0:	learn: 0.3902615	total: 16.5ms	remaining: 16.5s
1:	learn: 0.3840565	total: 33.6ms	remaining: 16.7s
2:	learn: 0.3784724	total: 52.1ms	remaining: 17.3s
3:	learn: 0.3731220	total: 74.5ms	remaining: 18.6s
4:	learn: 0.3677189	total: 84.9ms	remaining: 16.9s
5:	learn: 0.3620342	total: 97.3ms	remaining: 16.1s
6:	learn: 0.3563482	total: 106ms	remaining: 15s
7:	learn: 0.3520074	total: 114ms	remaining: 14.1s
8:	learn: 0.3466955	total: 122ms	remaining: 13.4s
9:	learn: 0.3415975	total: 130ms	remaining: 12.9s
10:	learn: 0.3373764	total: 141ms	remaining: 12.6s
11:	learn: 0.3324607	total: 148ms	remaining: 12.2s
12:	learn: 0.3268642	total: 155ms	remaining: 11.8s
13:	learn: 0.3229119	total: 162ms	remaining: 11.4s
14:	learn: 0.3185698	total: 169ms	remaining: 11.1s
15:	learn: 0.3144878	total: 176ms	remaining: 10.8s
16:	learn: 0.3103264	total: 185ms	remaining: 10.7s
17:	learn: 0.3069393	total: 193ms	remaining: 10.5s
18:	learn: 0.3031656	total: 208ms	remaining: 10.8s
19:	lea