In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Prepairing dataset

In [2]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def fillNaN(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    new_train = new_train.fillna(new_train.median())
    new_test = new_test.fillna(new_test.median())
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('Absolute Error:', metrics.mean_absolute_error(preds, gt))

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = convertToNumeric(train, test)

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

# PCA+Scaler

In [4]:
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# RandomForest

In [5]:
parameters = {
    'criterion':('squared_error', 'absolute_error', 'poisson'), 
    'max_depth': (100,500,1000,1500, None),
    'max_features':('auto', 'sqrt', 'log2'),
    'n_estimators':(10,50,100),
    'min_samples_split':(2,5,9)}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'criterion': 'absolute_error', 'max_depth': None, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 50}


In [6]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.14223066517684405


# XGBoost

In [7]:
parameters = {
    "learning_rate": (0.1, 0.2, 0.3),
    "max_depth": [ 2, 3, 4, 5],
    "min_child_weight": [ 2, 4, 7],
    "gamma":[ 0.0, 0.1, 0.2],
    "n_estimators":(10,50,100),
    "colsample_bytree":[ 0.3, 0.4, 0.9]}

model = XGBRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 7, 'n_estimators': 100}


In [8]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.1524584624571498


# LGBMRegressor

In [9]:
model = LGBMRegressor()

parameters = {
    'num_leaves': [i for i in range(3,30,5)],
    'learning_rate': [0.1, 0.05, 0.01, 0.005, 0.001],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [50, 100],}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 100, 'num_leaves': 13}


In [10]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.14600321613063041


# CatBoost

In [11]:
def getCategorialFeatures(train_ds):
    cat_features = []
    iter = 0
    for feature in train_ds.columns[:-1]:
        if (train_ds[feature].dtype == 'object'):
            cat_features.append(iter)
        iter +=1 
    return cat_features

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train)  
train, test = deleteNaN(train, test ,critval=0.7)
train, test = fillNaN(train, test) 
cat_features = getCategorialFeatures(train)

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1,train_size=0.2, random_state=98987)

imp = SimpleImputer(strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.fit_transform(X_test)

  new_train = new_train.fillna(new_train.median())
  new_test = new_test.fillna(new_test.median())


In [12]:
model = CatBoostRegressor(cat_features=cat_features)

parameters = {
        'depth': [2],
        'loss_function': ['RMSE']}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Learning rate set to 0.032525
0:	learn: 0.3909217	total: 68.3ms	remaining: 1m 8s
1:	learn: 0.3860391	total: 75.6ms	remaining: 37.7s
2:	learn: 0.3805186	total: 81.9ms	remaining: 27.2s
3:	learn: 0.3744929	total: 87.3ms	remaining: 21.7s
4:	learn: 0.3701534	total: 93.6ms	remaining: 18.6s
5:	learn: 0.3645078	total: 98.8ms	remaining: 16.4s
6:	learn: 0.3584151	total: 105ms	remaining: 14.9s
7:	learn: 0.3534728	total: 111ms	remaining: 13.7s
8:	learn: 0.3482828	total: 118ms	remaining: 12.9s
9:	learn: 0.3432698	total: 124ms	remaining: 12.2s
10:	learn: 0.3382644	total: 129ms	remaining: 11.6s
11:	learn: 0.3334993	total: 136ms	remaining: 11.2s
12:	learn: 0.3287704	total: 142ms	remaining: 10.8s
13:	learn: 0.3250912	total: 148ms	remaining: 10.4s
14:	learn: 0.3206767	total: 153ms	remaining: 10.1s
15:	learn: 0.3166403	total: 160ms	remaining: 9.84s
16:	learn: 0.3120790	total: 166ms	remaining: 9.6s
17:	learn: 0.3077750	total: 172ms	remaining: 9.4s
18:	learn: 0.3039228	total: 178ms	remaining: 9.19s
19:	lea

In [13]:
model = CatBoostRegressor(**gs_model.best_params_,cat_features=cat_features)
model.fit(X_train, y_train, plot=True)
check_error(model.predict(X_test), y_test)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.033706
0:	learn: 0.3902615	total: 4.9ms	remaining: 4.89s
1:	learn: 0.3840565	total: 11.6ms	remaining: 5.78s
2:	learn: 0.3784724	total: 17.9ms	remaining: 5.94s
3:	learn: 0.3731220	total: 23.7ms	remaining: 5.9s
4:	learn: 0.3677189	total: 29.3ms	remaining: 5.83s
5:	learn: 0.3620342	total: 35.7ms	remaining: 5.92s
6:	learn: 0.3563482	total: 41.5ms	remaining: 5.89s
7:	learn: 0.3520074	total: 47ms	remaining: 5.83s
8:	learn: 0.3466955	total: 52.4ms	remaining: 5.77s
9:	learn: 0.3415975	total: 58.3ms	remaining: 5.78s
10:	learn: 0.3373764	total: 63.8ms	remaining: 5.74s
11:	learn: 0.3324607	total: 69.3ms	remaining: 5.71s
12:	learn: 0.3268642	total: 75.1ms	remaining: 5.7s
13:	learn: 0.3229119	total: 81.9ms	remaining: 5.77s
14:	learn: 0.3185698	total: 88.1ms	remaining: 5.79s
15:	learn: 0.3144878	total: 93.8ms	remaining: 5.77s
16:	learn: 0.3103264	total: 99.5ms	remaining: 5.75s
17:	learn: 0.3069393	total: 105ms	remaining: 5.74s
18:	learn: 0.3031656	total: 111ms	remaining: 5.72s