In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Prepairing dataset

In [2]:
def deleteId(ds):
    return ds.drop(columns='Id')

def deleteNaN(train_ds, test_ds, critval):
    fullsize = train_ds.shape[0]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in new_train.columns:
        nulls = new_train[feature].isnull().sum()
        percent = nulls / fullsize
        if (percent > critval):
            #print(f'Feature {feature} was removed: \nNaNs {(percent * 100):.2f}%')
            new_train = new_train.drop(columns=feature)
            new_test = new_test.drop(columns=feature)
    return new_train, new_test

def fillNaN(train_ds, test_ds, method='ffill'):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    new_train = new_train.fillna(method=method)
    new_test = new_test.fillna(method=method)
    return new_train, new_test

def convertToNumeric(train_ds, test_ds):
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    LE = LabelEncoder()
    for feature in new_train.columns[:-1]:
        if (new_train[feature].dtype == 'object'):
            new_train[feature] = LE.fit_transform(new_train[feature])
            new_test[feature] = LE.fit_transform(new_test[feature])
    return new_train, new_test

def check_error(preds, gt):
    print('Absolute Error:', metrics.mean_absolute_error(preds, gt))
    print('Squared Error:', metrics.mean_squared_error(preds, gt))
    print('Squared Log Error:', metrics.mean_squared_log_error(preds, gt))

def deleteCorrelation(train_ds, test_ds, threshold):
    corr_matrix = train_ds.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    new_train = train_ds.copy()
    new_test = test_ds.copy()
    for feature in to_drop:
        new_train = new_train.drop(columns=feature)
        new_test = new_test.drop(columns=feature)

    return new_train, new_test


In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
train = train.drop_duplicates()

train = deleteId(train) 
train, test = deleteNaN(train, test ,critval=0.4)   
train, test = fillNaN(train, test) 
train, test = convertToNumeric(train, test) 
train, test = deleteCorrelation(train, test, threshold=0.8) 

y_train = np.log1p(train['SalePrice'].values)
x_train = train.drop(columns='SalePrice').values
x_super_test = test.values

X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.1, random_state=98987)

# PCA

In [4]:
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# RandomForest

In [6]:
parameters = {
    'criterion':('squared_error', 'absolute_error', 'poisson'), 
    'max_depth': (100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, None),
    'max_features':('auto', 'sqrt', 'log2')}

model = RandomForestRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'criterion': 'squared_error', 'max_depth': 500, 'max_features': 'sqrt'}


In [7]:
model = RandomForestRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.14882760244251803
Squared Error: 0.04738547632888391
Squared Log Error: 0.0002910504625439407


# XGBoost

In [8]:
parameters = {
    "learning_rate": (0.05, 0.10, 0.15, 0.2),
    "max_depth": [ 3, 4, 5, 6, 8],
    "min_child_weight": [ 1, 2, 3, 4, 5, 7],
    "gamma":[ 0.0, 0.1, 0.2],
    "colsample_bytree":[ 0.3, 0.4, 0.5, 0.6, 0.7]}

model = XGBRegressor()
gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'colsample_bytree': 0.4, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 3}


In [9]:
model = XGBRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.14653263485496532
Squared Error: 0.042272804397422155
Squared Log Error: 0.00026190108064463657


# CatBoost

In [10]:
model = CatBoostRegressor()

parameters = {
        'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'logging_level': ['Silent']}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'logging_level': 'Silent'}


In [11]:
model = CatBoostRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.13285845648348427
Squared Error: 0.03831134038325577
Squared Log Error: 0.00023828232203326397


In [12]:
model = LGBMRegressor()

parameters = {
    'num_leaves': [i for i in range(3,30)],
    'learning_rate': [0.1, 0.07, 0.05, 0.03, 0.01, 0.007, 0.005, 0.003, 0.001],
    'max_depth': [3, 4, 5, 6, 7],
    'n_estimators': [50, 100, 200, 250, 300, 400, 500],}

gs_model = GridSearchCV(model, parameters)
gs_model.fit(X_train[:300], y_train[:300])
print(f"Best parameters: {gs_model.best_params_}")

Best parameters: {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 200, 'num_leaves': 7}


In [13]:
model = LGBMRegressor(**gs_model.best_params_)
model.fit(X_train, y_train)
check_error(model.predict(X_test), y_test)

Absolute Error: 0.14555054351248695
Squared Error: 0.04647049885139293
Squared Log Error: 0.0002858500424465845
