# Kaggle Tuning

## Import

In [9]:
import os
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt
import time

## Set Random Seed

In [10]:
def set_all_seeds(RANDOM_SEED):
    """Seed Python's `random` module and NumPy's legacy global RNG.

    Parameters
    ----------
    RANDOM_SEED : int
        Value passed unchanged to `random.seed` and `np.random.seed`.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(RANDOM_SEED)


# Single seed shared by the global RNGs and by every `random_state=` argument below.
seed = 1
set_all_seeds(seed)

## Data Processing

In [11]:
# Load the raw train / test splits.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so both go through identical preprocessing.
# Test rows are tagged with the sentinel price -1 and split back out at the end of
# this cell (this assumes no genuine training row has price == -1).
# NOTE: every imputer, the scaler and the GMM below are therefore fit on the
# train and test rows together.
# NOTE: concat keeps each frame's own row labels, so labels repeat across the two
# parts until they are split again.
test_data['price'] = -1
train_data = pd.concat([train_data, test_data], axis=0)

# Initial removal of identifier / free-text columns that are not used as features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
train_data = train_data.drop(drop_columns, axis=1)

# Square brackets in values end up in one-hot column names, which xgboost rejects.
# regex=False makes the literal replacement explicit: the default of `regex`
# differs between pandas versions and '[' alone is not a valid regular expression.
train_data['host_verifications'] = train_data['host_verifications'].str.replace('[', '(', regex=False)
train_data['host_verifications'] = train_data['host_verifications'].str.replace(']', ')', regex=False)

# Fill missing categorical values with the most frequent value of each column.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns_with_nans] = imputer.fit_transform(train_data[categorical_columns_with_nans])

# Fill missing bed counts with the median. fit_transform returns an (n, 1) array;
# flatten it so a plain 1-D column is assigned.
beds_imputer = SimpleImputer(strategy='median')
train_data['beds'] = beds_imputer.fit_transform(train_data[['beds']]).ravel()

train_data['description'] = train_data['description'].fillna('')

# Replace the free-text description by its character length.
train_data['description_length'] = train_data['description'].apply(len)
train_data = train_data.drop('description', axis=1)

# One-hot encode the low-cardinality (<= 20 distinct values) object columns.
categorical_columns = train_data.select_dtypes(include=['object']).columns
categorical_columns_to_encode = [col for col in categorical_columns if len(train_data[col].unique()) <= 20]
train_data = pd.get_dummies(train_data, columns=categorical_columns_to_encode, drop_first=True)

# amenities_count = number of comma-separated pieces in the raw amenities string.
# NOTE(review): this is a textual count, so a string without any comma counts as 1
# and commas inside a single amenity name are counted too.
train_data['amenities_count'] = train_data['amenities'].apply(lambda x: len(x.split(',')))

# Convert host_since to a fractional year (year + month / 12).
train_data['host_since'] = pd.to_datetime(train_data['host_since'])
train_data['host_since'] = train_data['host_since'].dt.year + train_data['host_since'].dt.month / 12
train_data['host_since'] = train_data['host_since'].astype('float64')

# Extract the numeric bathroom count from bathrooms_text; "half"/"Half" becomes 0.5.
train_data['bathrooms_text'] = train_data['bathrooms_text'].replace(to_replace='[Hh]alf', value='0.5', regex=True)
# Raw string: '\d' and '\.' are regex escapes, not Python string escapes.
train_data['bathrooms_text'] = train_data['bathrooms_text'].str.extract(r'(\d+\.?\d*)').astype('float64')
# Rows with no number in the text get the column mean.
train_data['bathrooms_text'] = train_data['bathrooms_text'].fillna(train_data['bathrooms_text'].mean())

# Cluster listings with a 6-component Gaussian mixture on the min-max-scaled
# features (target and the still-unencoded text columns excluded) and keep the
# cluster id as an extra feature.
x = train_data.drop(['price','property_type','neighbourhood_cleansed', 'amenities'], axis=1)
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
gmm = GaussianMixture(n_components=6, covariance_type='full', init_params='kmeans', random_state=seed).fit(x)
train_data['gmm_cluster'] = gmm.predict(x)

# One-hot encode the two high-cardinality categoricals that were skipped above.
train_data = pd.get_dummies(train_data, columns=['property_type','neighbourhood_cleansed'], drop_first=True)

# Second round of column removal.
train_data = train_data.drop(['minimum_nights','maximum_nights','minimum_minimum_nights','maximum_minimum_nights',
                              'minimum_maximum_nights','maximum_maximum_nights','availability_60','availability_90',
                              'number_of_reviews_ltm','number_of_reviews_l30d'], axis=1)

# Split back into test (sentinel price) and train rows.
# .copy() gives each part its own data, so the column assignments made later do not
# write into a filtered view of the combined frame (SettingWithCopyWarning).
test_data = train_data[train_data['price'] == -1].drop(columns=['price']).copy()
train_data = train_data[train_data['price'] != -1].copy()

# Amenities that each become a 0/1 indicator feature.
amenities_cols = ['Lock on bedroom door', 'Indoor fireplace', 'Dishwasher', 'BBQ grill', 'Barbecue utensils', 'Fire pit',
                  'Outdoor dining area', 'Sun loungers', 'Outdoor furniture', 'Private patio or balcony', 'Private backyard – Fully fenced', 'Pool']


def _amenity_flags(frame, names):
    """Replace the raw 'amenities' column with one 0/1 column per entry of `names`.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain an 'amenities' column whose values are Python-literal strings
        (parsed with `ast.literal_eval`).
    names : list of str
        Amenity names to flag; each becomes a new integer column.

    Returns
    -------
    pd.DataFrame
        A copy of `frame` with the flag columns appended and 'amenities' dropped.
    """
    # Work on a copy so the caller's frame (possibly a filtered slice) is never
    # written to in place.
    frame = frame.copy()
    # Parse every row once instead of once per (row, amenity) pair.
    parsed = frame['amenities'].map(ast.literal_eval)
    for name in names:
        # A plain list is assigned positionally, so row labels play no role here.
        frame[name] = [int(name in listing) for listing in parsed]
    return frame.drop(['amenities'], axis=1)


train_data = _amenity_flags(train_data, amenities_cols)
test_data = _amenity_flags(test_data, amenities_cols)

## Tuning RF

### Tune n_estimators

In [4]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# Candidate forest sizes: 100, 200, ..., 1000.
params = {'n_estimators': list(range(100, 1001, 100))}

rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for RF:\n", grid.best_score_)
print("Best parameters for RF:\n", grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score across for RF:
 0.5532931897551331
Best parameters for RF:
 {'n_estimators': 900}


### Tune max_depth

In [5]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# n_estimators is pinned to 900; only the tree depth varies (None = unlimited depth).
params = dict(
    n_estimators=[900],
    max_depth=[*range(5, 31, 5), None],
)

rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for RF:\n", grid.best_score_)
print("Best parameters for RF:\n", grid.best_params_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Best score across for RF:
 0.5532931897551331
Best parameters for RF:
 {'max_depth': None, 'n_estimators': 900}


### Tune min_samples_leaf

In [6]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# n_estimators and max_depth are pinned; min_samples_leaf sweeps 1..10.
params = dict(
    n_estimators=[900],
    max_depth=[None],
    min_samples_leaf=list(range(1, 11)),
)

rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for RF:\n", grid.best_score_)
print("Best parameters for RF:\n", grid.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best score across for RF:
 0.5532931897551331
Best parameters for RF:
 {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 900}


### Tune min_samples_split

In [9]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# The first three settings are pinned; min_samples_split sweeps 2..10.
params = dict(
    n_estimators=[900],
    max_depth=[None],
    min_samples_leaf=[1],
    min_samples_split=list(range(2, 11)),
)

rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for RF:\n", grid.best_score_)
print("Best parameters for RF:\n", grid.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score across for RF:
 0.5532931897551331
Best parameters for RF:
 {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 900}


### Tune class_weight

In [10]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# The first four settings are pinned; only the class weighting scheme varies
# (None = no re-weighting).
params = dict(
    n_estimators=[900],
    max_depth=[None],
    min_samples_leaf=[1],
    min_samples_split=[2],
    class_weight=['balanced', 'balanced_subsample', None],
)

rf = RandomForestClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for RF:\n", grid.best_score_)
print("Best parameters for RF:\n", grid.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best score across for RF:
 0.5539357240702674
Best parameters for RF:
 {'class_weight': 'balanced_subsample', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 900}


## Tuning Xgboost

### Tune max_depth

In [5]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# Coarse sweep over tree depth; None leaves max_depth unset on the estimator.
params = dict(max_depth=[10, 20, None])

xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=xgboost,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for Xgboost:\n", grid.best_score_)
print("Best parameters for Xgboost:\n", grid.best_params_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best score across for Xgboost:
 0.5614793442431969
Best parameters for Xgboost:
 {'max_depth': 10}


### Tune n_estimators and learning_rate

In [6]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# max_depth is pinned to 10; number of boosting rounds and learning rate are
# searched jointly (3 x 3 grid).
params = dict(
    max_depth=[10],
    n_estimators=[300, 500, 800],
    learning_rate=[0.05, 0.3, 1],
)

xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=xgboost,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for Xgboost:\n", grid.best_score_)
print("Best parameters for Xgboost:\n", grid.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score across for Xgboost:
 0.5682965977278766
Best parameters for Xgboost:
 {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 800}


### Tune min_child_weight and subsample

In [7]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# Depth, rounds and learning rate are pinned; min_child_weight and subsample are
# searched jointly (3 x 5 grid).
params = dict(
    max_depth=[10],
    n_estimators=[800],
    learning_rate=[0.05],
    min_child_weight=[0.4, 0.6, 1],
    subsample=[0.2, 0.4, 0.6, 0.8, 1],
)

xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=xgboost,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for Xgboost:\n", grid.best_score_)
print("Best parameters for Xgboost:\n", grid.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best score across for Xgboost:
 0.5696654052539721
Best parameters for Xgboost:
 {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 0.4, 'n_estimators': 800, 'subsample': 0.6}


### Tune gamma

In [5]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# All other settings are pinned; only gamma varies.
params = dict(
    max_depth=[10],
    n_estimators=[800],
    learning_rate=[0.05],
    min_child_weight=[0.4],
    subsample=[0.6],
    gamma=[0, 0.2, 0.4, 0.6, 0.8],
)

xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=xgboost,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for Xgboost:\n", grid.best_score_)
print("Best parameters for Xgboost:\n", grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best score across for Xgboost:
 0.5699934924611786
Best parameters for Xgboost:
 {'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 0.4, 'n_estimators': 800, 'subsample': 0.6}


### Tune colsample_bylevel and colsample_bytree

In [12]:
# Target is the price label cast to integer classes; features are every other column.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=["price"])

# All other settings are pinned; the two column-subsampling ratios are searched
# jointly (3 x 3 grid).
params = dict(
    max_depth=[10],
    n_estimators=[800],
    learning_rate=[0.05],
    min_child_weight=[0.4],
    subsample=[0.6],
    gamma=[0.2],
    colsample_bylevel=[0.4, 0.6, 0.8],
    colsample_bytree=[0.4, 0.6, 0.8],
)

xgboost = XGBClassifier(random_state=seed, n_jobs=-1)
grid = GridSearchCV(
    estimator=xgboost,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=10,
)
grid.fit(x_train, y_train)

print("Best score across for Xgboost:\n", grid.best_score_)
print("Best parameters for Xgboost:\n", grid.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best score across for Xgboost:
 0.5715224601130611
Best parameters for Xgboost:
 {'colsample_bylevel': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 0.4, 'n_estimators': 800, 'subsample': 0.6}
