# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

## Set Random Seed

In [2]:
def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then puts cuDNN into deterministic mode
    with benchmarking switched off.

    Args:
        RANDOM_SEED: Seed value passed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Notebook-wide seed, applied once right after the helper is defined.
seed = 1
set_all_seeds(seed)

## Data Preprocessing and Modeling

In [3]:
# Load the raw competition data.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test rows so both receive exactly the same feature
# engineering. Test rows carry a sentinel price, which is how they are
# separated from the training rows again at the end of this cell.
# ignore_index=True gives the combined frame a unique index instead of two
# overlapping 0..n ranges.
TEST_PRICE_SENTINEL = -1
combined = pd.concat(
    [train_data, test_data.assign(price=TEST_PRICE_SENTINEL)],
    axis=0,
    ignore_index=True,
)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
combined = combined.drop(columns=drop_columns)

# Swap square brackets for parentheses (original note: avoids an error in
# xgboost; presumably because these values become one-hot column names below).
# regex=False makes the replacement a literal one on every pandas version,
# since '[' on its own is not a valid regular expression.
combined['host_verifications'] = (
    combined['host_verifications']
    .str.replace('[', '(', regex=False)
    .str.replace(']', ')', regex=False)
)

# Fill missing values: most frequent value for the categorical columns,
# median for the number of beds.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
combined[categorical_columns_with_nans] = imputer.fit_transform(combined[categorical_columns_with_nans])

beds_imputer = SimpleImputer(strategy='median')
# The imputer returns an (n, 1) array; flatten it before assigning to one column.
combined['beds'] = beds_imputer.fit_transform(combined[['beds']]).ravel()

# Replace the free-text description with its length (missing text counts as 0).
combined['description_length'] = combined['description'].fillna('').str.len()
combined = combined.drop(columns='description')

# One-hot encode the low-cardinality text columns. Columns that later steps
# access by name are excluded explicitly; otherwise a dataset in which one of
# them happens to have few distinct values would lose that column here and the
# later steps would fail with a KeyError.
MAX_ONE_HOT_CARDINALITY = 20
reserved_columns = {'amenities', 'host_since', 'bathrooms_text',
                    'property_type', 'neighbourhood_cleansed'}
categorical_columns = combined.select_dtypes(include=['object']).columns
categorical_columns_to_encode = [
    col for col in categorical_columns
    if col not in reserved_columns
    and combined[col].nunique(dropna=False) <= MAX_ONE_HOT_CARDINALITY
]
combined = pd.get_dummies(combined, columns=categorical_columns_to_encode, drop_first=True)

# Add feature amenities_count: number of comma-separated pieces in the raw
# amenities string (so a string without any comma counts as 1).
combined['amenities_count'] = combined['amenities'].str.count(',') + 1
combined = combined.drop(columns='amenities')

# Convert host_since to a fractional year: year + month / 12.
# Note that month 12 therefore maps to year + 1.0.
host_since = pd.to_datetime(combined['host_since'])
combined['host_since'] = (host_since.dt.year + host_since.dt.month / 12).astype('float64')

# Extract the bathroom count from bathrooms_text: "half"/"Half" is rewritten
# to 0.5, then the first number found in the text is taken. Rows without any
# number receive the column mean.
bathrooms = combined['bathrooms_text'].replace(to_replace='[Hh]alf', value='0.5', regex=True)
bathrooms = bathrooms.str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')
combined['bathrooms_text'] = bathrooms.fillna(bathrooms.mean())

# Add a cluster feature: fit a 6-component Gaussian mixture on the min-max
# scaled features (target and the two not-yet-encoded text columns excluded)
# and store each row's predicted component.
x = combined.drop(columns=['price', 'property_type', 'neighbourhood_cleansed'])
scaler = MinMaxScaler()
x = scaler.fit_transform(x)
gmm = GaussianMixture(n_components=6, covariance_type='full', init_params='kmeans', random_state=seed).fit(x)
combined['gmm_cluster'] = gmm.predict(x)

# One-hot encode the two remaining text columns.
combined = pd.get_dummies(combined, columns=['property_type','neighbourhood_cleansed'], drop_first=True)

# Further column removal; these features were still part of the clustering input above.
combined = combined.drop(columns=['minimum_nights','maximum_nights','minimum_minimum_nights','maximum_minimum_nights',
                                  'minimum_maximum_nights','maximum_maximum_nights','availability_60','availability_90',
                                  'number_of_reviews_ltm','number_of_reviews_l30d'])

# Split the combined frame back into test rows (sentinel price) and training rows.
is_test_row = combined['price'] == TEST_PRICE_SENTINEL
test_data = combined[is_test_row].drop(columns=['price']).reset_index(drop=True)
train_data = combined[~is_test_row]

In [4]:
# Feature matrix and integer class labels for the stacked model.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns='price')

# Base learner 1: class-weighted random forest, trained behind a SMOTE step.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight='balanced',
    n_jobs=-1,
    random_state=seed,
)
pipeline_rf = make_pipeline(SMOTE(random_state=seed), rf)

# Base learner 2: XGBoost classifier with the hand-picked hyper-parameters below.
xgb_params = dict(
    booster='gbtree',
    tree_method='auto',
    n_jobs=-1,
    subsample=0.6,
    n_estimators=500,
    learning_rate=0.05,
    min_child_weight=0.4,
    max_depth=10,
    gamma=0.5,
    colsample_bylevel=0.8,
    colsample_bytree=0.6,
    random_state=seed,
)
xgboost = XGBClassifier(**xgb_params)

estimators = [('pipeline_rf', pipeline_rf), ('xgboost', xgboost)]

# Final estimator of the stack: StandardScaler followed by a linear-kernel SVC.
pipeline_SVC = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', random_state=seed)),
])

# Fit the stack on all training rows; this fitted object is used for prediction.
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=pipeline_SVC,
    n_jobs=-1,
)
stack.fit(x_train, y_train)

# Report the mean 5-fold cross-validated accuracy of the same configuration.
scores = cross_val_score(stack, x_train, y_train, cv=5, scoring='accuracy')
print(scores.mean())

0.5774740379220913


In [5]:
# Predict the price class of every test row with the fitted stack.
y_pred = stack.predict(test_data)

# Only the id column of the raw test file is needed for the submission.
test_sub = pd.read_csv("data/test.csv", usecols=['id'])

# Predictions are attached by position, so the two lengths must agree.
# Fail loudly here instead of writing a submission with missing or shifted prices.
if len(y_pred) != len(test_sub):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(test_sub)} test ids"
    )

test_sub['price'] = y_pred
test_sub.to_csv('output.csv', index=False)