# Kaggle

## Import

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

## Set Random Seed

In [None]:
seed = 666


def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus all CUDA devices), then sets the cuDNN
    ``deterministic`` flag and turns off cuDNN benchmarking.

    Args:
        RANDOM_SEED: Seed value passed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    # cuDNN: disable benchmarking and request deterministic behaviour.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_all_seeds(seed)

## Data Analysis

In [None]:
# Load the raw train and test splits.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test rows so both go through the same preprocessing.
# Test rows have no label, so they are tagged with a sentinel price of -1.
test_data['price'] = -1

# Stack the two frames and immediately discard the columns that are not
# wanted as features.
drop_columns = ['id', 'scrape_id', 'last_scraped', 'picture_url', 'host_id', 'host_name']
train_data = pd.concat([train_data, test_data], axis=0).drop(columns=drop_columns)

# Fill gaps in the categorical columns with each column's most frequent value.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns_with_nans] = imputer.fit_transform(
    train_data[categorical_columns_with_nans]
)

# Missing bed counts are replaced by the median bed count.
beds_imputer = SimpleImputer(strategy='median')
train_data['beds'] = beds_imputer.fit_transform(train_data[['beds']])

# Replace each free-text column by its character count; missing text is
# treated as an empty string and therefore gets length 0.
for text_column in ('description', 'name'):
    train_data[text_column] = train_data[text_column].fillna('')
    train_data[f'{text_column}_length'] = train_data[text_column].map(len)
    train_data = train_data.drop(columns=text_column)

# Encode the superhost flag as integer class codes.
label_encoder = LabelEncoder()
train_data['host_is_superhost'] = label_encoder.fit_transform(train_data['host_is_superhost'])

# One-hot encode every object-typed column that has at most 20 distinct
# values (NaN counts as a value); the first level of each is dropped.
categorical_columns = train_data.select_dtypes(include=['object']).columns
distinct_counts = train_data[categorical_columns].nunique(dropna=False)
categorical_columns_to_encode = distinct_counts[distinct_counts <= 20].index.tolist()
train_data = pd.get_dummies(train_data, columns=categorical_columns_to_encode, drop_first=True)

def _count_amenities(raw_amenities):
    """Return the number of amenities in a serialized ``amenities`` value.

    The value is first parsed as a Python/JSON-style list literal so that an
    empty list counts as 0 and an amenity whose name contains a comma counts
    once. If the text is not a parseable list/tuple/set literal, the count
    falls back to the number of comma-separated pieces.

    Args:
        raw_amenities: The string stored in the ``amenities`` column.

    Returns:
        int: The amenity count.
    """
    try:
        parsed = ast.literal_eval(raw_amenities)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple, set)):
        return len(parsed)
    # Unparseable format: keep the simple comma heuristic.
    return len(raw_amenities.split(','))


# Add the amenities_count feature and drop the raw amenities text.
train_data['amenities_count'] = train_data['amenities'].map(_count_amenities)
train_data = train_data.drop(columns='amenities')

# One-hot encode the remaining categorical columns, dropping the first level.
train_data = pd.get_dummies(
    train_data,
    columns=['property_type', 'neighbourhood_cleansed'],
    drop_first=True,
)

# Convert host_since to a fractional year (year + month / 12) stored as float64.
host_since_ts = pd.to_datetime(train_data['host_since'])
train_data['host_since'] = (
    host_since_ts.dt.year + host_since_ts.dt.month / 12
).astype('float64')

# Derive shared/private flags and a numeric bathroom count from bathrooms_text.
bathrooms_raw = train_data['bathrooms_text']

# Flag rows whose text mentions "shared" / "private" anywhere, in any case.
# An exact-equality test (isin) only matches values that are literally the
# single word, which descriptive values never are.
# NOTE(review): presumably the values are phrases such as "1 shared bath" - confirm against the data.
train_data['bathrooms_shared'] = bathrooms_raw.str.contains('shared', case=False, na=False)
train_data['bathrooms_private'] = bathrooms_raw.str.contains('private', case=False, na=False)

# "half" (any case) stands for half a bathroom; rewrite it so the number
# extraction below picks it up.
bathrooms_numeric_text = bathrooms_raw.str.replace('half', '0.5', case=False, regex=True)

# Pull out the first integer/decimal number. The pattern is a raw string so
# that \d is not treated as an (invalid) Python string escape.
bathrooms_count = bathrooms_numeric_text.str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')

# Rows without any number fall back to the mean of the parsed counts.
train_data['bathrooms_text'] = bathrooms_count.fillna(bathrooms_count.mean())

# Replace square brackets in column names with parentheses, e.g.
# "host_verifications_['email']" -> "host_verifications_('email')".
# NOTE(review): presumably needed because XGBoost rejects feature names
# containing '[' or ']' - confirm.
# Every bracketed column is handled, instead of a hard-coded list that raises
# KeyError when a category is absent or when the cell is run a second time.
bracketed_columns = [
    column for column in train_data.columns
    if isinstance(column, str) and ('[' in column or ']' in column)
]
safe_column_names = {
    column: column.replace('[', '(').replace(']', ')')
    for column in bracketed_columns
}
untouched_columns = [
    column for column in train_data.columns if column not in safe_column_names
]
# Renamed columns are placed after the untouched ones, in their existing
# relative order, matching the layout produced by copying each column to its
# new name and dropping the old one.
train_data = train_data.rename(columns=safe_column_names)[
    untouched_columns + list(safe_column_names.values())
]

# train_data['number_of_reviews'] += train_data['number_of_reviews_ltm'] + train_data['number_of_reviews_l30d']
# train_data = train_data.drop("number_of_reviews_ltm", axis=1)
# train_data = train_data.drop("number_of_reviews_l30d", axis=1)

# Separate the combined frame again: rows carrying the sentinel price -1 are
# the test rows (returned without the price column), all others are training rows.
is_test_row = train_data['price'] == -1
test_data = train_data.loc[is_test_row].drop(columns=['price'])
train_data = train_data.loc[~is_test_row]

## XGBoost

### CV

In [None]:
# Features and integer target for the XGBoost classifier.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=['price'])

# Hyper-parameter grid; each entry currently holds a single candidate value.
# An earlier configuration tried here: n_estimators=500, eta=0.05,
# max_depth=10, gamma=0.5 (booster='gbtree', tree_method='auto').
xgboost_hyperparam = dict(
    booster=['gbtree'],
    tree_method=['auto'],
    n_estimators=[200],
    max_depth=[10],
    learning_rate=[0.05],
    reg_lambda=[1],
    gamma=[0.5],
)

# 5-fold grid search scored by macro-averaged F1.
xgboost_model = XGBClassifier(random_state=seed)
xgboost_grid = GridSearchCV(
    estimator=xgboost_model,
    param_grid=xgboost_hyperparam,
    scoring='f1_macro',
    cv=5,
)
xgboost_grid.fit(x_train, y_train)
best_xgboost_model = xgboost_grid.best_estimator_
print(xgboost_grid.best_score_)
print(xgboost_grid.best_params_)

# Separate 5-fold estimate of plain accuracy for the selected model.
scores = cross_val_score(best_xgboost_model, x_train, y_train, scoring='accuracy', cv=5)
print(scores.mean())
# Mean accuracy recorded from a previous run: 0.5662197635936979

### Deal with Imbalanced Data

In [None]:
# # get train data for RF
# x_train = train_data.drop("price", axis=1)
# y_train = train_data['price'].astype('int64')

# # deal with imbalanced data
# smote = SMOTE(random_state=seed)
# x_train, y_train = smote.fit_resample(x_train, y_train)
# # print(Counter(y_train))

# best_xgboost_model = XGBClassifier(n_estimators=500,max_depth=10,learning_rate=0.05,reg_lambda=1,random_state=seed)
# best_xgboost_model.fit(x_train, y_train)

In [None]:
# # test
# y_pred = best_xgboost_model.predict(test_data)
# test_sub = pd.read_csv("data/test.csv")
# sub_drop_list = test_sub.columns.tolist()
# sub_drop_list.remove('id')
# sub_drop_list = pd.Index(sub_drop_list)
# test_sub = test_sub.drop(sub_drop_list, axis=1)
# test_sub['price'] = pd.DataFrame(y_pred)
# test_sub.to_csv('output.csv', index=False)