# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

## Set Random Seed

In [2]:
def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used by this notebook.

    Applies the same seed to Python's ``random`` module, NumPy's global
    generator and PyTorch (default generator plus all CUDA devices), then
    puts cuDNN into deterministic mode with benchmarking turned off.

    Args:
        RANDOM_SEED: Integer seed handed to each generator.
    """
    # The individual seeding calls are independent of each other, so they can
    # simply be applied one after another from a tuple.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    # cuDNN flags: deterministic kernels on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Default seed for the notebook.
set_all_seeds(666)

## Data Analysis

In [3]:
# Load the raw competition data.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so both go through exactly the same preprocessing
# and end up with the same feature columns. Test rows are tagged with the
# sentinel price -1 so they can be separated again at the end of this cell.
# NOTE(review): the imputers and the bathrooms mean below are therefore fitted
# on train + test together.
all_data = pd.concat([train_data, test_data.assign(price=-1)], axis=0)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
all_data = all_data.drop(drop_columns, axis=1)

# Deal with incomplete data: most frequent value for the categorical columns,
# median for the number of beds.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
all_data[categorical_columns_with_nans] = imputer.fit_transform(all_data[categorical_columns_with_nans])

beds_imputer = SimpleImputer(strategy='median')
# fit_transform returns an (n, 1) array; flatten it before assigning to one column.
all_data['beds'] = beds_imputer.fit_transform(all_data[['beds']]).ravel()

# Replace the free-text description by its length (missing description -> 0).
all_data['description'] = all_data['description'].fillna('')
all_data['description_length'] = all_data['description'].str.len()
all_data = all_data.drop('description', axis=1)

# Label encode host_is_superhost.
label_encoder = LabelEncoder()
all_data['host_is_superhost'] = label_encoder.fit_transform(all_data['host_is_superhost'])

# One-hot encode every remaining low-cardinality (<= 20 distinct values) text
# column. Columns that receive dedicated handling further down are excluded
# explicitly: if one of them happened to have <= 20 distinct values it would be
# consumed here and the later step would fail with a KeyError.
specially_handled_columns = {
    'amenities', 'property_type', 'neighbourhood_cleansed', 'host_since', 'bathrooms_text',
}
categorical_columns = all_data.select_dtypes(include=['object']).columns
categorical_columns_to_encode = [
    col for col in categorical_columns
    if col not in specially_handled_columns and len(all_data[col].unique()) <= 20
]
all_data = pd.get_dummies(all_data, columns=categorical_columns_to_encode, drop_first=True)

# Add feature amenities_count: number of comma-separated pieces in the string
# (number of commas + 1).
# NOTE(review): an empty list still counts as 1 and an amenity containing a
# comma is counted more than once -- confirm this is acceptable for the data.
all_data['amenities_count'] = all_data['amenities'].str.count(',') + 1
all_data = all_data.drop('amenities', axis=1)

# These two columns are always one-hot encoded, regardless of cardinality.
all_data = pd.get_dummies(all_data, columns=['property_type','neighbourhood_cleansed'], drop_first=True)

# Turn host_since into a fractional year (year + month / 12); the day is dropped.
host_since = pd.to_datetime(all_data['host_since'])
all_data['host_since'] = (host_since.dt.year + host_since.dt.month / 12).astype('float64')

# Extract the first number from bathrooms_text (e.g. "1.5 shared baths" -> 1.5).
# Entries without any digit yield NaN and are replaced by the column mean.
# The pattern is a raw string so that "\d" is not treated as a string escape,
# and expand=False makes str.extract return a Series rather than a DataFrame.
all_data['bathrooms_text'] = (
    all_data['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')
)
all_data['bathrooms_text'] = all_data['bathrooms_text'].fillna(all_data['bathrooms_text'].mean())

# Split the combined frame back into test rows (sentinel price, column removed)
# and train rows (real price kept as the target).
is_test_row = all_data['price'] == -1
test_data = all_data[is_test_row].drop(columns=['price'])
train_data = all_data[~is_test_row]

## Random Forest

### CV

In [4]:
# Hyper-parameter grid evaluated by the cross-validation below
# (currently a single combination).
param_grid = {
    'n_estimators': [500],
    'max_depth': [None]
}

# The features, the target and the submission ids are the same for every seed,
# so build them once instead of on every iteration.
x_train_full = train_data.drop("price", axis=1)
y_train_full = train_data['price'].astype('int64')
submission_ids = pd.read_csv("data/test.csv", usecols=['id'])

# Create the output directory up front; otherwise a missing folder would only
# surface in to_csv after a full cross-validation and model fit.
os.makedirs('outputs', exist_ok=True)

for seed in range(0, 1000, 10):
    set_all_seeds(seed)

    # 5-fold cross-validated accuracy on the original (non-resampled) data.
    # NOTE(review): the submitted model below is trained on SMOTE-resampled
    # data, so this score does not evaluate exactly the model that is submitted.
    rf = RandomForestClassifier(random_state=seed)
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search_rf.fit(x_train_full, y_train_full)
    print(seed, grid_search_rf.best_score_)

    # Deal with imbalanced data: oversample with SMOTE before the final fit.
    smote = SMOTE(random_state=seed)
    x_train, y_train = smote.fit_resample(x_train_full, y_train_full)

    # Final model for this seed, using the parameters selected by the grid
    # search so that it stays consistent with param_grid.
    rf = RandomForestClassifier(random_state=seed, **grid_search_rf.best_params_)
    rf.fit(x_train, y_train)

    # Predict the test set and write one submission file (id, price) per seed.
    y_pred = rf.predict(test_data)
    test_sub = submission_ids.copy()
    # Assign the prediction array directly (positional) instead of going
    # through an index-aligned DataFrame.
    test_sub['price'] = y_pred
    test_sub.to_csv('outputs/output_' + str(seed) + '.csv', index=False)

0 0.5664140012110497
10 0.5644711847320509
20 0.5653129439746921
30 0.5666082597953292
40 0.5647947463588834
50 0.5643414833178737
60 0.5656367991385108
70 0.5657013982424683
80 0.56395300808317
90 0.5699760515752873
100 0.5646006555100248
110 0.5675793849813059
120 0.5658958664960239
130 0.5662198055275531
140 0.564795144730508
150 0.567579594650582
160 0.5679681327860686
170 0.5654426244219418
180 0.5663491085701059
190 0.5665434719890234
200 0.5664790406204869
210 0.5662848659039177
220 0.5664136867071357
230 0.5672560749576048
240 0.5649892565462942
250 0.5659605704346193
260 0.5655073283605372
270 0.5652483448707347
280 0.5642765697100023
290 0.5644062291903242
300 0.5662845094661485
310 0.5665433461874578
320 0.5659605494676916
330 0.567191391985937
340 0.5666730266347075
350 0.5650539185510344
360 0.5655073493274648
370 0.565377941450274
380 0.5649890468770181
390 0.5673856505702165
400 0.5645354693320943
410 0.5674502916080291
420 0.5655075589967409
430 0.5664137286409908
440 0

In [None]:
# Quick sanity check: load the submission file written for seed 0 and print it.
cnt = pd.read_csv(filepath_or_buffer='outputs/output_0.csv')
print(cnt)
# Unfinished sketch (kept commented out): iterate over all per-seed outputs.
# for i in np.arange(0, 1000, 10):
#     out = pd.read_csv('outputs/output_' + str(i) + '.csv')