# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

## Set Random Seed

In [2]:
def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Applies ``RANDOM_SEED`` to Python's ``random`` module, NumPy's global
    generator and PyTorch (``torch.manual_seed`` plus
    ``torch.cuda.manual_seed_all``), then sets the cuDNN ``deterministic``
    flag and turns the cuDNN ``benchmark`` flag off.

    Args:
        RANDOM_SEED: integer seed passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed once up front so the cells below start from a fixed state.
set_all_seeds(666)

## Data Analysis

In [3]:
# Load the raw train / test tables.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so both go through exactly the same preprocessing
# (and therefore end up with the same dummy columns). Test rows are tagged with
# the sentinel price -1 so they can be separated again at the end of the cell.
# NOTE: the concatenated frame keeps each table's own row labels, so index
# labels repeat; everything below relies on position, not on unique labels.
test_data['price'] = -1
train_data = pd.concat([train_data, test_data], axis=0)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
train_data = train_data.drop(drop_columns, axis=1)

# Fill missing values: most frequent value for the two categorical columns,
# median for `beds`, empty string for `description`.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns_with_nans] = imputer.fit_transform(train_data[categorical_columns_with_nans])

beds_imputer = SimpleImputer(strategy='median')
# fit_transform returns an (n, 1) array; flatten it so a plain 1-D column is assigned.
train_data['beds'] = beds_imputer.fit_transform(train_data[['beds']]).ravel()

train_data['description'] = train_data['description'].fillna('')

# Replace the free-text description by its character length.
train_data['description_length'] = train_data['description'].apply(len)
train_data = train_data.drop('description', axis=1)

# One-hot encode every object column that has at most 20 distinct values.
categorical_columns = train_data.select_dtypes(include=['object']).columns
categorical_columns_to_encode = [col for col in categorical_columns if len(train_data[col].unique()) <= 20]
train_data = pd.get_dummies(train_data, columns=categorical_columns_to_encode, drop_first=True)

# Replace the amenities string by the number of comma-separated pieces in it.
# NOTE(review): this counts commas + 1, so an empty list such as "[]" counts as
# 1 and an amenity whose name contains a comma counts more than once - confirm
# against the actual format of the `amenities` column.
train_data['amenities_count'] = train_data['amenities'].apply(lambda x: len(x.split(',')))
train_data = train_data.drop('amenities', axis=1)

train_data = pd.get_dummies(train_data, columns=['property_type','neighbourhood_cleansed'], drop_first=True)

# Convert host_since to a fractional year (year + month / 12).
train_data['host_since'] = pd.to_datetime(train_data['host_since'])
train_data['host_since'] = train_data['host_since'].dt.year + train_data['host_since'].dt.month / 12
train_data['host_since'] = train_data['host_since'].astype('float64')

# Keep only the first number found in bathrooms_text. A raw string is used so
# that `\d` is a regex token rather than an invalid string escape, and
# expand=False makes str.extract return a Series instead of a one-column frame.
train_data['bathrooms_text'] = (
    train_data['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')
)
# Entries without any digit become NaN above; fill them with the column mean.
train_data['bathrooms_text'] = train_data['bathrooms_text'].fillna(train_data['bathrooms_text'].mean())

# Further removal of columns.
train_data = train_data.drop(['minimum_nights','maximum_nights','minimum_minimum_nights','maximum_minimum_nights',
                              'minimum_maximum_nights','maximum_maximum_nights','availability_60','availability_90',
                              'number_of_reviews_ltm','number_of_reviews_l30d'], axis=1)

# Split the combined frame back into test (sentinel price) and train rows.
test_data = train_data[train_data['price'] == -1].drop(columns=['price'])
train_data = train_data[train_data['price'] != -1]

## Random Forest

### CV

In [4]:
# Hyper-parameters evaluated by the grid search. There is a single combination,
# so the search amounts to a 5-fold cross-validation of one model per seed.
param_grid = {
    'n_estimators': [500],
    'max_depth': [None]
}

# Minimum mean CV accuracy a seed must reach for its submission to be kept.
# NOTE(review): the log recorded under this cell shows files being saved at
# scores around 0.571, so that output appears to predate this threshold value.
CV_SCORE_THRESHOLD = 0.5745
MAX_SUBMISSIONS = 100   # stop once this many per-seed submissions were written
OUTPUT_DIR = 'outputs'

# Loop-invariant inputs: build them once instead of once per seed.
x_train = train_data.drop("price", axis=1)
y_train = train_data['price'].astype('int64')
test_ids = pd.read_csv("data/test.csv", usecols=['id'])

# Make sure the target directory exists before any expensive work is done.
os.makedirs(OUTPUT_DIR, exist_ok=True)

c = 0  # number of submissions written so far
for seed in range(2000):
    set_all_seeds(seed)

    rf = RandomForestClassifier(random_state=seed)
    # refit=False: only the CV score is needed here, so skip the extra
    # full-data fit GridSearchCV would otherwise do after the search.
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1,
                                  scoring='accuracy', refit=False)
    grid_search_rf.fit(x_train, y_train)
    print(seed, grid_search_rf.best_score_)

    if grid_search_rf.best_score_ > CV_SCORE_THRESHOLD:
        # Oversample the training data with SMOTE before the final fit.
        # NOTE(review): the CV score above is measured without SMOTE, so it
        # does not evaluate exactly the model that is trained here.
        smote = SMOTE(random_state=seed)
        x_resampled, y_resampled = smote.fit_resample(x_train, y_train)
        # print(Counter(y_resampled))

        # Train the final model with the same hyper-parameters that were scored.
        rf = RandomForestClassifier(random_state=seed, **grid_search_rf.best_params_)
        rf.fit(x_resampled, y_resampled)

        # Predict the test set and write an (id, price) submission for this seed.
        y_pred = rf.predict(test_data)
        test_sub = test_ids.assign(price=y_pred)
        test_sub.to_csv(os.path.join(OUTPUT_DIR, 'output_' + str(seed) + '.csv'), index=False)
        print('saved...', c)
        c += 1
    if c == MAX_SUBMISSIONS:
        break

0 0.5685508456381243
1 0.5713357358636781
saved... 0
2 0.5711416030809643
saved... 1
3 0.569263008301226
4 0.5728253941363052
saved... 2
5 0.5714007962400428
saved... 3
6 0.5715305395880754
saved... 4
7 0.5705588482950533
8 0.5713358826321713
saved... 5
9 0.5741202486845349
saved... 6
10 0.5723719214260194
saved... 7
11 0.5723073432889898
saved... 8
12 0.5720482968984043
saved... 9
13 0.5723072594212792
saved... 10
14 0.5718537028432832
saved... 11
15 0.5723720052937299
saved... 12
16 0.5737966241569199
saved... 13
17 0.5694577491248405
18 0.573213869371009
saved... 14
19 0.5731491654324137
saved... 15
20 0.5704289372115999
21 0.573278489441894
saved... 16
22 0.5732785104088215
saved... 17
23 0.5710765427045995
saved... 18
24 0.5721777886433056
saved... 19
25 0.5732787829788805
saved... 20
26 0.5728900980749005
saved... 21
27 0.572307007818148
saved... 22
28 0.5721127911677237
saved... 23
29 0.5723722568968611
saved... 24
30 0.5730193591835981
saved... 25
31 0.571724525602296
saved... 

In [5]:
# Tally, for every test row, how many per-seed submissions predicted each class.
# Only the files written by the training loop are read, so stray entries in the
# directory (e.g. notebook checkpoint folders) cannot break or skew the vote.
files = sorted(
    name for name in os.listdir('outputs')
    if name.startswith('output_') and name.endswith('.csv')
)
if not files:
    raise FileNotFoundError("no 'output_*.csv' files found in 'outputs'")

# Start from the ids of the first submission; `price` is filled in later and
# one counter column per class label ('0' .. '5') starts at zero.
vote_labels = [str(label) for label in range(6)]
cnt = pd.read_csv('outputs/'+files[0])
cnt['price'] = -1
for label in vote_labels:
    cnt[label] = 0
print(cnt)

for i in files:
    out = pd.read_csv('outputs/'+i)
    predicted = out['price'].to_numpy()
    # Vectorised update: add 1 to a class column wherever this file predicted
    # that class. This replaces the chained `cnt[col][j] += 1` assignment,
    # which is not guaranteed to write back into `cnt`.
    for label in vote_labels:
        cnt[label] += (predicted == int(label))
print(cnt)

        id  price  0  1  2  3  4  5
0        0     -1  0  0  0  0  0  0
1        1     -1  0  0  0  0  0  0
2        2     -1  0  0  0  0  0  0
3        3     -1  0  0  0  0  0  0
4        4     -1  0  0  0  0  0  0
...    ...    ... .. .. .. .. .. ..
6286  6286     -1  0  0  0  0  0  0
6287  6287     -1  0  0  0  0  0  0
6288  6288     -1  0  0  0  0  0  0
6289  6289     -1  0  0  0  0  0  0
6290  6290     -1  0  0  0  0  0  0

[6291 rows x 8 columns]
        id  price    0  1   2    3   4    5
0        0     -1    0  0   2   87  11    0
1        1     -1    0  2  78   20   0    0
2        2     -1    0  0   0  100   0    0
3        3     -1    0  0   0   28  72    0
4        4     -1    0  0   3   97   0    0
...    ...    ...  ... ..  ..  ...  ..  ...
6286  6286     -1    0  0   0    0   0  100
6287  6287     -1    0  0   0    0   0  100
6288  6288     -1   97  3   0    0   0    0
6289  6289     -1  100  0   0    0   0    0
6290  6290     -1    0  0  61   39   0    0

[6291 rows x 8

In [6]:
# Majority vote: the final class of each row is the label with the most votes.
# argmax returns the first maximum, so ties go to the lowest class label.
# Assigning the whole column at once avoids the chained `cnt['price'][j] = ...`
# assignment, which is not guaranteed to write back into `cnt`.
vote_columns = ['0', '1', '2', '3', '4', '5']
cnt['price'] = cnt[vote_columns].to_numpy().argmax(axis=1)
print(cnt)

        id  price    0  1   2    3   4    5
0        0      3    0  0   2   87  11    0
1        1      2    0  2  78   20   0    0
2        2      3    0  0   0  100   0    0
3        3      4    0  0   0   28  72    0
4        4      3    0  0   3   97   0    0
...    ...    ...  ... ..  ..  ...  ..  ...
6286  6286      5    0  0   0    0   0  100
6287  6287      5    0  0   0    0   0  100
6288  6288      0   97  3   0    0   0    0
6289  6289      0  100  0   0    0   0    0
6290  6290      2    0  0  61   39   0    0

[6291 rows x 8 columns]


In [7]:
# The per-class vote counters are no longer needed; remove all six in one call
# so that only the remaining columns are left for the submission.
vote_counter_columns = [str(label) for label in range(6)]
cnt = cnt.drop(columns=vote_counter_columns)
print(cnt)

        id  price
0        0      3
1        1      2
2        2      3
3        3      4
4        4      3
...    ...    ...
6286  6286      5
6287  6287      5
6288  6288      0
6289  6289      0
6290  6290      2

[6291 rows x 2 columns]


In [8]:
# Write the ensembled predictions to the final submission file (no index column).
submission_path = 'output.csv'
cnt.to_csv(submission_path, index=False)