# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

## Set Random Seed

In [2]:
def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU generator and all CUDA devices), and switches cuDNN to
    deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        RANDOM_SEED: integer seed passed unchanged to every generator.
    """
    # cuDNN flags: deterministic kernels on, benchmark auto-tuning off.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    # Apply the same seed to each library's global generator.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(RANDOM_SEED)


set_all_seeds(666)

## Data Analysis

In [3]:
# Load the raw train / test CSVs.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so both go through exactly the same preprocessing
# (and end up with the same dummy columns). Test rows are tagged with the
# sentinel price -1 so they can be separated again at the end of this cell.
# NOTE(review): every statistic fitted below (most-frequent / median / mean)
# is therefore computed over train + test rows together.
test_data['price'] = -1
train_data = pd.concat([train_data, test_data], axis=0)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
train_data = train_data.drop(drop_columns, axis=1)

# Fill missing categorical values with the most frequent value of each column.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns_with_nans] = imputer.fit_transform(train_data[categorical_columns_with_nans])

# Fill missing bed counts with the median. fit_transform returns an (n, 1)
# array, so flatten it before assigning it to a single column.
beds_imputer = SimpleImputer(strategy='median')
train_data['beds'] = beds_imputer.fit_transform(train_data[['beds']]).ravel()

train_data['description'] = train_data['description'].fillna('')

# Replace the free-text description by its length in characters.
train_data['description_length'] = train_data['description'].apply(len)
train_data = train_data.drop('description', axis=1)

# Label-encode host_is_superhost into integer codes.
label_encoder = LabelEncoder()
train_data['host_is_superhost'] = label_encoder.fit_transform(train_data['host_is_superhost'])

# One-hot encode the low-cardinality (<= 20 distinct values) object columns.
# Columns that receive dedicated handling further down are excluded here:
# if one of them happened to have <= 20 distinct values it would be turned
# into dummies and the later by-name access would raise a KeyError.
columns_handled_below = {
    'amenities', 'property_type', 'neighbourhood_cleansed',
    'host_since', 'bathrooms_text',
}
categorical_columns = train_data.select_dtypes(include=['object']).columns
categorical_columns_to_encode = [
    col for col in categorical_columns
    if col not in columns_handled_below and len(train_data[col].unique()) <= 20
]
train_data = pd.get_dummies(train_data, columns=categorical_columns_to_encode, drop_first=True)

# Replace amenities by the number of comma-separated items in its text.
# NOTE(review): this counts commas in the raw string, so an empty list still
# counts as 1 and a comma inside an amenity name is counted as a separator.
train_data['amenities_count'] = train_data['amenities'].apply(lambda x: len(x.split(',')))
train_data = train_data.drop('amenities', axis=1)

train_data = pd.get_dummies(train_data, columns=['property_type','neighbourhood_cleansed'], drop_first=True)

# Convert host_since to a fractional year (year + month / 12).
train_data['host_since'] = pd.to_datetime(train_data['host_since'])
train_data['host_since'] = train_data['host_since'].dt.year + train_data['host_since'].dt.month / 12
train_data['host_since'] = train_data['host_since'].astype('float64')

# Extract the first number found in bathrooms_text. A raw string keeps the
# regex escapes intact and expand=False returns a Series rather than a
# one-column DataFrame. Entries without any digit (presumably values such as
# "Half-bath" -- verify against the data) become NaN and are filled with the
# column mean.
train_data['bathrooms_text'] = (
    train_data['bathrooms_text']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype('float64')
)
train_data['bathrooms_text'] = train_data['bathrooms_text'].fillna(train_data['bathrooms_text'].mean())

# Split back into test rows (sentinel price -1) and train rows.
test_data = train_data[train_data['price'] == -1].drop(columns=['price'])
train_data = train_data[train_data['price'] != -1]

## Random Forest

### CV

In [None]:
# Hyper-parameter grid explored by GridSearchCV (currently one combination).
param_grid = {
    'n_estimators': [500],
    'max_depth': [None]
}

N_SEEDS = 2000              # number of candidate seeds to try
CV_SCORE_THRESHOLD = 0.567  # minimum 5-fold CV accuracy required to keep a seed
MAX_SUBMISSIONS = 100       # stop once this many submissions have been saved

# Loop-invariant inputs: build them once instead of on every iteration.
x_train = train_data.drop("price", axis=1)
y_train = train_data['price'].astype('int64')
submission_ids = pd.read_csv("data/test.csv", usecols=['id'])

# DataFrame.to_csv does not create missing directories.
os.makedirs('outputs', exist_ok=True)

c = 0  # number of submissions saved so far
for seed in range(N_SEEDS):
    set_all_seeds(seed)

    # 5-fold cross-validated accuracy of a random forest using this seed.
    rf = RandomForestClassifier(random_state=seed)
    grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search_rf.fit(x_train, y_train)
    print(seed, grid_search_rf.best_score_)

    if grid_search_rf.best_score_ > CV_SCORE_THRESHOLD:
        # Oversample the minority classes before fitting the final model.
        # NOTE(review): the CV score above is measured WITHOUT SMOTE, while
        # the submitted model is trained on SMOTE-resampled data.
        smote = SMOTE(random_state=seed)
        x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

        # Refit on the resampled data with the parameters selected by the
        # grid search, so the final model follows any change to param_grid.
        rf = RandomForestClassifier(random_state=seed, **grid_search_rf.best_params_)
        rf.fit(x_resampled, y_resampled)

        # Predict the test set and save one submission file per kept seed.
        y_pred = rf.predict(test_data)
        test_sub = submission_ids.copy()
        test_sub['price'] = y_pred
        test_sub.to_csv('outputs/output_' + str(seed) + '.csv', index=False)
        print('saved...', c)
        c += 1
    if c == MAX_SUBMISSIONS:
        break

0 0.5631760366468348
1 0.5633705887681006
2 0.5636940874941502
3 0.5638884509130677
4 0.5638883670453574
5 0.5619448586577477
6 0.5635643022122625


In [None]:
# Tally, for every test row, how many of the saved per-seed submissions
# predicted each price class. `cnt` ends up with the columns of a submission
# file (price reset to -1) plus one vote-count column per class '0'..'5'.
OUTPUT_DIR = 'outputs'
class_columns = ['0', '1', '2', '3', '4', '5']

# os.listdir returns bare file names in arbitrary order: keep only the CSV
# files, sort them for reproducibility and prefix the directory so that
# read_csv can actually find them.
files = sorted(
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in os.listdir(OUTPUT_DIR)
    if file_name.endswith('.csv')
)
if not files:
    raise FileNotFoundError("no submission CSV files found in '" + OUTPUT_DIR + "'")

cnt = pd.read_csv(files[0])
cnt['price'] = -1
for col in class_columns:
    cnt[col] = 0
print(cnt)

for i in files:
    out = pd.read_csv(i)
    votes = out['price'].astype(str)

    # Fail loudly on an unexpected label rather than silently dropping votes.
    unknown_labels = set(votes) - set(class_columns)
    if unknown_labels:
        raise KeyError('unexpected price labels in ' + i + ': ' + str(sorted(unknown_labels)))

    # Vectorised, positional update: avoids chained indexing
    # (cnt[col][j] += 1), which does not reliably write back to the frame.
    for col in class_columns:
        cnt[col] += (votes == col).to_numpy().astype('int64')
print(cnt)

In [None]:
# Majority vote: the predicted price class of each row is the class with the
# most votes (ties resolve to the lowest class, as argmax returns the first
# maximum). Assigning the whole column at once avoids chained indexing
# (cnt['price'][j] = ...), which does not reliably write back to the frame.
cnt['price'] = cnt[['0', '1', '2', '3', '4', '5']].to_numpy().argmax(axis=1)
print(cnt)

In [None]:
# Remove the per-class vote-count columns; only the submission columns remain.
vote_columns = [str(label) for label in range(6)]
cnt = cnt.drop(columns=vote_columns)
print(cnt)

In [None]:
# Write the ensembled submission without the DataFrame index column.
cnt.to_csv(path_or_buf='output.csv', index=False)