# Kaggle

## Import

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample

## Set Random Seed

In [None]:
seed = 42


def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Python's ``random``, NumPy's global generator and PyTorch (CPU plus all
    CUDA devices) are seeded with ``RANDOM_SEED``; cuDNN is then switched to
    deterministic mode with benchmarking disabled.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_all_seeds(seed)

## Data Analysis

In [None]:
# Read the raw training and test tables from disk.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Columns removed from both tables before any feature engineering.
drop_columns = [
    'id', 'scrape_id', 'last_scraped', 'name', 'description', 'picture_url',
    'host_id', 'host_name', 'neighbourhood_cleansed', 'property_type',
    'calendar_last_scraped',
]
train_data, test_data = (
    frame.drop(columns=drop_columns) for frame in (train_data, test_data)
)

# Keep only fully populated training rows; the test table is left as it is.
train_data = train_data.dropna(axis=0, how='any')

# Encoding switch: a truthy value binary-encodes the categorical columns with
# BinaryEncoder, a falsy value uses the fixed integer codes listed below.
binary = 0

# Integer code assigned to each category when label encoding. The keys of
# this table are also the columns handled by the binary-encoding branch.
label_codes = {
    'room_type': {
        'Shared room': 0,
        'Entire home/apt': 1,
        'Private room': 2,
        'Hotel room': 3,
    },
    'neighbourhood_group_cleansed': {
        'Unincorporated Areas': 0,
        'Other Cities': 1,
        'City of Los Angeles': 2,
    },
}

if binary:
    for column in label_codes:
        binary_encoder = BinaryEncoder(cols=[column])
        # Fit on the training column only, then reuse the fitted encoder on
        # the test column so both tables end up with the same bit columns.
        encoded_train = binary_encoder.fit_transform(train_data[[column]])
        encoded_test = binary_encoder.transform(test_data[[column]])
        for new_column in encoded_train.columns:
            train_data[new_column] = encoded_train[new_column]
            test_data[new_column] = encoded_test[new_column]
        train_data = train_data.drop(columns=[column])
        test_data = test_data.drop(columns=[column])
else:
    for column, codes in label_codes.items():
        # Values missing from the table (including NaN) pass through
        # unchanged, so an unexpected category still fails loudly in astype().
        train_data[column] = (
            train_data[column].map(lambda value: codes.get(value, value)).astype('float64')
        )
        test_data[column] = (
            test_data[column].map(lambda value: codes.get(value, value)).astype('float64')
        )

# Disabled experiment: one-hot encode selected host_verifications entries.
# It must run before the columns are dropped at the end of this section.
# new_cols = ['email', 'phone', 'work_email']
# for i in new_cols:
#     train_data[i] = 0
# for i in train_data.index:
#     for item in new_cols:
#         if item in ast.literal_eval(train_data.loc[i, 'host_verifications']):
#             train_data.loc[i, item] = 1
# for i in new_cols:
#     test_data[i] = 0
# for i in test_data.index:
#     for item in new_cols:
#         if item in ast.literal_eval(test_data.loc[i, 'host_verifications']):
#             test_data.loc[i, item] = 1

# Disabled experiment: one-hot encode selected amenities (two candidate lists).
# new_cols = [
#     'Smoke alarm','Wifi','Kitchen','Carbon monoxide alarm','Essentials','Hangers','Hair dryer','Hot water',
#     'Iron','Dishes and silverware','Shampoo','Cooking basics','Refrigerator','Fire extinguisher','Free parking on premises',
#     'Microwave','Bed linens','Dedicated workspace','Heating','Air conditioning','First aid kit','Self check-in','Washer',
#     'TV','Dishwasher','Private entrance','Extra pillows and blankets','Long term stays allowed','Free street parking',
#     'Coffee maker'
# ]
# new_cols = [
#     'Wifi','Kitchen','Hot water','Free parking on premises','Air conditioning'
# ]
# for i in new_cols:
#     train_data[i] = 0
# for i in train_data.index:
#     for item in new_cols:
#         if item in ast.literal_eval(train_data.loc[i, 'amenities']):
#             train_data.loc[i, item] = 1
# for i in new_cols:
#     test_data[i] = 0
# for i in test_data.index:
#     for item in new_cols:
#         if item in ast.literal_eval(test_data.loc[i, 'amenities']):
#             test_data.loc[i, item] = 1

# With both experiments disabled, the two list-valued columns are simply
# removed from each table.
train_data = train_data.drop(columns=['host_verifications', 'amenities'])
test_data = test_data.drop(columns=['host_verifications', 'amenities'])

# Both tables get the same treatment; each frame is modified in place.
for frame in (train_data, test_data):
    # Turn host_since into a fractional year: year + month / 12.
    host_since = pd.to_datetime(frame['host_since'])
    frame['host_since'] = (host_since.dt.year + host_since.dt.month / 12).astype('float64')

    # Pull the first number out of bathrooms_text. The pattern is a raw string
    # so that "\d" is not treated as an (invalid) string escape, and
    # expand=False makes str.extract return a Series rather than a DataFrame.
    bathrooms = frame['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')

    # Entries without a number become NaN and are filled with the mean of the
    # same table's extracted values.
    frame['bathrooms_text'] = bathrooms.fillna(bathrooms.mean())

# Recode the 't'/'f' flag columns as 1/0 and store them as float64.
tran_columns = ['host_is_superhost','host_has_profile_pic','host_identity_verified','has_availability','instant_bookable']
flag_codes = {'t': 1, 'f': 0}
for frame in (train_data, test_data):
    for flag_column in tran_columns:
        # Anything other than 't'/'f' is passed through untouched.
        recoded = frame[flag_column].map(lambda raw: flag_codes.get(raw, raw))
        frame[flag_column] = recoded.astype('float64')

# Promote every remaining int64 column to float64.
train_data = train_data.astype(
    {name: 'float64' for name in train_data.select_dtypes(include=['int64']).columns}
)
test_data = test_data.astype(
    {name: 'float64' for name in test_data.select_dtypes(include=['int64']).columns}
)

# Second round of column removal, shared by both tables.
advance_drop_columns = [
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights', 'availability_60', 'availability_90',
    'host_has_profile_pic', 'has_availability',
]
train_data = train_data.drop(columns=advance_drop_columns)
test_data = test_data.drop(columns=advance_drop_columns)

# Fold the last-twelve-months and last-30-days review counts into
# number_of_reviews. One vectorized addition per table replaces the former
# row-by-row .loc loop; NaN in any operand still yields NaN, as before.
for frame in (train_data, test_data):
    frame['number_of_reviews'] = (
        frame['number_of_reviews']
        + (frame['number_of_reviews_ltm'] + frame['number_of_reviews_l30d'])
    )

recent_review_columns = ['number_of_reviews_ltm', 'number_of_reviews_l30d']
train_data = train_data.drop(columns=recent_review_columns)
test_data = test_data.drop(columns=recent_review_columns)

# Remove any training rows that still contain missing values.
train_data = train_data.dropna()
# print(train_data.isnull().sum())
# print(train_data)

## Deal with Data Imbalance in Features

In [None]:
# # draw hist
# for i in train_data.columns:
#     print(i)
#     train_data[i].hist()
#     plt.show()

# upsample the minority value of each binary feature (disabled in this cell)
# for i in ['host_is_superhost','host_is_superhost','host_identity_verified','instant_bookable']:
#     print(train_data[i].value_counts(sort=False))
#     majority, minority = 0, 0
#     if train_data[train_data[i] == 0][i].value_counts(sort=False)[0] > train_data[train_data[i] == 1][i].value_counts(sort=False)[1]:
#         majority = train_data[train_data[i] == 0]
#         minority = train_data[train_data[i] == 1]
#     else:
#         majority = train_data[train_data[i] == 1]
#         minority = train_data[train_data[i] == 0]
#     minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=seed)
#     train_data = pd.concat([majority, minority_upsampled])
#     print(train_data[i].value_counts(sort=False))

# # draw hist
# for i in train_data.columns:
#     print(i)
#     train_data[i].hist()
#     plt.show()

## NN

### Hyperparameters

In [None]:
# Optimiser settings: LR is passed to Adam as the learning rate and L2 as
# its weight decay.
LR = 1e-3
L2 = 0.001

# Training schedule.
EPOCHS = 500
batch_size = 64

# Size of the model's output layer.
num_classes = 6

# Directory the best checkpoint is written to.
CHECKPOINT_FOLDER = "./saved_model"

### Get train and val loader

In [None]:
# Separate the features from the integer class label.
x = train_data.drop("price", axis=1)
y = train_data['price'].astype('int64')

# Stratified 90/10 split. random_state pins the split to the notebook seed so
# re-running this cell reproduces the same partition.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.1, train_size=0.9, shuffle=True, stratify=y, random_state=seed
)

# Re-attach the label so resampled rows carry their price with them. assign()
# builds a new frame instead of writing into the split result.
train_frame = x_train.assign(price=y_train)

# Upsample the minority value of each binary feature, on the training split
# only. Each feature is listed once: a second pass over an already balanced
# column would only bootstrap one half of the rows again.
for column in ['host_is_superhost', 'host_identity_verified', 'instant_bookable']:
    zeros = train_frame[train_frame[column] == 0]
    ones = train_frame[train_frame[column] == 1]
    if len(zeros) > len(ones):
        majority, minority = zeros, ones
    else:
        majority, minority = ones, zeros
    if minority.empty:
        # resample() cannot draw from an empty frame; nothing to balance here.
        continue
    minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=seed)
    train_frame = pd.concat([majority, minority_upsampled])

y_train = train_frame['price'].astype('int64')
x_train = train_frame.drop(columns="price")

# Move the data to the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_train = torch.tensor(x_train.to_numpy()).float().to(device)
y_train = torch.tensor(y_train.to_numpy()).to(device)
x_val = torch.tensor(x_val.to_numpy()).float().to(device)
y_val = torch.tensor(y_val.to_numpy()).to(device)

# Min-max normalisation using statistics of the training split only.
x_train_max = torch.max(x_train, dim=0).values
x_train_min = torch.min(x_train, dim=0).values
feature_range = x_train_max - x_train_min
# A constant training column has zero range; dividing by 1 instead maps it to
# 0 rather than filling the column with NaN.
feature_range = torch.where(feature_range == 0, torch.ones_like(feature_range), feature_range)
x_train = (x_train - x_train_min) / feature_range
x_val = (x_val - x_train_min) / feature_range

# Wrap the tensors in datasets.
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)

# Batches are shuffled for training and kept in order for validation.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
# print(len(train_dataset), len(val_dataset))

### Model

In [None]:
class MultiClfModel(nn.Module):
    """Fully connected classifier with two hidden layers.

    The layout is ``num_features -> 256 -> 64 -> num_classes``, with a ReLU
    after each hidden layer. ``forward`` returns the output of the final
    linear layer directly; no softmax is applied.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        wide, narrow = 256, 64
        self.layer1 = nn.Linear(num_features, wide)
        self.layer2 = nn.Linear(wide, narrow)
        self.layer3 = nn.Linear(narrow, num_classes)
        self.activation = nn.ReLU()

    def forward(self, x):
        hidden = x
        # Both hidden layers are followed by the shared ReLU.
        for hidden_layer in (self.layer1, self.layer2):
            hidden = self.activation(hidden_layer(hidden))
        return self.layer3(hidden)

### Train

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean batch loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and gradients are
    disabled. The loss is averaged over batches; accuracy is the fraction of
    examples whose highest-scoring class equals the target.
    """
    training = optimizer is not None
    model.train(training)
    total_examples = 0
    correct_examples = 0
    running_loss = 0.0
    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            # No-op when the batch already lives on ``device``.
            inputs, targets = inputs.to(device), targets.to(device)
            if training:
                optimizer.zero_grad()
            output = model(inputs)
            loss = criterion(output, targets)
            if training:
                loss.backward()
                optimizer.step()
            correct_examples += (output.argmax(dim=1) == targets).sum().item()
            total_examples += inputs.shape[0]
            running_loss += loss.item()
    return running_loss / len(loader), correct_examples / total_examples


torch.set_default_dtype(torch.float32)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# train_data still contains the price column, hence the "- 1".
model = MultiClfModel(num_features=train_data.shape[1] - 1, num_classes=num_classes).to(device)
# CrossEntropyLoss works on raw scores, so the model's output layer needs no softmax.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR, weight_decay=L2)

best_val_acc = 0
best_val_acc_index = 0
for i in range(EPOCHS):
    # train
    avg_loss, avg_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
    print("Epoch %d: Training loss: %.4f, Training accuracy: %.4f" %(i, avg_loss, avg_acc))

    # val
    avg_loss, avg_acc = _run_epoch(model, val_loader, criterion, device)
    print("Epoch %d: Validation loss: %.4f, Validation accuracy: %.4f" %(i, avg_loss, avg_acc))

    # Save a checkpoint whenever validation accuracy improves.
    if avg_acc > best_val_acc:
        best_val_acc = avg_acc
        best_val_acc_index = i
        os.makedirs(CHECKPOINT_FOLDER, exist_ok=True)
        print("Saving ...")
        state = {'state_dict': model.state_dict()}
        torch.save(state, os.path.join(CHECKPOINT_FOLDER, 'best_model.bin'))
    print('')
print(f"Best validation accuracy: {best_val_acc:.4f}")
# The index is an integer epoch number, so it is printed without decimals.
print(f"Best validation accuracy index: {best_val_acc_index:d}")

In [None]:
# # test
# y_pred = best_rf_model.predict(test_data)
# test_sub = pd.read_csv("data/test.csv")
# sub_drop_list = test_sub.columns.tolist()
# sub_drop_list.remove('id')
# sub_drop_list = pd.Index(sub_drop_list)
# test_sub = test_sub.drop(sub_drop_list, axis=1)
# test_sub['price'] = pd.DataFrame(y_pred)
# test_sub.to_csv('output.csv', index=False)