# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

## Set Random Seed

In [2]:
seed = 666


def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used by this notebook.

    Calls, in order, ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all`` with the same
    value, then sets ``torch.backends.cudnn.deterministic`` to ``True`` and
    ``torch.backends.cudnn.benchmark`` to ``False``.

    Args:
        RANDOM_SEED: Seed value passed unchanged to each seeding call.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    # Same two cuDNN flags as before, set through a short alias.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed once at notebook start so later cells see a fixed RNG state.
set_all_seeds(seed)

## Data Analysis

In [3]:
# Load the raw train and test splits.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so both go through identical preprocessing.
# Test rows are tagged with the sentinel price -1 so they can be separated
# again at the end of this cell.
# NOTE(review): the concat keeps each frame's own row labels, so labels repeat
# across the two parts; everything below relies on positional / boolean-mask
# operations, not on label lookups.
# NOTE(review): every imputer, mean and encoder below is fit on the combined
# train+test frame.
test_data['price'] = -1
train_data = pd.concat([train_data, test_data], axis=0)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
train_data = train_data.drop(drop_columns, axis=1)

# Fill missing categorical values with the most frequent value of the column.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_columns_with_nans] = imputer.fit_transform(train_data[categorical_columns_with_nans])

# Fill missing bed counts with the column median.
beds_imputer = SimpleImputer(strategy='median')
train_data['beds'] = beds_imputer.fit_transform(train_data[['beds']])

# A missing description is treated as an empty string (length 0 below).
train_data['description'] = train_data['description'].fillna('')

# Replace the free-text description with its character length.
train_data['description_length'] = train_data['description'].apply(len)
train_data = train_data.drop('description', axis=1)

# Replace the amenities string with the number of comma-separated pieces.
# NOTE(review): this counts commas in the raw string, so an empty list such as
# "[]" would count as 1 and an amenity name containing a comma would count
# more than once — presumably acceptable as a rough feature; verify on data.
train_data['amenities_count'] = train_data['amenities'].apply(lambda x: len(x.split(',')))
train_data = train_data.drop('amenities', axis=1)

# Convert host_since to a fractional year (year + month / 12); the day of the
# month is discarded.
train_data['host_since'] = pd.to_datetime(train_data['host_since'])
train_data['host_since'] = train_data['host_since'].dt.year + train_data['host_since'].dt.month / 12
train_data['host_since'] = train_data['host_since'].astype('float64')

# Extract the first number (integer or decimal) from bathrooms_text.
# The pattern is a raw string so that \d and \. reach the regex engine without
# triggering Python's invalid-escape-sequence warning; expand=False returns a
# Series, which is what a single-column assignment expects.
train_data['bathrooms_text'] = (
    train_data['bathrooms_text']
    .str.extract(r'(\d+\.?\d*)', expand=False)
    .astype('float64')
)
# Entries without any digits come out as NaN; fill them with the column mean.
train_data['bathrooms_text'] = train_data['bathrooms_text'].fillna(train_data['bathrooms_text'].mean())

# Integer-encode every remaining object-dtype column.
# The single encoder is refit for each column, so after the loop it only holds
# the mapping of the last column processed.
label_encoder = LabelEncoder()
categorical_columns = train_data.select_dtypes(include=['object']).columns
for i in categorical_columns:
    train_data[i] = label_encoder.fit_transform(train_data[i])

# property_type is removed from the feature set.
train_data = train_data.drop(['property_type'], axis=1)

# Split the combined frame back into test (sentinel price) and train rows.
test_data = train_data[train_data['price'] == -1].drop(columns=['price'])
train_data = train_data[train_data['price'] != -1]

## SVM

In [4]:
# Target and features for the SVM; price is cast to an integer class label.
y_train = train_data['price'].astype('int64')
x_train = train_data.drop(columns=['price'])

# Standardize the features, then classify with an SVM.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC(random_state=seed)),
])

# Hyper-parameter grid, keyed by "<pipeline step>__<parameter>".
# It currently holds a single candidate combination.
param_grid = dict(
    svm__C=[1],
    svm__kernel=['rbf'],
    svm__gamma=['auto'],
)

# 5-fold cross-validated search scored by accuracy.
grid_search_svm = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5)
grid_search_svm.fit(x_train, y_train)
best_svm_model = grid_search_svm.best_estimator_

print(grid_search_svm.best_params_, grid_search_svm.best_score_, sep='\n')
# Value noted by the author, presumably best_score_ from an earlier run:
# 0.5185540326111204

        host_since  host_is_superhost  host_listings_count  \
0      2018.583333                  0                  5.0   
1      2013.583333                  1                  3.0   
2      2019.500000                  0                118.0   
3      2021.916667                  0                  5.0   
4      2023.166667                  0               1020.0   
...            ...                ...                  ...   
15436  2020.750000                  0                 23.0   
15437  2018.583333                  0                 27.0   
15438  2015.833333                  1                  3.0   
15439  2019.333333                  0                 54.0   
15440  2015.750000                  0                  1.0   

       host_total_listings_count  host_verifications  host_has_profile_pic  \
0                            7.0                   1                     1   
1                            3.0                   1                     1   
2                    

In [5]:
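# # test (disabled): predicts on test_data and writes the submission to output.csv
# # NOTE(review): `best_rf_model` is not defined in the cells above (only
# # `best_svm_model` is) — confirm which model is intended before re-enabling.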
# y_pred = best_rf_model.predict(test_data)
# test_sub = pd.read_csv("data/test.csv")
# sub_drop_list = test_sub.columns.tolist()
# sub_drop_list.remove('id')
# sub_drop_list = pd.Index(sub_drop_list)
# test_sub = test_sub.drop(sub_drop_list, axis=1)
# test_sub['price'] = pd.DataFrame(y_pred)
# test_sub.to_csv('output.csv', index=False)