# Kaggle

## Import

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
import random
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from category_encoders import BinaryEncoder
import ast
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.utils import resample
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from pygam import LogisticGAM, s, l, f, te
from pygam.terms import TermList
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer

## Set Random Seed

In [2]:
seed = 42


def set_all_seeds(RANDOM_SEED):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch's CPU
    and CUDA generators with the same value, then switches cuDNN to
    deterministic mode with autotuning (benchmark) disabled.

    Args:
        RANDOM_SEED: Integer seed passed to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(RANDOM_SEED)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_all_seeds(seed)

## Data Analysis

In [3]:
# Load the raw train and test splits.
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

# Concatenate train and test so every transform below is applied identically to
# both. Test rows carry price == -1 as a sentinel so they can be split off again
# at the end (assumes no real training row has price == -1 -- TODO confirm).
# `assign` avoids mutating the freshly loaded test frame, and ignore_index=True
# avoids duplicate row labels in the combined frame.
all_data = pd.concat(
    [train_data, test_data.assign(price=-1)], axis=0, ignore_index=True
)

# Initial removal of unwanted features.
drop_columns = ['id','scrape_id','last_scraped','picture_url','host_id','host_name','name']
all_data = all_data.drop(columns=drop_columns)

# Fill missing values: most frequent value for the two categorical columns,
# median for `beds`, empty string for `description`.
categorical_columns_with_nans = ['host_is_superhost', 'bathrooms_text']
imputer = SimpleImputer(strategy='most_frequent')
all_data[categorical_columns_with_nans] = imputer.fit_transform(all_data[categorical_columns_with_nans])

beds_imputer = SimpleImputer(strategy='median')
# fit_transform returns an (n, 1) array; flatten it before storing it as a column.
all_data['beds'] = beds_imputer.fit_transform(all_data[['beds']]).ravel()

# Replace the free-text description with its character length.
all_data['description_length'] = all_data['description'].fillna('').str.len()
all_data = all_data.drop(columns='description')

# Replace the amenities string with the number of comma-separated pieces.
# NOTE(review): this counts commas, so a string with no comma counts as 1 --
# confirm that matches the intended "number of amenities".
all_data['amenities_count'] = all_data['amenities'].str.split(',').str.len()
all_data = all_data.drop(columns='amenities')

# Convert host_since to a fractional year (year + month / 12).
host_since = pd.to_datetime(all_data['host_since'])
all_data['host_since'] = (host_since.dt.year + host_since.dt.month / 12).astype('float64')

# Extract the leading number from bathrooms_text; entries without a number
# are filled with the column mean. The pattern is a raw string so the
# backslashes are passed to the regex engine, not treated as string escapes.
bathrooms = all_data['bathrooms_text'].str.extract(r'(\d+\.?\d*)', expand=False).astype('float64')
all_data['bathrooms_text'] = bathrooms.fillna(bathrooms.mean())

# property_type is not used as a feature; drop it before encoding so it is not
# encoded only to be discarded.
all_data = all_data.drop(columns=['property_type'])

# Label-encode every remaining object-typed column. The encoder is refit for
# each column, so after the loop it only holds the mapping of the last one.
label_encoder = LabelEncoder()
categorical_columns = all_data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    all_data[column] = label_encoder.fit_transform(all_data[column])

# Split the combined frame back into train and test using the price sentinel.
is_test = all_data['price'] == -1
test_data = all_data[is_test].drop(columns=['price']).reset_index(drop=True)
train_data = all_data[~is_test]

## GAM

In [4]:
# get train data for GAM
X_train = train_data.drop("price", axis=1)
y_train = train_data['price'].astype('int64')
X_test = test_data

# Since PyGAM does not handle multiclass classification out of the box, the
# labels are binarized and one LogisticGAM is fit per class (one-vs-rest).
lb = LabelBinarizer()
y_train_binarized = lb.fit_transform(y_train)

# We have to fit a separate model for each class.
gam_models = []
for i in range(y_train_binarized.shape[1]):
    # One term per feature column, chosen by dtype: spline for float64/int64,
    # linear for int32. Columns of any other dtype get no term.
    # The term list is rebuilt for every model so that no term objects are
    # shared between fitted GAMs.
    terms = []
    for index, dtype in enumerate(X_train.dtypes):
        if dtype == 'int32':
            terms.append(l(index))
        elif dtype == 'float64' or dtype == 'int64':
            terms.append(s(index))
    if terms:
        terms = TermList(*terms)
    gam = LogisticGAM(terms=terms)
    gam.fit(X_train, y_train_binarized[:, i])
    gam_models.append(gam)

# Predict the probability of each class on the TRAINING set; one column per
# class, in the order of lb.classes_.
y_pred_probabilities = np.column_stack(
    [model.predict_proba(X_train) for model in gam_models]
)

# Select the class with the highest probability. argmax yields a column index,
# so map it back to the real class label before comparing with y_train.
y_pred = lb.classes_[np.argmax(y_pred_probabilities, axis=1)]

# Calculate accuracy (on the training data, so this is an optimistic estimate).
accuracy = accuracy_score(y_train, y_pred)
print(f"Accuracy: {accuracy}")

  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2


Accuracy: 0.5296289100446863


In [5]:
# Predict the probability of each class on the test set; one column per
# fitted one-vs-rest model, in the order of lb.classes_.
y_pred_probabilities = np.column_stack(
    [model.predict_proba(X_test) for model in gam_models]
)

# Select the class with the highest probability. argmax yields a column index,
# so map it back to the real class label stored in the label binarizer.
y_pred = lb.classes_[np.argmax(y_pred_probabilities, axis=1)]

In [6]:
# Build the submission file: the `id` column of the test set plus the
# predicted `price` for each row.
test_sub = pd.read_csv("data/test.csv", usecols=['id'])

# Fail loudly on a row-count mismatch instead of silently writing NaN or
# truncated predictions through index alignment.
if len(y_pred) != len(test_sub):
    raise ValueError(
        f"Got {len(y_pred)} predictions for {len(test_sub)} test rows"
    )

# Assign positionally; predictions are in the same row order as data/test.csv.
test_sub['price'] = np.asarray(y_pred)
test_sub.to_csv('output.csv', index=False)