In [1]:
# ---- Imports ----
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from xgboost import XGBRegressor, plot_importance

from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# ---- Load datasets ----
# Both CSV paths are relative, so they resolve against the kernel's current
# working directory.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='./data/test.csv')

# Column names to exclude (presumably dropped from the model inputs by later
# cells -- confirm against their usage).
# Keep one name per line with a trailing comma on every entry: Python silently
# concatenates adjacent string literals, so a missing comma merges two column
# names into one bogus entry and neither real column gets excluded.
excluded_features = [
    'price',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'host_response_rate',
    'host_response_time_encoded',
    'host_acceptance_rate',
    'description',
    'neighborhood_cleansed',
    'neighborhood_group_cleansed',
]

In [None]:
# ---- Pipeline Initialization ----

class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, current_year=2024):
        self.current_year = current_year # in case data was collected in different year
        self.response_time_mapping = {
            'within an hour': 4,
            'within a few hours': 3,
            'within a day': 2,
            'a few days or more': 1
        }
        