In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw listings export from the working directory.
df = pd.read_csv(r"listings.csv")

# Keep the response (price) together with the candidate predictor columns.
listings = df[[
    'price',
    'host_is_superhost',
    'host_listings_count',
    'accommodates',
    'bathrooms_text',
    'beds',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'property_type',
]]

# Display the retained column names as the cell output.
listings.columns

Index(['price', 'host_is_superhost', 'host_listings_count', 'accommodates',
       'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights',
       'number_of_reviews', 'review_scores_rating', 'property_type'],
      dtype='object')

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

# Work on a copy of the training rows restricted to the modelling columns
# selected when the data was loaded. That selection does not include longitude
# and latitude. Indexing by `listings.columns` (rather than copying the whole
# training frame) keeps the earlier column selection from being silently
# discarded, and re-running this cell yields the same result.
listings = train_set[listings.columns].copy()

In [27]:
def clean_price(df):
    """Return a copy of ``df`` with the ``price`` column converted to floats.

    The column is expected to hold strings such as ``"$1,234.00"``; the dollar
    sign and thousands separators are removed before the conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``price`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched.

    Raises
    ------
    ValueError
        If a value still cannot be parsed as a float after stripping.
    """
    col_name = 'price'
    df = df.copy()
    # regex=False is required: as a regular expression '$' is the end-of-string
    # anchor, so on pandas versions where regex defaults to True the dollar
    # sign would never be removed and the float conversion would fail.
    stripped = (
        df[col_name]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    df[col_name] = stripped.astype(float)
    return df

#listings['price'] = clean_price(listings['price'])

def clean_bathrooms(df):
    """Return a copy of ``df`` with ``bathrooms_text`` converted to a float count.

    The leading number in each description is extracted (``"1.5 baths"`` ->
    ``1.5``, ``"2 shared baths"`` -> ``2.0``). Descriptions that mention
    "half" in any letter case, e.g. ``"Shared half-bath"``, carry no digit and
    are mapped to ``0.5``. Missing or unparseable entries become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``bathrooms_text`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; only ``bathrooms_text`` differs from the input.
    """
    col = 'bathrooms_text'
    df = df.copy()
    text = df[col]

    # Compute the "half" mask from the original text, before it is replaced by numbers.
    is_half = text.str.contains(r'half', case=False, na=False)

    # Integer or decimal count; the dot is escaped so it only matches a literal
    # decimal point. expand=False returns a Series rather than a one-column frame.
    counts = text.str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)

    # Overwrite only this column, and only on the half-bath rows.
    df[col] = counts.mask(is_half, 0.5)
    return df

#listings['bathrooms'] = clean_bathrooms(listings['bathrooms_text'])
#listings.drop(columns='bathrooms_text', inplace=True)


def clean_property_type(df):
    """Return a copy of ``df`` with ``property_type`` collapsed to three groups.

    Each value is classified by keyword (case-sensitive unless noted):

    * contains ``Entire`` or ``Tiny home`` -> ``'Entire Unit'``
    * contains ``Room`` or ``room``        -> ``'Single Room'``
    * contains ``Camp``                    -> ``'Camping'``

    When several keywords match, the first rule in the list above wins.
    Values matching no rule, and missing values, become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``property_type`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; only ``property_type`` differs from the input.
    """
    col_name = 'property_type'
    df = df.copy()
    raw = df[col_name]

    # Ordered from highest to lowest priority.
    rules = [
        ('Entire Unit', r'Entire|Tiny home'),
        ('Single Room', r'[Rr]oom'),
        ('Camping', r'Camp'),
    ]

    # Replace the whole value with its group label. Substituting only the
    # keyword would leave strings like "Entire Unit home" that match no label.
    grouped = pd.Series(np.nan, index=raw.index, dtype=object)
    # Apply the lowest-priority rule first so higher-priority rules overwrite it.
    for label, pattern in reversed(rules):
        matches = raw.str.contains(pattern, regex=True, na=False)
        grouped = grouped.mask(matches, label)

    df[col_name] = grouped
    return df

#listings['property_type'] = clean_property_type(listings['property_type'])

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

# Numeric features: fill gaps with the column mean, then standardise.
num_pipe = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode (binary features keep a single indicator column).
cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(drop='if_binary'))

# Price: mean-impute, then apply log(1 + x).
clean_price_pipeline = make_pipeline(SimpleImputer(strategy='mean'), FunctionTransformer(np.log1p))

# Route columns to the matching pipeline by dtype; columns of any other dtype are dropped.
X_preprocessing = ColumnTransformer(
    transformers=[
        ('cat', cat_pipe, make_column_selector(dtype_include=object)),
        ('num', num_pipe, make_column_selector(dtype_include=np.number)),
    ],
    remainder='drop',
)

*0. Remove outliers.*
1. Response variable (price): the clean_price function is applied to the price column, followed by a log transformation (clean_price_pipeline).
2. clean_bathrooms is applied to the bathrooms_text column.
3. clean_property_type is applied to the property_type column.
4. Numeric columns are imputed (mean) and scaled (num_pipe).
5. Categorical columns are imputed (most frequent) and one-hot encoded (cat_pipe).

In [None]:
from sklearn.ensemble import IsolationForest

# Seed the forest so the same rows are flagged on every run (same seed as the
# train/test split).
rem_outliers = IsolationForest(random_state=42)

# fit_predict labels inliers as 1 and outliers as -1; keep only the inliers.
# NOTE(review): `data` is not defined in any visible cell — confirm which frame
# should be bound to it before running this cell.
outliers_array = rem_outliers.fit_predict(data)
data = data.iloc[outliers_array == 1]

def remove_outliers(data, outlier_array):
    """Return the rows of ``data`` whose outlier label equals 1.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object to filter; rows are selected by position.
    outlier_array : array-like
        One label per row of ``data``, in row order. Rows labelled ``1`` are
        kept; rows with any other label (e.g. ``-1``) are dropped.

    Returns
    -------
    Same type as ``data``
        The retained rows, with their original index labels.
    """
    # Coerce to an ndarray so lists and pandas Series work too: a plain list
    # compared with 1 gives a scalar False, and iloc rejects an index-carrying
    # boolean Series as a mask.
    keep = np.asarray(outlier_array) == 1
    return data.iloc[keep]