In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv(r"data\asheville.csv")
listings = df[['price', 'host_is_superhost','host_listings_count',
                      'accommodates', 'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights', 
                      'number_of_reviews', 'review_scores_rating', 'property_type']]
listings.columns

Index(['price', 'host_is_superhost', 'host_listings_count', 'accommodates',
       'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights',
       'number_of_reviews', 'review_scores_rating', 'property_type'],
      dtype='object')

In [4]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(listings, test_size=0.2, random_state=42)

listings = train_set.copy()
# does not include longitude and latitude

In [5]:
def clean_price(df):
    #converts price column to floats
    col_name = 'price'
    df = df.copy()
    df[col_name] = df[col_name].str.replace('$','').str.replace(',','')
    df[col_name] = df[col_name].astype(float)
    return df

#listings['price'] = clean_price(listings['price'])

def clean_bathrooms(df):
    #cleans the bathrooms... har har har... no actually it cleans the bathrooms_text column - DOES NOT CURRENTLY WORK
    col = 'bathrooms_text'
    df = df.copy()
    pattern = r'(\d.?\d?)\s'
    pattern2 = r'(half)'
    df[col].loc[df[col].str.contains(pattern2, case=False, na=False)] = 0.5
    df[col] = df[col].str.extract(pattern)
    df[col] = df[col].astype(float)
    return df

#listings['bathrooms'] = clean_bathrooms(listings['bathrooms_text'])
#listings.drop(columns='bathrooms_text', inplace=True)


def clean_property_type(df):
    col_name = 'property_type'
    df = df.copy()
    df[col_name] = df[col_name].str.replace(r'Entire|Tiny home', 'Entire Unit', regex=True)
    df[col_name] = df[col_name].str.replace(r'[Rr]oom', 'Single Room', regex=True)
    df[col_name] = df[col_name].str.replace(r'Camp', 'Camping', regex=True)
    df[col_name] = df[col_name].where(df[col_name].isin(['Camping', 'Single Room', 'Entire Unit']), np.nan)
    return df

#listings['property_type'] = clean_property_type(listings['property_type'])

In [6]:
def clean_airbnb(df):
    df = df.copy()
    df = clean_bathrooms(df)
    df = clean_property_type(df)
    return df

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary')
)

clean_price_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    FunctionTransformer(np.log1p)
)

mean_imputer = SimpleImputer(strategy='mean')

preprocessing = ColumnTransformer([
    ('cat', cat_pipe, make_column_selector(dtype_include=object)),
    ('num', num_pipe, make_column_selector(dtype_include=np.number))
],
remainder='drop')

In [35]:
y = listings[['price']]
X = listings.drop(columns='price')

In [36]:
y = clean_price(y)
y = mean_imputer.fit_transform(y)

In [10]:
X = clean_airbnb(X)

  df[col].loc[df[col].str.contains(pattern2, case=False, na=False)] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col].loc[df[col].str.contains(pattern2, case=False, na=False)] = 0.5


In [29]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

model = make_pipeline(
    preprocessing,
    PolynomialFeatures(degree=3),
    TransformedTargetRegressor(regressor=LinearRegression(), func=np.log1p, inverse_func=np.expm1)
)


In [30]:
model.fit(X, y)

In [31]:
model.score(X,y)

0.16312961159383088

In [None]:
from sklearn.ensemble import IsolationForest

rem_outliers = IsolationForest()

outliers_array = rem_outliers.fit_predict(data)
data = data.iloc[outliers_array == 1]

def remove_outliers(data, outlier_array):
    return data.iloc[outlier_array == 1]