In [249]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw Asheville listings export. A forward-slash relative path is used
# because a backslash path only resolves on Windows.
df = pd.read_csv("data/asheville.csv")

# Keep only the columns used for the price model. An explicit copy makes
# `listings` independent of `df`, so later edits to it never touch the raw frame.
listings = df[['price', 'host_is_superhost','host_listings_count',
                      'accommodates', 'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights', 
                      'number_of_reviews', 'review_scores_rating', 'property_type']].copy()
listings.columns

Index(['price', 'host_is_superhost', 'host_listings_count', 'accommodates',
       'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights',
       'number_of_reviews', 'review_scores_rating', 'property_type'],
      dtype='object')

In [250]:
# Show every column name available in the raw listings frame.
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [290]:
# Count how often each distinct reviews_per_month value occurs in the raw frame.
df['reviews_per_month'].value_counts()

reviews_per_month
1.00    33
0.15    19
0.23    17
2.00    17
0.26    15
        ..
4.16     1
9.14     1
6.63     1
8.19     1
7.00     1
Name: count, Length: 728, dtype: int64

`host_since`, `host_response_time`, `host_response_rate`, `host_acceptance_rate`, `neighbourhood_cleansed` (this is the zip code), `latitude`, `longitude`, `room_type`, `minimum_nights`, `maximum_nights`, `minimum_minimum_nights`, `maximum_minimum_nights`, `minimum_maximum_nights`, `maximum_maximum_nights`, `minimum_nights_avg_ntm`, `maximum_nights_avg_ntm`, `availability_30`, `availability_60`, `availability_90`, `availability_365`, `number_of_reviews_ltm`, `number_of_reviews_l30d`, `reviews_per_month`

In [131]:
from sklearn.model_selection import train_test_split

# Split from the full frame (restricted to the modelling columns) rather than from
# `listings` itself: `listings` is reassigned below, so splitting it directly would
# shrink the training set a little more every time this cell is re-run.
train_set, test_set = train_test_split(
    df[list(listings.columns)], test_size=0.2, random_state=42
)

# Work on the training portion only from here on.
# Note: the selected columns do not include longitude and latitude.
listings = train_set.copy()

In [218]:
def clean_price(df, col_name='price'):
    """Return a copy of ``df`` with the price column converted to floats.

    Currency strings such as ``"$1,234.00"`` have the dollar sign and the
    thousands separators stripped before conversion. Missing values stay
    missing (NaN). The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col_name``.
    col_name : str, default 'price'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``col_name`` column has float dtype.
    """
    df = df.copy()
    prices = df[col_name]
    # Re-running a notebook cell can hand us an already-numeric column; the
    # ``.str`` accessor would raise on it, so only strip text when needed.
    if not pd.api.types.is_numeric_dtype(prices):
        # regex=False: '$' must be matched literally. Interpreted as a regex it
        # is the end-of-string anchor and the symbol would be left in place.
        prices = (
            prices.str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
        )
    df[col_name] = prices.astype(float)
    return df


def clean_bathrooms(df):
    """Return a copy of ``df`` with ``bathrooms_text`` parsed into a float count.

    Any entry mentioning "half" (case-insensitive, e.g. "Half-bath") becomes
    0.5. Otherwise the first number in the text is used, so "1 bath" -> 1.0,
    "1.5 shared baths" -> 1.5 and "12.5 baths" -> 12.5. Entries with no number
    and missing entries become NaN. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``bathrooms_text`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``bathrooms_text`` column has float dtype.
    """
    col = 'bathrooms_text'
    df = df.copy()

    # An already-parsed (numeric) column has no ``.str`` accessor; pass it
    # through so the function can be applied more than once.
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].astype(float)
        return df

    # One or more digits with an optional, literal decimal part.
    number_pattern = r'(\d+(?:\.\d+)?)'

    is_half = df[col].str.contains(r'half', case=False, na=False)
    df.loc[is_half, col] = '0.5'

    # expand=False yields a Series (not a one-column DataFrame) for one group.
    df[col] = df[col].str.extract(number_pattern, expand=False).astype(float)
    return df


def clean_property_type(df):
    """Collapse the free-text ``property_type`` column into five coarse labels.

    Rules are applied in order to a copy of ``df``; each rule overwrites the
    rows whose current value matches its pattern (case-insensitive). Anything
    that ends up outside the known labels, including missing values, is set
    to 'Other'.
    """
    col_name = 'property_type'
    # (pattern, replacement label), applied top to bottom.
    rules = (
        (r'Entire|Tiny home', 'Entire Unit'),
        (r'Shared', 'Shared Space'),
        (r'[Rr]oom', 'Private Room'),
        (r'Camp', 'Camping Space'),
    )

    result = df.copy()
    for regex, label in rules:
        hits = result[col_name].str.contains(regex, case=False, na=False)
        result.loc[hits, col_name] = label

    known_labels = [label for _, label in rules]
    unmatched = ~result[col_name].isin(known_labels)
    result.loc[unmatched, col_name] = 'Other'
    return result



In [237]:
def clean_airbnb(df):
    """Apply the feature-cleaning helpers to a copy of ``df`` and return it.

    The bathrooms column is cleaned first, then the property type column.
    """
    return df.copy().pipe(clean_bathrooms).pipe(clean_property_type)

In [238]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

# Numeric features: fill gaps with the column mean, then standardise.
num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode (a two-level feature becomes a single 0/1 column).
# handle_unknown='ignore' keeps transform() from raising when held-out data
# contains a category that was absent from the training split.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary', handle_unknown='ignore')
)

# Mean-impute, then log1p-transform.
clean_price_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    FunctionTransformer(np.log1p)
)

# Stand-alone mean imputer.
mean_imputer = SimpleImputer(strategy='mean')

# Route columns by dtype: object columns go through cat_pipe, numeric columns
# through num_pipe; anything else is dropped.
preprocessing = ColumnTransformer([
    ('cat', cat_pipe, make_column_selector(dtype_include=object)),
    ('num', num_pipe, make_column_selector(dtype_include=np.number))
],
remainder='drop')

In [245]:
# Separate the predictors from the target; the target stays a one-column DataFrame.
X = listings.drop('price', axis=1)
y = listings.loc[:, ['price']]

In [240]:
# Build the target straight from `listings` so this cell can be re-run safely:
# after imputation `y` is no longer a DataFrame with a 'price' column, so passing
# it through clean_price a second time would fail.
y = clean_price(listings[['price']])
# Replace missing prices with the mean price.
y = mean_imputer.fit_transform(y)

In [246]:
# Derive the cleaned features from `listings` rather than from `X` itself, so
# re-running this cell never feeds already-cleaned columns back through the
# text-parsing helpers.
X = clean_airbnb(listings.drop(columns='price'))

In [242]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Preprocess the features, then fit a linear regression on log1p(target);
# expm1 is supplied as the inverse to map predictions back to the original scale.
model = make_pipeline(
    preprocessing,
    TransformedTargetRegressor(
        inverse_func=np.expm1,
        func=np.log1p,
        regressor=LinearRegression(),
    ),
)


In [243]:
# Fit the preprocessing steps and the regressor on the training features and prices.
model.fit(X, y)

In [244]:
# Score on the same X and y the model was fitted on, so this is a training-set
# score (R^2 for scikit-learn regressors), not a held-out estimate.
model.score(X,y)

0.12679507314543037

In [None]:
from sklearn.ensemble import IsolationForest

# NOTE(review): `data` is not defined in any cell shown here; assign the frame
# to be filtered to `data` before running this cell.
# A fixed seed makes the set of rows flagged as outliers the same on every run.
rem_outliers = IsolationForest(random_state=42)

# fit_predict labels each row 1 (inlier) or -1 (outlier).
outliers_array = rem_outliers.fit_predict(data)
# Keep only the rows labelled as inliers.
data = data.iloc[outliers_array == 1]

def remove_outliers(data, outlier_array):
    """Return the rows of ``data`` whose outlier label equals 1.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Rows to filter; selected by position.
    outlier_array : array-like of int
        One label per row of ``data``, in row order. Rows labelled 1 are kept;
        rows with any other label (e.g. -1) are removed. May be a NumPy array,
        a list or a pandas Series.

    Returns
    -------
    Same type as ``data``
        The subset of rows labelled 1.
    """
    # Coerce first: comparing a plain list to 1 yields a single bool, and a
    # boolean Series is not accepted by ``.iloc``.
    keep = np.asarray(outlier_array) == 1
    return data.iloc[keep]