In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw Austin listings export. A forward slash is used because it is
# accepted as a path separator on Windows, macOS and Linux, whereas a backslash
# is only a separator on Windows.
df = pd.read_csv("data/austin.csv")

In [18]:
# Columns kept for the analysis: the target (`price`) plus the candidate features.
selected_columns = [
    'price',
    'host_is_superhost',
    'host_listings_count',
    'accommodates',
    'bathrooms_text',
    'beds',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'property_type',
]
listings = df[selected_columns]

# Display the retained column names as the cell output.
listings.columns

Index(['price', 'host_is_superhost', 'host_listings_count', 'accommodates',
       'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights',
       'number_of_reviews', 'review_scores_rating', 'property_type'],
      dtype='object')

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible between runs.
train_set, test_set = train_test_split(
    listings,
    test_size=0.2,
    random_state=42,
)

# From here on `listings` refers to a copy of the training rows only.
# Note: the selected columns do not include longitude and latitude.
listings = train_set.copy()

In [21]:
def clean_price(df):
    """Return a copy of ``df`` whose ``price`` column holds floats.

    Currency-formatted strings such as ``"$1,234.50"`` have the dollar sign
    and thousands separators stripped before conversion. Missing values come
    out as NaN. If the column is already numeric it is only cast to float, so
    the function can safely be applied more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``price`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``price`` converted to float.

    Raises
    ------
    ValueError
        If a price still is not a valid number after stripping ``$`` and ``,``.
    """
    col_name = 'price'
    df = df.copy()

    # Already numeric (e.g. the function was applied before): nothing to strip.
    if pd.api.types.is_numeric_dtype(df[col_name]):
        df[col_name] = df[col_name].astype(float)
        return df

    # regex=False makes '$' a literal character; as a regular expression it
    # would be the end-of-string anchor, and the default for `regex` has
    # changed between pandas versions.
    stripped = (
        df[col_name]
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
    )
    df[col_name] = stripped.astype(float)
    return df


def clean_bathrooms(df):
    """Return a copy of ``df`` with ``bathrooms_text`` converted to a float count.

    The first number found in the text is used as the bathroom count
    (``"1.5 baths"`` -> 1.5, ``"10.5 baths"`` -> 10.5, ``"3"`` -> 3.0).
    Any entry mentioning "half" (case-insensitive), such as
    ``"Shared half-bath"``, is mapped to 0.5. Entries with no number and no
    "half", as well as missing values, become NaN. If the column is already
    numeric it is only cast to float, so the function can safely be applied
    more than once.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``bathrooms_text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``bathrooms_text`` as floats.
    """
    col = 'bathrooms_text'
    df = df.copy()

    # Already numeric (e.g. the function was applied before): nothing to parse.
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].astype(float)
        return df

    text = df[col]
    mentions_half = text.str.contains(r'half', case=False, na=False)

    # One or more digits with an optional decimal part. The dot is escaped and
    # the integer part is not limited in width, so multi-digit values such as
    # "10.5" are captured whole rather than truncated to "0.5".
    number_pattern = r'(\d+(?:\.\d+)?)'
    counts = text.str.extract(number_pattern, expand=False).astype(float)

    # "half-bath" descriptions carry no digit; they count as half a bathroom.
    counts.loc[mentions_half] = 0.5

    df[col] = counts
    return df



def clean_property_type(df):
    """Return a copy of ``df`` with ``property_type`` collapsed into coarse labels.

    Rules are applied in order, each matching case-insensitively against the
    current value of the column; rows that end up with none of the rule
    labels (including missing values) are labelled ``'Other'``.
    """
    col_name = 'property_type'

    # (regular expression, replacement label), applied top to bottom.
    relabel_rules = (
        (r'Entire|Tiny home', 'Entire Unit'),
        (r'Shared', 'Shared Space'),
        (r'[Rr]oom', 'Private Room'),
        (r'Camp', 'Camping Space'),
    )

    result = df.copy()
    for regex, label in relabel_rules:
        hits = result[col_name].str.contains(regex, case=False, na=False)
        result.loc[hits, col_name] = label

    # Anything that did not receive one of the labels above falls into 'Other'.
    recognised = [label for _, label in relabel_rules]
    unrecognised = ~result[col_name].isin(recognised)
    result.loc[unrecognised, col_name] = 'Other'
    return result



In [22]:
def clean_airbnb(df):
    """Run the feature cleaners on a copy of ``df`` and return the result.

    Applies ``clean_bathrooms`` and then ``clean_property_type``; the input
    frame itself is not modified.
    """
    return (
        df.copy()
        .pipe(clean_bathrooms)
        .pipe(clean_property_type)
    )

In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

# Numeric features: fill missing values with the column mean, then standardise.
num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode (binary columns collapse to a single indicator).
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising, so held-out rows containing a new label can
# still be transformed. Combining it with `drop` needs scikit-learn 1.0+.
cat_pipe = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary', handle_unknown='ignore')
)

# Mean-impute then log1p-transform; not referenced by the other visible cells.
clean_price_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    FunctionTransformer(np.log1p)
)

# Stand-alone mean imputer (used on the target in a later cell).
mean_imputer = SimpleImputer(strategy='mean')

# Route columns by dtype: object columns go through the categorical pipeline,
# numeric columns through the numeric one; anything else is dropped.
preprocessing = ColumnTransformer([
    ('cat', cat_pipe, make_column_selector(dtype_include=object)),
    ('num', num_pipe, make_column_selector(dtype_include=np.number))
],
remainder='drop')

In [24]:
# Separate the training frame into features (X) and a one-column target frame (y).
X = listings.drop('price', axis=1)
y = listings.loc[:, ['price']]

In [25]:
# Build the target directly from `listings` rather than from the current `y`.
# The imputer rebinds `y` to a NumPy array, so a cell that started from `y`
# would fail on a second run (an array has no 'price' column to clean).
y = clean_price(listings[['price']])
# Fill missing prices with the mean price; the result is a 2-D array of shape (n, 1).
y = mean_imputer.fit_transform(y)

In [26]:
# Build the cleaned features directly from `listings` rather than from the
# current `X`, so re-running this cell never feeds already-cleaned columns
# back into the text-based cleaners.
X = clean_airbnb(listings.drop(columns='price'))

In [29]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Linear regression fitted on log1p(target); predictions are mapped back with expm1.
log_target_regressor = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log1p,
    inverse_func=np.expm1,
)

# Full model: feature preprocessing followed by the log-target regressor.
model = make_pipeline(preprocessing, log_target_regressor)

In [32]:
# Feature preprocessing followed by an ordinary linear regression on the
# untransformed target. This rebinds `model`, replacing any earlier definition.
linear_regressor = LinearRegression()
model = make_pipeline(preprocessing, linear_regressor)

In [33]:
# Fit the preprocessing + regression pipeline on the training features and target.
model.fit(X, y)

In [34]:
# Score the fitted pipeline on the same X and y it was trained on
# (an in-sample figure, not a held-out estimate).
model.score(X,y)

0.039834774185391386