In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv(r"listings.csv")
df = df[['price', 'host_is_superhost','host_listings_count',
                      'accommodates', 'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights', 
                      'number_of_reviews', 'review_scores_rating', 'property_type']]
df.columns

Index(['price', 'host_is_superhost', 'host_listings_count', 'accommodates',
       'bathrooms_text', 'beds', 'minimum_nights', 'maximum_nights',
       'number_of_reviews', 'review_scores_rating', 'property_type'],
      dtype='object')

In [27]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

listings = train_set.copy()
# does not include longitude and latitude

In [44]:
def clean_price(price_column):
    #cleans and converts price to strings  
    price_column = price_column.copy()
    price_column = price_column.str.strip('$').str.replace(',', '').reset_index()['price']
    price_column = price_column.str[:-3]
    price_column = pd.to_numeric(price_column, errors='raise')
    return price_column

#listings['price'] = clean_price(listings['price'])

def clean_bathrooms(bathrooms_text):
    #cleans the bathrooms... har har har... no actually it cleans the bathrooms_text column
    bathrooms_text = bathrooms_text.copy()
    pattern = r'(\d.?\d?)\s'
    pattern2 = r'(Half)'
    bathrooms_text.loc[bathrooms_text.str.contains(pattern2, na=False)] = .5
    bathrooms_text = bathrooms_text.str.extract(pattern)
    return bathrooms_text.astype(float)

#listings['bathrooms'] = clean_bathrooms(listings['bathrooms_text'])
#listings.drop(columns='bathrooms_text', inplace=True)


def clean_property_type(property_type_col):
    #consolidate the property_type var to a few common categories 
    if isinstance(property_type_col, pd.DataFrame):
        property_type_col = property_type_col.squeeze() 
     
    property_type_col = property_type_col.copy()
    property_type_col.loc[property_type_col.str.contains(r'Entire')] = 'Entire Unit'
    property_type_col.loc[property_type_col.str.contains(r'Tiny home')] = 'Entire Unit'
    property_type_col.loc[property_type_col.str.contains(r'[Rr]oom')] = 'Single Room'
    property_type_col.loc[property_type_col.str.contains(r'Camp')] = 'Camping'

    property_type_col.loc[~property_type_col.isin(['Camping', 'Single Room', 'Entire Unit'])] = np.nan
    return pd.DataFrame(property_type_col, columns=['property_type'])

#listings['property_type'] = clean_property_type(listings['property_type'])

In [None]:
#listings = listings[(listings['price'] < listings['price'].quantile(.99)) & (listings['price'] > listings['price'].quantile(.01))]

In [46]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

num_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

cat_pipe = make_pipeline(   # host_is_superhost
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary')
)

bathrooms_pipeline = make_pipeline(
    FunctionTransformer(func=clean_bathrooms),
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

property_type_pipeline = make_pipeline(
    FunctionTransformer(func=clean_property_type),
    SimpleImputer(strategy='most_frequent')
)

clean_response_pipeline = make_pipeline(
    FunctionTransformer(func=clean_price),
    SimpleImputer(strategy='mean'),
    FunctionTransformer(np.log),
    StandardScaler()
)

preprocessing = ColumnTransformer([
    ('clean_response', clean_response_pipeline, ['price']),
    ('clean_bathrooms', bathrooms_pipeline, ['bathrooms_text']),
    ('clean_property_type', property_type_pipeline, ['property_type']),
    ('cat', cat_pipe, make_column_selector(dtype_include=object)),
],
remainder=num_pipe)

In [48]:
X = listings.drop(columns=['price']).copy
preprocessing.fit_transform(X)

ValueError: Expected 2D array, got scalar array instead:
array=<bound method NDFrame.copy of      host_is_superhost  host_listings_count  accommodates  bathrooms_text  \
582                  f                 1645             8         2 baths   
1951                 f                    3             4          1 bath   
3291                 t                    8             3          1 bath   
679                  t                    1             5          1 bath   
56                   t                    2             2  1 private bath   
...                ...                  ...           ...             ...   
1095                 t                    1             2          1 bath   
1130                 t                    1             8         2 baths   
1294                 t                    3             2          1 bath   
860                  t                    1             2          1 bath   
3174                 f                    1             4         2 baths   

      beds  minimum_nights  maximum_nights  number_of_reviews  \
582    5.0               2            1125                 41   
1951   2.0               2            1125                  4   
3291   2.0               1             365                  5   
679    3.0               2              21                401   
56     1.0               1            1125               1130   
...    ...             ...             ...                ...   
1095   2.0               2              28                150   
1130   4.0               2              90                155   
1294   2.0               2              14                326   
860    1.0               2               3                151   
3174   2.0               2             365                  6   

      review_scores_rating         property_type  
582                   4.79          Entire cabin  
1951                  5.00    Entire guest suite  
3291                  5.00             Tiny home  
679                   4.98    Entire rental unit  
56                    4.90  Private room in home  
...                    ...                   ...  
1095                  4.91     Entire guesthouse  
1130                  4.87          Entire cabin  
1294                  4.99             Tiny home  
860                   4.95    Entire guest suite  
3174                  5.00           Entire home  

[2663 rows x 10 columns]>.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [30]:
X = preprocessing.fit_transform(listings)

AttributeError: 'DataFrame' object has no attribute 'str'

*0. remove outliers*
1. Response variable(price): (clean_price function applied to price column, log transformation applied to price column.
2. clean_bathrooms applied to the bathrooms variable
3. clean_property_type to property_type column
4. numeric columns imputed and scaled (num_pipeline)
5. cat columns imputed (most frequent) and one hot encoded (cat_pipeline)

In [None]:
from sklearn.ensemble import IsolationForest

rem_outliers = IsolationForest()

outliers_array = rem_outliers.fit_predict(data)
data = data.iloc[outliers_array == 1]

def remove_outliers(data, outlier_array):
    return data.iloc[outlier_array == 1]