# Feature Engineering

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pathlib import Path
import re
from collections import Counter
from math import sin, cos, sqrt, atan2, radians

In [2]:
# Read in the data: the cleaned listings CSV from the processed-data directory.
# PROCESSED_PATH is reused at the end of the notebook to write the final CSV.
PROCESSED_PATH = Path('../data/processed/')
listings = pd.read_csv(PROCESSED_PATH/'cleaned_listings.csv')

In [3]:
# Column overview: dtype and non-null count per column
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22472 entries, 0 to 22471
Data columns (total 29 columns):
summary                         21535 non-null object
space                           13957 non-null object
description                     22270 non-null object
neighbourhood_overview          11506 non-null object
notes                           7188 non-null object
transit                         12999 non-null object
access                          10807 non-null object
interaction                     10351 non-null object
house_rules                     11395 non-null object
neighbourhood_group_cleansed    22472 non-null object
latitude                        22472 non-null float64
longitude                       22472 non-null float64
is_location_exact               22472 non-null int64
property_type                   22472 non-null object
room_type                       22472 non-null object
accommodates                    22472 non-null int64
bathrooms                   

In this file I will create new features from the existing ones. Features to create:

1. Indicator variables:
    - Neighbourhood overview
    - Interaction
    - House rules
    - Transit
1. Create a total price (price + cleaning fee) as another potential target variable.
1. Distance from different landmarks.
1. Amenities dummy variables and total number of amenities.
1. Square feet from description.
1. Investigate other features to see if any other information could be gained.

## Indicator Variables

In [4]:
def create_indicator_variable(dataframe, col):
    """
    Return a copy of ``dataframe`` with a 0/1 ``<col>_indicator`` column.

    The indicator is 1 where ``dataframe[col]`` is non-null and 0 where it is
    missing. ``dataframe`` itself is not modified.

    If the indicator column already exists (for example because the calling
    cell was run a second time) it is overwritten rather than appended again,
    so repeated calls never produce duplicate column names.
    """
    indicator = dataframe[col].notnull().astype('int')
    return dataframe.assign(**{f'{col}_indicator': indicator})

In [5]:
# Add a 0/1 `<col>_indicator` column for each of these text columns
# (1 = the listing has a value, 0 = the value is missing).
indicator_cols = ['neighbourhood_overview', 'interaction', 'house_rules', 'transit']
for col in indicator_cols:
    listings = create_indicator_variable(listings, col)

## Total Price

I want a total price that includes the cleaning fee, to see if this is a better target variable to use.

In [6]:
# Nightly price plus the cleaning fee, element-wise per listing
listings['total_price'] = listings['price'].add(listings['cleaning_fee'])

## Distance From Landmarks

I suspect that listings within walking distance of, or simply closer to, the popular tourist spots may be worth more. Using the latitude and longitude of each listing I can compute the distance (as the crow flies) to a number of popular landmarks. I will look up the latitude and longitude of several landmarks and compute each listing's distance from them in km.

In [7]:
def longlat_to_km(origin, destination):
    """
    Great-circle distance in kilometres between two points (haversine formula).

    Despite the function name, both points are unpacked as
    ``(latitude, longitude)`` pairs, in that order, given in decimal degrees.

    Parameters
    ----------
    origin, destination : tuple of float
        ``(latitude, longitude)`` in degrees.

    Returns
    -------
    float
        Distance along the surface of a sphere of radius 6371 km.
    """
    radius = 6371  # mean Earth radius in km
    lat1, long1 = origin
    lat2, long2 = destination

    dlat = radians(lat2 - lat1)
    dlong = radians(long2 - long1)

    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlong / 2)**2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make sqrt(1 - a) raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    return radius * c

In [8]:
# (latitude, longitude) of each landmark. The keys become part of the new
# column names, so the spelling 'Telivision_Tower' is kept unchanged to avoid
# renaming a column in the output.
landmarks = {
    'Centre': (52.5200, 13.4050),
    'Brandenburg_Gate': (52.516266, 13.377775),
    'Berlin_Wall': (52.535152, 13.390206),
    'Reichstag': (52.518589, 13.376665),
    'Museum_Island': (52.516640, 13.402318),
    'Central_Station': (52.524929, 13.369181),
    'Telivision_Tower': (52.520817, 13.409419)
} # Coordinates found from latlong.net

# Pair up each listing's (latitude, longitude) once and reuse the pairs for
# every landmark. Iterating the two columns directly avoids building a full
# row Series per listing, which DataFrame.apply(axis=1) does on every call.
listing_coordinates = list(zip(listings['latitude'], listings['longitude']))
for landmark, coordinates in landmarks.items():
    listings[f'distance_from_{landmark}'] = [
        longlat_to_km(position, coordinates) for position in listing_coordinates
    ]

Now we can remove the latitude and longitude columns.

In [9]:
# The raw coordinates are no longer needed once the distances exist
listings.drop(columns=['latitude', 'longitude'], inplace=True)

## Amenities

Using the amenities feature I will compute dummy variables indicating whether the listing has each amenity, and also a total amenities feature giving how many amenities the listing includes. Currently the amenities are stored as a single string with each amenity separated by a comma, so first I will transform each listing's amenities into a set.

In [10]:
def parse_amenities(raw):
    """
    Turn a raw amenities string into a set of amenity names.

    Curly braces and double quotes are removed and the remainder is split on
    commas. Empty entries are discarded: splitting an empty string on ','
    gives [''], so without this a listing whose amenities are just '{}' would
    get the one-element set {''} and be counted as having one amenity.
    """
    cleaned = re.sub('[{}"]', '', raw)
    return {amenity for amenity in cleaned.split(',') if amenity}


listings['amenities'] = listings['amenities'].apply(parse_amenities)

First, we will create a feature that tells us the number of amenities a listing offers.

In [11]:
# Size of each listing's amenity set
listings['amenities_count'] = listings['amenities'].map(len)

Now we will create a counter object to get a set of all the different amenities.

In [12]:
# Tally how many listings offer each amenity
amenity_counter = Counter()
for amenity_set in listings['amenities']:
    amenity_counter.update(amenity_set)

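We don't want to include amenities that have either a very high or a very low occurrence, as they tell our model very little. We should only include amenities whose variance is above some threshold; here the threshold is the variance of an indicator that is present in 80% of listings, $0.8 \times (1 - 0.8) = 0.16$.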

In [13]:
# Share of listings offering each amenity, most common first
n_listings = len(listings)
amenity_prev = [
    (amenity, count / n_listings)
    for amenity, count in amenity_counter.most_common()
]
# Variance of a 0/1 indicator with prevalence `prev` is prev * (1 - prev)
amenity_var = [(amenity, prev * (1 - prev)) for amenity, prev in amenity_prev]
# Keep only amenities more variable than an 80% / 20% split
var_threshold = 0.8 * (1 - 0.8)
amenities_to_include = [
    amenity for amenity, var in amenity_var if var > var_threshold
]

Now we can create dummy variables for all the amenities we want to include.

In [14]:
# One 0/1 column per retained amenity, named after the amenity itself.
for amenity in amenities_to_include:
    # int(...) yields a plain Python int per listing; np.where on a scalar
    # condition would instead return a 0-d array for every row.
    listings[amenity] = listings['amenities'].apply(lambda x: int(amenity in x))

In [15]:
# The set-valued column has been replaced by the count and dummy columns
listings.drop(columns='amenities', inplace=True)

## Size

We already have a square_feet column but this is missing a large amount of information.

In [16]:
# Percentage of listings with no square_feet value
100 * listings['square_feet'].isnull().sum() / len(listings)

98.01530793876825

After looking through the descriptions, some of the listings give the size of the apartment in square meters, usually in the form __ sqm/square meters. Some of the listings give both the square meters and the square feet; since the sqm figure is always the smaller number, we should take the minimum of these values to get the sqm. First let's gain as much information from the description as possible.

In [17]:
# A run of digits at the start of the text or right after whitespace, followed
# (optionally after one whitespace character) by a square-metre unit such as
# "sqm", "m2", "m²" or "qm". The unit is only looked ahead at, not consumed.
_SIZE_PATTERN = re.compile(
    r'(?<!\S)\d+\s?(?=sq\w*\s?m|m2|m\u00B2|ms?q|qm|m/2|meters?\s?sq)',
    flags=re.IGNORECASE,
)


def find_size(description):
    """
    Extract a size in square metres from free text.

    Every number followed by one of the square-metre unit patterns is
    collected; values outside the accepted range (greater than 10 and at most
    300) are discarded and the smallest remaining value is returned.

    Parameters
    ----------
    description : str or missing value
        Free-text field of a listing. Anything that is not a string (NaN,
        None, ...) is treated as missing.

    Returns
    -------
    float
        The smallest in-range size found, or ``np.nan`` if there is none.
    """
    # isinstance rather than `is np.nan`: an identity check misses None and
    # NaN objects other than the np.nan singleton, which would then crash
    # inside the regex search.
    if not isinstance(description, str):
        return np.nan
    sizes = [float(match.strip()) for match in _SIZE_PATTERN.findall(description)]
    sizes = [size for size in sizes if 10 < size <= 300]
    if not sizes:
        return np.nan
    return min(sizes)

In [18]:
# First pass: look for a size in the main description text
listings['square_meters'] = listings['description'].map(find_size)

In [19]:
# Summary statistics of the sizes found so far (missing values are excluded)
listings['square_meters'].describe()

count    5054.000000
mean       55.087060
std        40.267642
min        11.000000
25%        24.000000
50%        43.500000
75%        75.000000
max       300.000000
Name: square_meters, dtype: float64

We have managed to get a fair amount of information from the description. Let's see if any of the other descriptive columns contain additional information about the size of the listing.

In [20]:
# Only listings still without a size are filled, so earlier columns win
for col in ['summary', 'space', 'notes']:
    sizes_from_col = listings[col].apply(find_size)
    listings['square_meters'] = listings['square_meters'].fillna(sizes_from_col)
listings['square_meters'].count()

5166

We managed to get a size for an extra 112 listings. Now we can use the square_feet feature, converted to square meters, to fill some of the remaining missing values.

In [21]:
def feet_to_meters(measurement):
    """Convert an area from square feet to square metres (divides by 10.764)."""
    sqft_per_sqm = 10.764
    return measurement / sqft_per_sqm

In [22]:
def fill_square_meters(listing):
    """
    Return the listing's size in m2, falling back to its square_feet value.

    An existing square_meters value is always kept. When it is missing and
    square_feet is a positive number, the ft2 value is converted to m2 and
    returned if it lies strictly between 10 and 250; otherwise NaN is
    returned. (Note the upper bound here is 250, whereas find_size accepts
    values up to and including 300.)
    """
    current = listing['square_meters']
    sqft = listing['square_feet']

    # No usable square_feet value: leave square_meters as it is
    if np.isnan(sqft) or not sqft > 0:
        return current
    # A size is already known: keep it
    if not np.isnan(current):
        return current

    converted = feet_to_meters(sqft)
    return converted if 10 < converted < 250 else np.nan

In [23]:
# Row by row: fill a missing square_meters from square_feet where possible
listings['square_meters'] = listings.apply(fill_square_meters, axis=1)
# Number of listings that now have a size
listings['square_meters'].count()

5364

The square_feet column gave us a size for an extra 198 listings (5364 − 5166). We can drop the square_feet column now.

In [24]:
# square_feet has been folded into square_meters
listings.drop(columns='square_feet', inplace=True)

How much of the square meters column is missing information?

In [25]:
# Percentage of listings that still have no square_meters value
100 * listings['square_meters'].isnull().sum() / len(listings)

76.13029547881808

How important is the information we have gained?

In [26]:
# Build the correlation matrix once, over numeric columns only: `listings`
# still holds free-text columns here, and pandas >= 2.0 raises on them
# instead of silently skipping them.
sqm_corr = listings.select_dtypes(include=['number', 'bool']).corr()['square_meters']
# Remove the trivial self-correlation by label rather than by position.
sqm_corr = sqm_corr.drop('square_meters')
sqm_corr[sqm_corr > 0.2].sort_values(ascending=False)

total_price            0.485633
bedrooms               0.457441
price                  0.453087
accommodates           0.432057
beds                   0.370702
cleaning_fee           0.349272
bathrooms              0.346051
guests_included        0.282991
Family/kid friendly    0.252436
amenities_count        0.214925
Name: square_meters, dtype: float64

The square_meters feature has a fairly high correlation with price and total price, which suggests it could be very useful to our model. We should try to fill in the missing information using a machine learning algorithm. First let's create dummy columns from the categorical columns I think could help.

In [27]:
# One-hot encode the categorical columns, dropping the first level of each
dummy_cols = ['room_type', 'property_type', 'neighbourhood_group_cleansed']
dummies = pd.get_dummies(listings[dummy_cols], drop_first=True)

# Numeric predictors together with the square_meters target itself
predictor_cols = [
    'bedrooms',
    'accommodates',
    'beds',
    'cleaning_fee',
    'bathrooms',
    'square_meters',
    'Family/kid friendly',
]
listings_size_predictors_all = pd.concat(
    [listings[predictor_cols], dummies],
    axis=1,
)

# Only rows with a known size can be used to fit and evaluate a model
has_size = listings_size_predictors_all['square_meters'].notnull()
listings_size_predictors = listings_size_predictors_all[has_size]

Now let's test some linear models to see which will model our problem best. First let's split our data into a training and test set.

In [28]:
from sklearn.model_selection import train_test_split, cross_val_score, KFold

# Target is the known size; every other column is a feature
y = listings_size_predictors['square_meters']
X = listings_size_predictors.drop(columns='square_meters')

# 70% / 30% split with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

##### Lasso

In [29]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

# alpha is picked from the grid by 10-fold cross-validation on the training set
lasso = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=10)
lasso.fit(X_train, y_train)
predictions = lasso.predict(X_test)

# Error on the held-out test set
mse = mean_squared_error(y_test, predictions)
print('MSE: ', mse)
print('RMSE', np.sqrt(mse))
print('MAE: ', mean_absolute_error(y_test, predictions))

MSE:  1184.1141088715688
RMSE 34.41095913908197
MAE:  24.704369371817865


##### Elastic Net

In [30]:
from sklearn.linear_model import ElasticNetCV

# alpha is picked from the grid by 10-fold cross-validation on the training set
elasnet = ElasticNetCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=10)
elasnet.fit(X_train, y_train)
predictions = elasnet.predict(X_test)

# Error on the held-out test set
mse = mean_squared_error(y_test, predictions)
print('MSE: ', mse)
print('RMSE', np.sqrt(mse))
print('MAE: ', mean_absolute_error(y_test, predictions))

MSE:  1188.3697259337782
RMSE 34.472738880654354
MAE:  24.729973026955108


##### Ridge

In [31]:
from sklearn.linear_model import RidgeCV

# alpha is picked from the grid by 10-fold cross-validation on the training set
ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=10)
ridge.fit(X_train, y_train)
predictions = ridge.predict(X_test)

# Error on the held-out test set
mse = mean_squared_error(y_test, predictions)
print('MSE: ', mse)
print('RMSE', np.sqrt(mse))
print('MAE: ', mean_absolute_error(y_test, predictions))

MSE:  1195.21739698955
RMSE 34.57191630484995
MAE:  24.793971099156096


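The three models perform almost identically on the test set (Lasso is marginally best on every metric, with elastic net a close second). We will fill the missing values using the elastic net model.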

In [32]:
def impute_missing_sqm(listing, model):
    """
    Return the listing's square_meters, predicting it when it is missing.

    Parameters
    ----------
    listing : pandas.Series
        One row holding 'square_meters' plus the model's predictor values.
    model : object with a ``predict`` method
        Called with a single-row 2-D array built from every entry of
        ``listing`` except 'square_meters'.

    Returns
    -------
    The existing value when it is not NaN, otherwise the first element of the
    model's prediction.
    """
    known_size = listing['square_meters']
    if np.isnan(known_size):
        # predict expects a 2-D array: one row, one column per predictor
        features = listing.drop('square_meters').to_numpy().reshape(1, -1)
        return model.predict(features)[0]
    return known_size

In [33]:
# Predict every missing size in one call instead of one predict() per row.
# Passing the predictor frame itself also keeps the column names the model
# was fitted on.
size_is_missing = listings_size_predictors_all['square_meters'].isnull()
square_meters = listings_size_predictors_all['square_meters'].copy()
if size_is_missing.any():
    predictors_missing = (
        listings_size_predictors_all
        .loc[size_is_missing]
        .drop(columns='square_meters')
    )
    square_meters.loc[size_is_missing] = elasnet.predict(predictors_missing)
# astype('int') truncates towards zero (e.g. 55.9 -> 55), as the original did.
listings['square_meters'] = square_meters.astype('int')

Now we can drop the columns we won't use in our model.

In [34]:
# Free-text columns: already used for the indicator and size features above
cols = [
    'summary',
    'space',
    'description',
    'neighbourhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
]
listings.drop(columns=cols, inplace=True)

Now we have finished feature engineering and we have our data for modelling, we can write the data to a csv file.

In [35]:
# Write the engineered features next to the input file; the index is not saved
listings.to_csv(PROCESSED_PATH/'listings_final.csv', index=False)