# Feature Engineering

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from pathlib import Path
import re
from collections import Counter
from math import sin, cos, sqrt, atan2, radians

In [11]:
# Load the cleaned listings CSV from the processed-data directory
PROCESSED_PATH = Path('../data/processed/')
listings = pd.read_csv(PROCESSED_PATH.joinpath('cleaned_listings.csv'))

In this file I will create new features from the existing ones. Features to create:

1. Indicator variables (1 if the listing has a value for the field, 0 if it is missing) for:
    - Neighbourhood overview
    - Interaction
    - House rules
    - Transit
1. Create a total price (price + cleaning fee) as another potential target variable.
1. Distance from different landmarks.
1. Amenities dummy variables and total number of amenities.
1. Square feet from description.
1. Investigate other features to see if any other information could be gained.

## Indicator Variables

In [10]:
def create_indicator_variable(dataframe, col):
    """
    Return a copy of `dataframe` with a 0/1 flag marking where `col` has a value.

    The new column is named `<col>_indicator` and holds 1 where `col` is not
    null and 0 where it is null. The input frame is not modified.

    Calling the function again on a frame that already has the indicator
    overwrites that column instead of appending a second column with the same
    label, so re-running a notebook cell is safe.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the column `col`.
    col : hashable
        Label of the column to flag. Non-string labels are converted with
        `str()` formatting when building the indicator name.

    Returns
    -------
    pandas.DataFrame
        New frame with the indicator column added (or refreshed).
    """
    indicator_name = f'{col}_indicator'
    indicator = dataframe[col].notnull().astype('int')

    # `assign` returns a new frame and replaces an existing column of the same
    # name, which keeps the column labels unique.
    return dataframe.assign(**{indicator_name: indicator})

In [13]:
# Columns that each get a companion `<column>_indicator` flag
indicator_cols = [
    'neighbourhood_overview',
    'interaction',
    'house_rules',
    'transit',
]
for col in indicator_cols:
    # The helper returns a new frame, so `listings` is rebound on every pass
    listings = create_indicator_variable(listings, col)

## Total Price

In [16]:
# Price plus cleaning fee, as an alternative target variable.
# A missing value in either column propagates to `total_price`.
listings['total_price'] = listings['price'].add(listings['cleaning_fee'])

## Distance From Landmarks

In [17]:
def longlat_to_km(origin, destination):
    """
    Great-circle distance in kilometres between two points (haversine formula).

    Despite the function name, each point is unpacked latitude first:
    `(latitude, longitude)`, both in decimal degrees.

    Parameters
    ----------
    origin : tuple of (float, float)
        (latitude, longitude) of the first point, in degrees.
    destination : tuple of (float, float)
        (latitude, longitude) of the second point, in degrees.

    Returns
    -------
    float
        Distance in km on a sphere of radius 6371 km. NaN coordinates
        produce a NaN distance.
    """
    radius = 6371 # km
    lat1, long1 = origin
    lat2, long2 = destination

    dlat = radians(lat2 - lat1)
    dlong = radians(long2 - long1)

    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlong / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError. Clamp it back.
    # Written as a comparison (not min()) so a NaN `a` is left untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = radius * c

    return distance

In [19]:
# Landmark name -> (latitude, longitude) in decimal degrees.
# NOTE: the 'Telivision_Tower' spelling is kept as-is because each key becomes
# part of a `distance_from_<landmark>` column name.
landmarks = {
    'Centre': (52.5200, 13.4050),
    'Brandenburg_Gate': (52.516266, 13.377775),
    'Berlin_Wall': (52.535152, 13.390206),
    'Reichstag': (52.518589, 13.376665),
    'Museum_Island': (52.516640, 13.402318),
    'Central_Station': (52.524929, 13.369181),
    'Telivision_Tower': (52.520817, 13.409419)
} # Coordinates found from latlong.net
for landmark, coordinates in landmarks.items():
    # Walk only the two coordinate columns rather than DataFrame.apply(axis=1),
    # which constructs a Series for every row. The values are the same; the
    # list is assigned positionally, so the frame's index is unaffected.
    listings[f'distance_from_{landmark}'] = [
        longlat_to_km((latitude, longitude), coordinates)
        for latitude, longitude in zip(listings['latitude'], listings['longitude'])
    ]