# Description

The dataset was scraped from the Airbnb website. It contains the listing features enumerated at the end of this section. In addition, there is a calendar with the availability of each house over various periods of time, and text reviews written by customers about the houses they stayed in. The goal of the model is to predict housing prices as closely as possible, minimizing the MAPE (mean absolute percentage error). Because the data is diverse (textual, categorical and numerical), different preprocessing techniques were used.



*Features contained in train.csv:*<br>
id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,
interaction,house_rules,host_id,host_since,host_about,host_response_time,host_response_rate,
host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,zipcode,
latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,
beds,bed_type,amenities,square_feet,security_deposit,cleaning_fee,guests_included,extra_people,
minimum_nights,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,price

### Imports

In [None]:
# install geopandas if needed
!pip install geopandas

In [None]:
### Imports ###

import datetime 
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import math
from math import sin, cos, sqrt, atan2, radians
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from sklearn import datasets, linear_model, metrics, model_selection, pipeline, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import xgboost as xgb
from sklearn import tree, ensemble, model_selection, linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import sklearn.preprocessing as preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import geopandas as gpd
from shapely.geometry import Point, Polygon
from sklearn.linear_model import LinearRegression
import datetime
from lightgbm import LGBMClassifier, LGBMRegressor, LGBMModel

# Display defaults: show every DataFrame column and render matplotlib plots inline
pd.set_option('display.max_columns', None)
%matplotlib inline

# Data Processing

In [None]:
# For Google Colab usage: mount Google Drive so the data files are reachable

from google.colab import drive
drive.mount('/content/drive')
# Base directory on the mounted drive; every data file below is read from / written to it
path = '/content/drive/MyDrive/ML/Houses/'

In [None]:
### Processing reviews and importing already sentiment-scored reviews ###

# Pre-computed sentiment table (must contain `listing_id` and `comments`)
# and the raw reviews with their dates.
sentiments_csv = pd.read_csv(path + 'sentimented_reviews.csv')
data_reviews_csv = pd.read_csv(path + 'reviews.csv', parse_dates=['date'])

# Most recent review date in the data; used as "today" for the day counts.
last_date = data_reviews_csv['date'].max().date()

# Earliest / latest review per listing, indexed by listing_id.
review_dates = data_reviews_csv.groupby(by=['listing_id'])['date']
first_review = pd.DataFrame(review_dates.min())
last_review = pd.DataFrame(review_dates.max())


def _days_before_last_date(timestamp):
    """Whole days between `timestamp` and the newest review in the dataset."""
    return (last_date - timestamp.date()).days


# Series.map is used instead of DataFrame.applymap, which is deprecated in
# recent pandas releases.
days_since_first_review = first_review['date'].map(_days_before_last_date).to_frame()
days_since_last_review = last_review['date'].map(_days_before_last_date).to_frame()

# Attach the per-listing values by listing_id rather than by row position, so
# the result does not depend on how sentimented_reviews.csv is ordered.
sentiments_csv['LastReview'] = sentiments_csv['listing_id'].map(days_since_last_review['date'])
sentiments_csv['FirstReview'] = sentiments_csv['listing_id'].map(days_since_first_review['date'])

# Number of reviews per listing. Mapping the Series directly avoids relying on
# the column name of the value_counts() result, which differs across pandas
# versions.
review_counts = data_reviews_csv['listing_id'].value_counts()
count = pd.DataFrame(review_counts)
sentiments_csv['frequency'] = sentiments_csv['listing_id'].map(review_counts)

# Days between a listing's first and last review.
sentiments_csv['days_active'] = sentiments_csv['FirstReview'] - sentiments_csv['LastReview']
sentiments_csv.drop(columns=['comments', 'LastReview', 'FirstReview'], inplace=True)

In [None]:
### Function for cleaning train and test data ###

# Reference point, in radians, that distances are measured from
# (51.49757, -0.13585 in degrees; presumably central London, TODO confirm).
CENTER_LAT = radians(51.49757)
CENTER_LONG = radians(-0.13585)
# Radius multiplied into the central angle in get_distance_from_center;
# presumably the Earth's radius in kilometres.
R = 6373.0
# Amenity flag columns that are rare in the training data. Populated by
# process_amenities() and read by process_test_amenities() so that train and
# test frames end up with the same flag columns.
infrequent_amenities = []

# (flag column, regex searched in the raw `amenities` string) pairs.
_AMENITY_PATTERNS = (
    ('check_in_24h', '24-hour check-in'),
    ('air_conditioning', 'Air conditioning|Central air conditioning'),
    ('high_end_electronics', 'Amazon Echo|Apple TV|Game console|Netflix|Projector and screen|Smart TV'),
    ('bbq', 'BBQ grill|Fire pit|Propane barbeque'),
    ('balcony', 'Balcony|Patio'),
    ('nature_and_views', 'Beach view|Beachfront|Lake access|Mountain view|Ski-in/Ski-out|Waterfront'),
    ('bed_linen', 'Bed linens'),
    ('breakfast', 'Breakfast'),
    ('tv', 'TV'),
    ('coffee_machine', 'Coffee maker|Espresso machine'),
    ('cooking_basics', 'Cooking basics'),
    ('white_goods', 'Dishwasher|Dryer|Washer'),
    ('elevator', 'Elevator'),
    ('gym', 'Exercise equipment|Gym|gym'),
    ('child_friendly', 'Family/kid friendly|Children|children'),
    ('parking', 'parking'),
    ('outdoor_space', 'Garden|Outdoor|Sun loungers|Terrace'),
    ('host_greeting', 'Host greets you'),
    ('hot_tub_sauna_or_pool', 'Hot tub|Jetted tub|hot tub|Sauna|Pool|pool'),
    ('internet', 'Internet|Pocket wifi|Wifi'),
    ('long_term_stays', 'Long term stays allowed'),
    # Parentheses are escaped so the literal "Cat(s)" / "Dog(s)" match;
    # the plain plurals are still accepted.
    ('pets_allowed', r'Pets|pet|(?:Cat|Dog)(?:s|\(s\))'),
    ('private_entrance', 'Private entrance'),
    ('secure', 'Safe|Security system'),
    ('self_check_in', 'Self check-in'),
    ('smoking_allowed', 'Smoking allowed'),
    ('accessible', 'Step-free access|Wheelchair|Accessible'),
    ('event_suitable', 'Suitable for events'),
)


def _add_amenity_flags(df):
    """Add one 0.0/1.0 column per amenity group to `df` in place.

    Rows whose `amenities` value is missing get 0.0 for every flag.
    Returns the list of flag column names that were added.
    """
    amenities = df['amenities']
    for column, pattern in _AMENITY_PATTERNS:
        df[column] = amenities.str.contains(pattern, na=False).astype(float)
    return [column for column, _ in _AMENITY_PATTERNS]


def process_amenities(df):
    """Turn the raw `amenities` string of the training frame into flag columns.

    Flags set on fewer than 10% of the rows are dropped, and their names are
    stored in the module-level `infrequent_amenities` list so that
    process_test_amenities() drops the same columns. The `amenities` column
    itself is removed. `df` is modified in place and also returned.
    """
    flag_columns = _add_amenity_flags(df)

    # The flag columns are tracked by name; a fixed column offset would only
    # be right for one particular input layout.
    rare_threshold = len(df) / 10
    # Replace the list contents in place: the same list object is shared with
    # process_test_amenities, and repeated calls must not accumulate entries.
    infrequent_amenities[:] = [
        column for column in flag_columns if df[column].sum() < rare_threshold
    ]

    # Dropping infrequent amenity features and the original amenity feature
    df.drop(columns=infrequent_amenities + ['amenities'], inplace=True)
    return df


def process_test_amenities(df):
    """Apply the amenity flags chosen on the training data to another frame.

    Creates the same flag columns as process_amenities(), then drops the ones
    recorded in `infrequent_amenities` together with the raw `amenities`
    column. `df` is modified in place and also returned.
    """
    _add_amenity_flags(df)
    df.drop(columns=infrequent_amenities + ['amenities'], inplace=True)
    return df


# Neighbourhood polygons for London, loaded from the GeoJSON boundary file
map_df = gpd.read_file(path + 'neighbourhoods.geojson')
map_df = map_df.drop(columns='neighbourhood_group')
map_df.head(5)
subset = map_df[['neighbourhood', 'geometry']]
# (neighbourhood name, polygon) pairs scanned by get_hood
areas = list(zip(subset['neighbourhood'], subset['geometry']))

def get_hood(df):
    """Find the neighbourhood whose polygon contains each listing.

    Args:
        df: frame with `longitude` and `latitude` columns.

    Returns:
        A one-column DataFrame (column 0) with the neighbourhood name for each
        row, or the string 'NAN' when no polygon in `areas` contains the
        point. It carries the same index as `df`, so assigning it to a column
        of a frame derived from `df` lines up row for row.
    """
    # Build the points directly instead of wrapping the caller's frame in a
    # GeoDataFrame; only the coordinates are needed.
    points = gpd.points_from_xy(df.longitude, df.latitude)
    hoods = [
        next((name for name, polygon in areas if polygon.contains(point)), 'NAN')
        for point in points
    ]
    return pd.DataFrame(hoods, index=df.index)
    
# Leading letters followed by digits, e.g. "SW1" in "SW1A"; compiled once.
_ZIP_PREFIX = re.compile(r"([A-Z]+[0-9]+)")


def correct_zip(zipcode):
    """Return the first letters-then-digits run in `zipcode`.

    Args:
        zipcode: upper-case postcode text.

    Returns:
        The matched prefix (e.g. 'SW1' for 'SW1A'), or the string 'NAN' when
        the text contains no such run.
    """
    match = _ZIP_PREFIX.search(zipcode)
    if match is None:
        return 'NAN'
    return match.group()

def process_zipcode(df):
    """Reduce each row's `zipcode` to its cleaned prefix.

    The value is stringified, upper-cased and cut at the first space before
    being passed to correct_zip. Returns one value per row of `df`.
    """
    def _row_prefix(row):
        first_part = str(row['zipcode']).upper().split(' ')[0]
        return correct_zip(first_part)

    return df.apply(_row_prefix, axis=1)

def change_nan(data):
    """Return -1 in place of a NaN value; any other number is returned as is."""
    return -1 if math.isnan(data) else data

def change_2(data):
    """Cap `data` at 1.0; values not above 1.0 are returned unchanged."""
    return 1.0 if data > 1.0 else data

def process_2(df):
    """Apply change_2 (cap at 1.0) to `df` via its `apply` method."""
    capped = df.apply(change_2)
    return capped

def normalize(df):
    """Min-max scale every column of `df` to the range [0, 1].

    Args:
        df: a DataFrame, Series or other data accepted by `pd.DataFrame`.

    Returns:
        A new DataFrame of the same shape; the input is not modified. A
        column whose values are all equal has zero spread and comes out as
        NaN.
    """
    result = pd.DataFrame(df.copy())
    lowest = result.min()
    spread = result.max() - lowest
    # One vectorised expression scales all columns; the leftover debug print
    # of the frame type has been removed.
    return (result - lowest) / spread

# Shared tokenizer used by prep_for_vect: splits text into runs of word characters (\w+)
tokenizer = RegexpTokenizer(r'\w+')

# Maps every character of string.punctuation to a space. Built once at import
# time instead of on every call, since the function is applied per text row.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def remove_punctuation(text):
    """Return `text` with each ASCII punctuation character replaced by a space.

    The length of the string is unchanged, so words separated only by
    punctuation stay separated.
    """
    return text.translate(_PUNCTUATION_TO_SPACE)

# Shared Porter stemmer instance used by word_stemmer
stemmer = PorterStemmer()

def word_stemmer(text):
    """Stem every item of the iterable `text` and join the stems with spaces."""
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

def prep_for_vect(df):
    """Normalise each text in `df` for vectorisation.

    Per element: punctuation is replaced by spaces, the text is lower-cased
    and tokenised, and the tokens are joined back with single spaces.
    """
    def _clean(text):
        without_punct = remove_punctuation(text)
        tokens = tokenizer.tokenize(without_punct.lower())
        return ' '.join(tokens)

    return df.apply(_clean)

def word_to_vec(df):
    """Encode each text in `df` as a dense TF-IDF vector.

    A fresh TfidfVectorizer is fitted on the cleaned texts (see
    prep_for_vect).

    Returns:
        A Series with a 0-based index holding one list of floats per input
        text; all lists have the length of the fitted vocabulary.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(prep_for_vect(df))
    # The rows of the dense matrix are used directly, so the vectoriser's
    # feature-name accessor (renamed across scikit-learn versions) is not
    # needed.
    return pd.Series(weights.toarray().tolist())

def get_distance_from_center(lat, long):
    """Haversine distance from (CENTER_LAT, CENTER_LONG) to a point.

    `lat` and `long` are given in degrees. The result is the central angle
    multiplied by R, so it is expressed in the units of R.
    """
    lat_rad, long_rad = radians(lat), radians(long)
    delta_long = CENTER_LONG - long_rad
    delta_lat = CENTER_LAT - lat_rad
    haversine = (
        sin(delta_lat / 2)**2
        + cos(lat_rad) * cos(CENTER_LAT) * sin(delta_long / 2)**2
    )
    central_angle = 2 * atan2(sqrt(haversine), sqrt(1 - haversine))
    return R * central_angle
    
def process_geo(lat, long):
    """Compute the distance from the reference centre for each coordinate pair.

    `lat` and `long` must be Series named 'latitude' and 'longitude'.
    Returns a one-column DataFrame with one distance per row.
    """
    coordinates = pd.concat([long, lat], axis=1)

    def _row_distance(row):
        return get_distance_from_center(row['latitude'], row['longitude'])

    distances = coordinates.apply(_row_distance, axis=1)
    return pd.DataFrame(distances)

def normalize_frame(dataset):
    """Min-max scale `dataset` column-wise: (x - min) / (max - min)."""
    lowest = dataset.min()
    spread = dataset.max() - lowest
    return (dataset - lowest) / spread

def encode(dataset, title):
    """One-hot encode the values of `dataset`.

    Values are first replaced by their integer factor codes; the returned
    dummy frame has one column per code.
    """
    codes, _ = pd.factorize(dataset)
    frame = pd.DataFrame(dataset)
    frame[title] = codes
    coded_column = frame[title]
    return pd.get_dummies(coded_column)

# Bag-of-words encoder shared by the train and test helpers below: it is
# fitted in vectorize_train_text and reused in vectorize_test_text.
vectorizer = CountVectorizer()


def _add_text_column(df):
    """Build `text` from `room_type` and `hood` (in place) and drop `room_type`."""
    for column in ('room_type', 'hood'):
        df[column] = df[column].astype(str)
    df['text'] = df[['room_type', 'hood']].agg(' '.join, axis=1)
    del df['room_type']


def _append_token_counts(df, counts):
    """Return `df` with a 0-based index and the token-count columns appended.

    The count columns are named 'text_0', 'text_1', ... so that every column
    name in the result is a string; scikit-learn rejects frames whose column
    names mix strings and integers. The helper `text` column is removed.
    """
    dense = pd.DataFrame(counts.toarray())
    dense.columns = [f'text_{position}' for position in range(dense.shape[1])]
    combined = pd.concat([df.reset_index(drop=True), dense], axis=1)
    del combined['text']
    return combined


def vectorize_train_text(df):
    """Fit the shared vectorizer on room type + neighbourhood and encode `df`.

    `df` must contain `room_type` and `hood`. It is modified in place
    (`room_type` removed, `text` added); the returned frame is a new object
    with a 0-based index, the token-count columns appended and `text` removed.
    """
    _add_text_column(df)
    counts = vectorizer.fit_transform(df['text'])
    return _append_token_counts(df, counts)


def vectorize_test_text(df):
    """Encode `df` with the vectorizer fitted by vectorize_train_text.

    Same column handling as vectorize_train_text, but the vocabulary is not
    refitted, so the count columns match the training frame position by
    position.
    """
    _add_text_column(df)
    counts = vectorizer.transform(df['text'])
    return _append_token_counts(df, counts)

def normalize_column(df, feature_name):
    """Return a copy of `df` with column `feature_name` min-max scaled.

    All other columns, and `df` itself, are left untouched.
    """
    column = df[feature_name]
    low, high = column.min(), column.max()
    scaled = df.copy()
    scaled[feature_name] = (column - low) / (high - low)
    return scaled

def get_middle(df,percent):
    """Return `df` with the first and last `percent` fraction of rows cut off."""
    trim = int(len(df) * percent)
    return df.iloc[trim:len(df) - trim]

In [None]:
### Pipeline for train data processing ###

class TrainDataCleaner(TransformerMixin):
    """Turn raw training listings into the numeric feature frame for the model.

    The transformer is stateless: `fit` does nothing. `transform` does fit
    the module-level amenity list and text vectorizer as a side effect (via
    process_amenities and vectorize_train_text), which the test cleaner then
    reuses.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; present to satisfy the scikit-learn transformer interface."""
        return self

    def transform(self, X, **transform_params):
        """Build the training features, including the `price` target column.

        Rows with a missing or zero price, with 3 or fewer reviews, with zero
        `days_active`, or outside every known neighbourhood are removed.
        Remaining missing values are filled with 0.
        """
        # The helpers used below return frames with a 0-based index, and
        # pandas aligns column assignments by index label. Working on a
        # 0-based copy keeps every assignment row-aligned whatever index the
        # caller's frame carries.
        X = X.reset_index(drop=True)
        result = pd.DataFrame(X['id'])

        # simple numeric data; missing room counts are imputed
        result['guests_included'] = pd.to_numeric(X.guests_included)
        result['accomodates'] = pd.to_numeric(X.accommodates)
        result['bathroom'] = pd.to_numeric(X.bathrooms.fillna(X.bathrooms.median()))
        result['bedrooms'] = pd.to_numeric(X.bedrooms.fillna(X.bedrooms.mean()))
        result['beds'] = pd.to_numeric(X.beds.fillna(X.beds.mean()))

        # distance of each listing from the reference centre
        result['distance_from_center'] = process_geo(X['latitude'], X['longitude'])

        # min-max scale everything collected so far
        result = normalize_frame(result)

        # amenity flags (this also records which flags are too rare to keep)
        result['amenities'] = X.amenities
        result = process_amenities(result)

        # room type and neighbourhood as bag-of-words counts
        result['room_type'] = X.room_type
        result['price'] = X.price
        result['hood'] = get_hood(X)
        result = vectorize_train_text(result)

        # per-listing review features
        result['listing_id'] = X['id']
        result = result.join(sentiments_csv.set_index('listing_id'), on='listing_id')

        # drop rows that are unusable for training; a missing price would
        # otherwise be turned into a price of 0 by the final fillna
        result = result[result.price.notna() & (result.price != 0)]
        result = result[result.frequency > 3]
        result = result[result.days_active != 0]
        result = normalize_column(result, 'frequency')
        result = normalize_column(result, 'days_active')
        result = result[result.hood != 'NAN']

        # non-inplace calls avoid modifying a filtered slice in place
        result = result.drop(columns=['sentiment', 'hood', 'id', 'listing_id'])
        return result.fillna(value=0)


train_data_cleaner = Pipeline([
    ('clean data', TrainDataCleaner())
])

In [None]:
### Pipeline for cleaning test data ###

class TestDataCleaner(TransformerMixin):
    """Turn raw holdout / test listings into the model's feature frame.

    Reuses the amenity list and text vectorizer fitted while cleaning the
    training data, so the training cleaner must have been run first. No rows
    are removed: the output has one row per input row, in input order.
    """

    def fit(self, X, y=None, **fit_params):
        """No-op; present to satisfy the scikit-learn transformer interface."""
        return self

    def transform(self, X, **transform_params):
        """Build the feature frame for `X` (no `price` column).

        Review features that would have excluded a training row are replaced
        by the column median instead. Remaining missing values are filled
        with 0.
        """
        # The helpers used below return frames with a 0-based index, and
        # pandas aligns column assignments by index label. Without this
        # reset, a slice such as `df.iloc[part:]` (labels starting at `part`)
        # ends up with `hood` and `listing_id` entirely NaN.
        X = X.reset_index(drop=True)
        result = pd.DataFrame(X['id'])

        # simple numeric data; missing room counts are imputed
        result['guests_included'] = pd.to_numeric(X.guests_included)
        result['accomodates'] = pd.to_numeric(X.accommodates)
        result['bathroom'] = pd.to_numeric(X.bathrooms.fillna(X.bathrooms.median()))
        result['bedrooms'] = pd.to_numeric(X.bedrooms.fillna(X.bedrooms.mean()))
        result['beds'] = pd.to_numeric(X.beds.fillna(X.beds.mean()))

        # distance of each listing from the reference centre
        result['distance_from_center'] = process_geo(X['latitude'], X['longitude'])

        # min-max scale everything collected so far
        result = normalize_frame(result)

        # amenity flags, restricted to those kept for the training data
        result['amenities'] = X.amenities
        result = process_test_amenities(result)

        # room type and neighbourhood, encoded with the fitted vectorizer
        result['room_type'] = X.room_type
        result['hood'] = get_hood(X)
        result = vectorize_test_text(result)

        # per-listing review features
        result['listing_id'] = X['id']
        result = result.join(sentiments_csv.set_index('listing_id'), on='listing_id')

        # Rows cannot be dropped here, so implausible review features are
        # imputed with the median.
        # NOTE(review): training removes rows with frequency <= 3, while this
        # imputes only frequency < 3 — confirm which threshold is intended.
        result.loc[result['frequency'] < 3, 'frequency'] = result['frequency'].median()
        result.loc[result['days_active'] == 0, 'days_active'] = result['days_active'].median()

        result = normalize_column(result, 'frequency')
        result = normalize_column(result, 'days_active')
        result = result.drop(columns=['sentiment', 'hood', 'id', 'listing_id'])
        return result.fillna(value=0)


test_data_cleaner = Pipeline([
    ('clean data', TestDataCleaner())
])

# Model

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Mean absolute percentage error, returned as a fraction (0.1 == 10%).

    Args:
        y_true: actual values (array-like).
        y_pred: predicted values, same length and order as `y_true`.

    Returns:
        The mean of |y_true - y_pred| / |y_true| over the positions where
        `y_true` is non-zero. NaN terms are ignored.
    """
    # Converting to arrays makes the comparison positional (no index-label
    # alignment between Series) and lets plain lists be passed in.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0  # a zero actual value would divide by zero
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.nanmean(np.abs(relative_error))

In [None]:
data_train_csv = pd.read_csv(path + 'train.csv')

# Sequential 80/20 split of the labelled data into train and holdout parts.
part = int(len(data_train_csv) * 0.8)
# Each part gets its own 0-based index (and becomes an independent copy).
# The cleaning helpers return 0-based frames that pandas aligns by index
# label, so a holdout slice still labelled part..N-1 would lose its
# neighbourhood and review features.
train_data_raw = data_train_csv.iloc[:part].reset_index(drop=True)
test_data_raw = data_train_csv.iloc[part:].reset_index(drop=True)

train_data = train_data_cleaner.transform(train_data_raw)
test_data = test_data_cleaner.transform(test_data_raw)

# The train cleaner filters rows, so its target comes from the cleaned frame;
# the test cleaner keeps every row, so the raw prices line up with test_data.
train_target = train_data['price']
test_target = test_data_raw['price']

train_data.drop(columns=['price'], inplace=True)

In [None]:
# Under-sample the most frequent price value, then fit the forest on the result
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_res, y_res = rus.fit_resample(train_data, train_target)

rf_model = ensemble.RandomForestClassifier(
    n_estimators=90,
    min_samples_split=15,
    random_state=1,
)
rf_model.fit(X_res, y_res)


In [None]:
# Predict on both splits and convert the targets to arrays for the metric
train_pred_np, test_pred_np = (
    rf_model.predict(features) for features in (train_data, test_data)
)
train_target_np, test_target_np = train_target.to_numpy(), test_target.to_numpy()

In [None]:
# Report the MAPE (as a fraction) for the training and holdout splits
for split_name, actual, predicted in (
    ('train', train_target_np, train_pred_np),
    ('test', test_target_np, test_pred_np),
):
    print(f'{split_name}: {mean_absolute_percentage_error(actual, predicted)}')

# Final submission

In [None]:
# Template frame for the submission; its 'price' column is overwritten with the predictions
submission = pd.read_csv(path + 'sample_submission.csv')

In [None]:
# Clean the competition test set with the test pipeline and predict its prices
data_test_csv = pd.read_csv(path + 'test.csv')
final_test_data = test_data_cleaner.transform(data_test_csv)
final_pred = rf_model.predict(final_test_data)
# NOTE(review): assumes sample_submission.csv lists rows in the same order as test.csv — confirm
submission['price'] = final_pred

In [None]:
# Write the submission file into the data directory, without the index column
submission.to_csv(path + 'random_forest.csv', index = False)