In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from numpy.random import permutation
from sklearn.inspection import permutation_importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import plotly.graph_objects as go
import chart_studio

In [67]:
DATA_PATH = './data/'

In [68]:
###DATA WRANGLING/EDA

def wrangle(data):
    df = pd.read_csv(data,
                parse_dates=['last_scraped','host_since']
               ).set_index('last_scraped')
    
    #Dropping columns with high similarity columns
    df.drop(columns=['id','host_id','host_url','host_location',
                    'host_about','host_response_rate','host_listings_count',
                    'host_neighbourhood','host_total_listings_count',
                    'calendar_last_scraped','calculated_host_listings_count',
                    'calculated_host_listings_count_entire_homes',
                    'calculated_host_listings_count_private_rooms',
                    'calculated_host_listings_count_shared_rooms',
                    'host_has_profile_pic','bathrooms','has_availability'], inplace=True)
    
    #Dropping columns about reviews and columns with low gini importance
    df.drop(columns=['review_scores_accuracy','review_scores_cleanliness',
                    'review_scores_checkin', 'review_scores_communication','review_scores_location',
                    'review_scores_value','reviews_per_month','minimum_nights','maximum_nights',
                    'availability_30','availability_60','availability_90',
                    'availability_365','number_of_reviews_ltm','number_of_reviews_l30d','minimum_minimum_nights', 'maximum_minimum_nights',
                    'minimum_maximum_nights', 'maximum_maximum_nights',
                    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'longitude', 'latitude','neighbourhood'], inplace=True)
    
    #Converted target as object into float
    df['price']  = df['price'].str.strip('$')
    df['price']  = df['price'].str.replace(',','').astype(float)
    
    #Dropped outliers
    df = df.loc[df['price'] < 50000]
    
    numcard = 50

    highcard = [col for col in df.select_dtypes(exclude='number')
            if df[col].nunique() > numcard]
    lowcard = [col for col in df
            if df[col].nunique() <= 1]           
    df.drop(columns=highcard, inplace=True)
    df.drop(columns=lowcard, inplace=True)
    
    
    #Converted t and f object type into 1 and 0
    df.replace('t',1, inplace=True)
    df.replace('f',0, inplace=True)
    
    #Preprocessed NaN values
    df['host_response_time'].fillna('within an hour',inplace=True)
    df['bedrooms'].fillna(1,inplace=True)
    
    #Dropping rows with NA values
    df = df.dropna(axis=0, subset=['bedrooms', 'beds'])
  
    
    return df

In [69]:
dfc = wrangle(DATA_PATH+'chicago.csv')

In [70]:
dfc = dfc.drop(columns=['bathrooms_text','number_of_reviews','review_scores_rating','instant_bookable'])

In [71]:
df = dfc.dropna()

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6107 entries, 2021-04-22 to 2021-04-21
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   host_response_time      6107 non-null   object 
 1   host_is_superhost       6107 non-null   float64
 2   host_identity_verified  6107 non-null   float64
 3   property_type           6107 non-null   object 
 4   room_type               6107 non-null   object 
 5   accommodates            6107 non-null   int64  
 6   bedrooms                6107 non-null   float64
 7   beds                    6107 non-null   float64
 8   price                   6107 non-null   float64
dtypes: float64(5), int64(1), object(3)
memory usage: 477.1+ KB


In [73]:
df.index.value_counts()

2021-04-20    2700
2021-04-21    2332
2021-04-19     710
2021-04-22     352
2021-04-23      13
Name: last_scraped, dtype: int64

In [74]:
###SPLITTING DATA
target = 'price'
y = df[target]
X = df.drop(columns=target)

In [75]:
cutoff = '2021-04-22'
mask = X.index < cutoff
X_train, y_train = X.loc[mask], y.loc[mask]
X_test, y_test = X.loc[~mask], y.loc[~mask]

In [76]:
###ESTABLISHING BASELINE
print('Mean AirBnB Price:', y_train.mean())
y_pred = [y_train.mean()] * len(y_train)
print('Baseline MAE:', mean_absolute_error(y_train, y_pred))
print('Baseline RMSE:', mean_squared_error(y_train, y_pred, squared=False))

Mean AirBnB Price: 164.01671891327064
Baseline MAE: 115.1740184972268
Baseline RMSE: 375.24334943916784


In [77]:
###BUILDING MODELS

#Ridge Regressor

model_rr = make_pipeline(
    OneHotEncoder(),
    SimpleImputer(),
    Ridge(alpha=10)
)

model_rr.fit(X_train, y_train);    

  elif pd.api.types.is_categorical(cols):


In [78]:
#RandomForestRegressor model

model_rf = make_pipeline(
    OneHotEncoder(),
    SimpleImputer(),
    RandomForestRegressor(random_state=42)
)

model_rf.fit(X_train, y_train); 

  elif pd.api.types.is_categorical(cols):


In [21]:

#XGboostRegressor model

model_xgb = make_pipeline(
    OneHotEncoder(),
    SimpleImputer(),
    XGBRegressor(random_state=42, 
                 n_jobs=-1,
                 verbose=True)
);


model_xgb.fit(X_train, y_train);



Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [79]:
###CHECKING METRICS

#Ridge model metrics
y_pred_train_rr = model_rr.predict(X_train)
y_pred_test_rr = model_rr.predict(X_test)
train_MAE_rr = mean_absolute_error(y_train, y_pred_train_rr)
test_MAE_rr = mean_absolute_error(y_test, y_pred_test_rr)
train_RMSE_rr = mean_squared_error(y_train, y_pred_train_rr, squared=False)
test_RMSE_rr = mean_squared_error(y_test, y_pred_test_rr, squared=False)

print('RIDGE REGRESSOR METRICS:')
print('Train MAE:',train_MAE_rr)
print('Test MAE:', test_MAE_rr)
print('Train RMSE:', train_RMSE_rr)
print('Test RMSE:', test_RMSE_rr)
print('Training R^2:', r2_score(y_train, model_rr.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_rr.predict(X_test)))
print('Training Accuracy:', model_rr.score(X_train, y_train))
print('Test Accuracy:', model_rr.score(X_test, y_test))

#RF Regressor metrics
y_pred_train_rf = model_rf.predict(X_train)
y_pred_test_rf = model_rf.predict(X_test)
train_MAE_rf = mean_absolute_error(y_train, y_pred_train_rf)
test_MAE_rf = mean_absolute_error(y_test, y_pred_test_rf)
train_RMSE_rf = mean_squared_error(y_train, y_pred_train_rf, squared=False)
test_RMSE_rf = mean_squared_error(y_test, y_pred_test_rf, squared=False)
print('')
print('RF REGRESSOR METRICS:')
print('Train MAE:',train_MAE_rf)
print('Test MAE:', test_MAE_rf)
print('Train RMSE:', train_RMSE_rf)
print('Test RMSE:', test_RMSE_rf)
print('Training R^2:', r2_score(y_train, model_rf.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_rf.predict(X_test)))
print('Training Accuracy:', model_rf.score(X_train, y_train))
print('Test Accuracy:', model_rf.score(X_test, y_test))

#XGB model metrics
y_pred_train_xgb = model_xgb.predict(X_train)
y_pred_test_xgb = model_xgb.predict(X_test)
train_MAE_xgb = mean_absolute_error(y_train, y_pred_train_xgb)
test_MAE_xgb = mean_absolute_error(y_test, y_pred_test_xgb)
train_RMSE_xgb = mean_squared_error(y_train, y_pred_train_xgb, squared=False)
test_RMSE_xgb = mean_squared_error(y_test, y_pred_test_xgb, squared=False)
print('')
print('XGBOOST METRICS:')
print('Train MAE:',train_MAE_xgb)
print('Test MAE:', test_MAE_xgb)
print('Train RMSE:', train_RMSE_xgb)
print('Test RMSE:', test_RMSE_xgb)
print('Training R^2:', r2_score(y_train, model_xgb.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_xgb.predict(X_test)))
print('Training Accuracy:', model_xgb.score(X_train, y_train))
print('Test Accuracy:', model_xgb.score(X_test, y_test))


RIDGE REGRESSOR METRICS:
Train MAE: 95.89187617868133
Test MAE: 78.48947022748082
Train RMSE: 346.11802182512616
Test RMSE: 137.2222899498369
Training R^2: 0.14920991870297795
Test R^2: -0.1479669157435115
Training Accuracy: 0.14920991870297795
Test Accuracy: -0.1479669157435115

RF REGRESSOR METRICS:
Train MAE: 59.1846148818388
Test MAE: 64.48897541252299
Train RMSE: 213.55373385918443
Test RMSE: 123.02173030706437
Training R^2: 0.6761168676890477
Test R^2: 0.07733571690654173
Training Accuracy: 0.6761168676890477
Test Accuracy: 0.07733571690654173


NameError: name 'model_xgb' is not defined

In [23]:
###TUNING (HYPERPARAM TUNING AND COMPARING GINI IMPORTANCE AND PERMUTATION IMPORTANCE)

#Hyperparam tuning for ridge model
rr_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'ridge__alpha': [1,5,10,15,20,25],
}

model_rr_gs = GridSearchCV(
    model_rr,
    param_grid=rr_params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

model_rr_gs.fit(X_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [80]:


rf_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'randomforestregressor__n_estimators': [100,150,200,250,300],
    'randomforestregressor__max_depth': range(5, 36, 3)
}

model_rf_gs = GridSearchCV(
    model_rf,
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

model_rf_gs.fit(X_train, y_train);


Fitting 5 folds for each of 110 candidates, totalling 550 fits


  elif pd.api.types.is_categorical(cols):


In [25]:
xgb_params = {
    'simpleimputer__strategy': ['mean', 'median'],
    'xgbregressor__max_depth': range(3,15,3),
    'xgbregressor__gamma': range(1,5,1),
    'xgbregressor__n_estimators': range(5,30,5)
}

model_xgb_gs = GridSearchCV(
    model_xgb,
    param_grid=xgb_params,
    cv=5,
    n_jobs=-1,
    verbose=1
)

model_xgb_gs.fit(X_train, y_train);


Fitting 5 folds for each of 160 candidates, totalling 800 fits
Parameters: { verbose } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [26]:
#Ridge modelv2 metrics
y_pred_train_rr_gs = model_rr_gs.predict(X_train)
y_pred_test_rr_gs = model_rr_gs.predict(X_test)
train_MAE_rr_gs = mean_absolute_error(y_train, y_pred_train_rr_gs)
test_MAE_rr_gs = mean_absolute_error(y_test, y_pred_test_rr_gs)
train_RMSE_rr_gs = mean_squared_error(y_train, y_pred_train_rr_gs, squared=False)
test_RMSE_rr_gs = mean_squared_error(y_test, y_pred_test_rr_gs, squared=False)
print('RIDGE REGRESSORv2 METRICS:')
print('Train MAE:',train_MAE_rr_gs)
print('Test MAE:', test_MAE_rr_gs)
print('Train RMSE:', train_RMSE_rr_gs)
print('Test RMSE:', test_RMSE_rr_gs)
print('Training R^2:', r2_score(y_train, model_rr_gs.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_rr_gs.predict(X_test)))
#RF modelv2 metrics
y_pred_train_rf_gs = model_rf_gs.predict(X_train)
y_pred_test_rf_gs = model_rf_gs.predict(X_test)
train_MAE_rf_gs = mean_absolute_error(y_train, y_pred_train_rf_gs)
test_MAE_rf_gs = mean_absolute_error(y_test, y_pred_test_rf_gs)
train_RMSE_rf_gs = mean_squared_error(y_train, y_pred_train_rf_gs, squared=False)
test_RMSE_rf_gs = mean_squared_error(y_test, y_pred_test_rf_gs, squared=False)
print('')
print('RF REGRESSORv2 METRICS:')
print('Train MAE:',train_MAE_rf_gs)
print('Test MAE:', test_MAE_rf_gs)
print('Train RMSE:', train_RMSE_rf_gs)
print('Test RMSE:', test_RMSE_rf_gs)
print('Training R^2:', r2_score(y_train, model_rf_gs.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_rf_gs.predict(X_test)))

#XGB modelv2 metrics
y_pred_train_xgb_gs = model_xgb_gs.predict(X_train)
y_pred_test_xgb_gs = model_xgb_gs.predict(X_test)
train_MAE_xgb_gs = mean_absolute_error(y_train, y_pred_train_xgb_gs)
test_MAE_xgb_gs = mean_absolute_error(y_test, y_pred_test_xgb_gs)
train_RMSE_xgb_gs = mean_squared_error(y_train, y_pred_train_xgb_gs, squared=False)
test_RMSE_xgb_gs = mean_squared_error(y_test, y_pred_test_xgb_gs, squared=False)
print('')
print('XGBoost Regressorv2 METRICS:')
print('Train MAE:',train_MAE_xgb_gs)
print('Test MAE:', test_MAE_xgb_gs)
print('Train RMSE:', train_RMSE_xgb_gs)
print('Test RMSE:', test_RMSE_xgb_gs)
print('Training R^2:', r2_score(y_train, model_xgb_gs.predict(X_train)))
print('Test R^2:', r2_score(y_test, model_xgb_gs.predict(X_test)))
print('Model best score:', model_xgb_gs.best_score_)

RIDGE REGRESSORv2 METRICS:
Train MAE: 4658.311340822114
Test MAE: 4533.919147781359
Train RMSE: 6639.718790445849
Test RMSE: 6288.551618377498
Training R^2: 0.3632515485189397
Test R^2: 0.3186297566148112

RF REGRESSORv2 METRICS:
Train MAE: 4626.293278999918
Test MAE: 4650.272563888073
Train RMSE: 6547.03991097354
Test RMSE: 6417.435785755009
Training R^2: 0.3809032818820852
Test R^2: 0.29041411738141176

XGBoost Regressorv2 METRICS:
Train MAE: 4430.752888182196
Test MAE: 4478.413078796897
Train RMSE: 6336.498563066437
Test RMSE: 6255.114500360102
Training R^2: 0.4200811755072703
Test R^2: 0.3258563767500491
Model best score: 0.3366625586782572


In [27]:
#Gini importance
importances = model_xgb.named_steps['xgbregressor'].feature_importances_

feat_imp = pd.Series(importances, index=X.columns).sort_values()
feat_imp.tail(10).plot(kind='barh')
plt.xlabel('Gini importance')
plt.ylabel('Feature')
plt.title('Feature importance for model_xgb');

ValueError: Length of passed values is 101, index implies 12.

In [None]:
#Permutation importance
perm_imp = permutation_importance(model_xgb_gs,
                                   X_test,
                                   y_test,
                                   n_jobs=-1,
                                   random_state=42)

data_ = {'importances_mean' : perm_imp['importances_mean'],
        'importances_std' : perm_imp['importances_std']}


permdf = pd.DataFrame(data_, index=X_test.columns)
permdf.sort_values(by='importances_mean', inplace=True)

permdf['importances_mean'].tail(10).plot(kind='barh')
plt.xlabel('Importance (drop in accuracy)')
plt.ylabel('Feature')
plt.title('Permutation importance for model_xgb');

In [None]:
###COMMUNICATING RESULTS

feature = 'accommodates'

#Plotting partial dependency for 'accommodates' feature
isolate = pdp_isolate(
    model=model_xgb,
    dataset=X_test,
    model_features=X_test.columns,
    feature=feature
)

pdp_plot(isolate, feature_name=feature);

In [None]:
#Plotting partial dependency for the first top 2 features:
#('bedroom' and 'accomodates')

top2feat = ['accommodates', 'bedrooms']

interact = pdp_interact(
    model=model_xgb,
    dataset=X_test,
    model_features=X_test.columns,
    features=top2feat
)

pdp_interact_plot(interact, plot_type='grid',feature_names=top2feat);

In [None]:
location = ['longitude', 'latitude']


interact = pdp_interact(
    model=model_xgb,
    dataset=X_test,
    model_features=X_test.columns,
    features = location
)

pdp_interact_plot(interact, plot_type='grid',feature_names=location);

In [None]:
df.corr()

In [81]:
!pip install joblib



In [82]:
from joblib import dump

dump(model_rf_gs, "firstmodel.joblib", compress=True)

['firstmodel.joblib']