# TODO

Exploratory Data Analysis
- [X] Maybe make a pairplot?
- [ ] [Camila] Compute and plot the correlation between features
- [ ] [Camila] Analyze Null values -> There are none, but we need to write about it
- [ ] [Camila] Check outliers
- [ ] [Camila] Take care of the Lat/Long variables (how are we going to use them?)

Pre-Modeling
- [X] Split the data into train and test
- [X] Decide what are our target variable(s)
- [X] Create the grid search for hyper parameter optimization and model selection
- [ ] Apply PCA to see if we can benefit from it (Low prio)

Modeling
- [X] [Dio] Check Feature Selection
- [ ] Try to build a classifier (Low prio)
- [ ] [Dio] Feature Importance

Post-modeling
- [ ] [Camila] Plot residuals

# Notes

- [Link to paper](https://zenodo.org/records/4446043)

# Airbnb: Amsterdam x Paris

In [1]:
# !pip install ydata_profiling ipywidgets

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from ydata_profiling import ProfileReport

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import linear_model
from sklearn import pipeline
from sklearn import ensemble
from sklearn import compose
from sklearn import feature_selection
from sklearn import neural_network

In [3]:
SEED = 13
TEST_DATASET_RATIO = 0.3
RAW_DATA_PATH = '../raw_data'
DTYPES = {
    'realSum':                    'float64',
    'room_type':                  'category',  # Three possible categories: ['Entire home/apt', 'Private room', 'Shared room']
    'room_shared':                'bool',
    'room_private':               'bool',
    'person_capacity':            'int8',
    'host_is_superhost':          'bool',
    'multi':                      'bool',  # multi seems to be a boolean variable.
    'biz':                        'bool',  # biz seems to be a boolean variable.
    'cleanliness_rating':         'int8',
    'guest_satisfaction_overall': 'int16',
    'bedrooms':                   'int8',
    'dist':                       'float64',
    'metro_dist':                 'float64',
    'attr_index':                 'float64',
    'attr_index_norm':            'float64',
    'rest_index':                 'float64',
    'rest_index_norm':            'float64',
    'lng':                        'float64',
    'lat':                        'float64'
}

In [4]:
def _read_csv(path: str) -> pd.DataFrame:
    """
    Helper function to read csv's, set their index properly and convert the datatypes here accordingly.
    Many times the automatic detection was using Int64 for variables with a small range of values (like `person_capacity`),
    using integeres instead of booleans (for `multi` and `biz`) or not using the categorical datatype (`room_type`).
    """
    return pd.read_csv(
        path,
        index_col=0,
        dtype=DTYPES
    )


def _read_city(city: str) -> pd.DataFrame:
    """
    Helper function to generate a merged dataframe for a specified @city.
    It combines the weekdays and the weekends data into a single dataframe.
    """
    assert city in ['amsterdam', 'paris'], "ERROR: Invalid city. Options are 'amsterdam' or 'paris'"
    
    weekday_path = f"{RAW_DATA_PATH}/{city}_weekdays.csv"
    weekday = _read_csv(weekday_path)
    weekday['is_weekend'] = False

    weekend_path = f"{RAW_DATA_PATH}/{city}_weekends.csv"
    weekend = _read_csv(weekend_path)
    weekend['is_weekend'] = True

    merged = pd.concat([weekday, weekend])
    merged['city'] = city

    return merged.reset_index(drop=True)


def get_merged_df() -> pd.DataFrame:
    """
    Reads the data from both files (weekends and weekdays) and both cities (Amsterdam and Paris) and merge them into a single dataframe.
    It also corrects the datatypes and sets the index properly.
    """
    amsterdam = _read_city('amsterdam')
    paris = _read_city('paris')
    
    merged = pd.concat([amsterdam, paris])
    merged = merged.reset_index(drop=True)
    merged['city'] = merged['city'].astype("category")
    
    return merged

In [5]:
merged_df = get_merged_df()
merged_df.head()

Unnamed: 0,realSum,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,...,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,is_weekend,city
0,194.033698,Private room,False,True,2,False,True,False,10,93,...,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,4.90569,52.41772,False,amsterdam
1,344.245776,Private room,False,True,4,False,False,False,8,85,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,4.90005,52.37432,False,amsterdam
2,264.101422,Private room,False,True,2,False,False,True,9,87,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,4.97512,52.36103,False,amsterdam
3,433.529398,Private room,False,True,4,False,False,True,9,90,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,4.89417,52.37663,False,amsterdam
4,485.552926,Private room,False,True,2,True,False,False,10,98,...,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,4.90051,52.37508,False,amsterdam


## Notes
I am stratifying the train-test split by `city` and by `is_weekend`.

## Train Test Split

In [6]:
X = merged_df.drop('realSum', axis=1)
y = merged_df['realSum']

In [7]:
stratify_col = 2 * preprocessing.LabelEncoder().fit_transform(X['city']) + preprocessing.LabelEncoder().fit_transform(X['is_weekend'])

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=TEST_DATASET_RATIO,
    random_state=SEED,
    shuffle=True,
    stratify=stratify_col
)

In [9]:
y_train

5367    339.500419
5355    324.354553
5607    316.199087
5239    486.531830
4545    669.913319
           ...    
1859    276.287114
7862    227.654022
6599    356.976419
1913    448.527172
4808    237.207568
Name: realSum, Length: 6137, dtype: float64

In [10]:
X_train

Unnamed: 0,room_type,room_shared,room_private,person_capacity,host_is_superhost,multi,biz,cleanliness_rating,guest_satisfaction_overall,bedrooms,dist,metro_dist,attr_index,attr_index_norm,rest_index,rest_index_norm,lng,lat,is_weekend,city
5367,Entire home/apt,False,False,2,False,False,True,9,95,0,0.883243,0.184813,459.562517,22.346261,1114.340004,51.407103,2.35700,48.86400,True,paris
5355,Private room,False,True,3,False,False,True,9,89,2,1.189804,0.185527,731.393669,35.564070,1588.161245,73.265581,2.33700,48.85400,True,paris
5607,Entire home/apt,False,False,4,False,False,True,9,88,1,0.820219,0.350694,489.655080,23.809514,1053.609050,48.605442,2.36247,48.86021,True,paris
5239,Entire home/apt,False,False,4,False,False,False,9,95,0,0.321040,0.312985,778.098033,37.835073,1181.983859,54.527671,2.35327,48.85370,True,paris
4545,Private room,False,True,4,False,False,False,10,100,1,5.288413,0.380616,364.491874,17.743342,565.843654,33.248711,2.28068,48.85790,False,paris
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1859,Private room,False,True,2,False,False,False,10,80,1,4.199364,3.033310,92.990260,4.923408,117.525496,10.245402,4.89030,52.41094,True,amsterdam
7862,Entire home/apt,False,False,2,False,False,False,10,90,1,6.083895,0.248021,228.374698,11.104736,424.000999,19.560155,2.27188,48.84444,True,paris
6599,Entire home/apt,False,False,4,True,False,False,9,98,1,4.205269,0.188261,296.964766,14.439933,661.650929,30.523500,2.32900,48.89100,True,paris
1913,Entire home/apt,False,False,2,False,True,False,10,100,1,1.822919,1.945724,208.904591,11.060542,270.516362,23.582533,4.87299,52.38405,True,amsterdam


# Exploratory Data Analysis

In [11]:
profile = ProfileReport(X_train, title="Profiling Report")

In [12]:
# sns.pairplot(pd.concat([X_train, y_train], axis=1))

In [13]:
X_train[['bedrooms', 'person_capacity', 'room_type']].query('bedrooms == 0').value_counts()

bedrooms  person_capacity  room_type      
0         2                Entire home/apt    710
          4                Entire home/apt    103
          2                Private room        94
          3                Entire home/apt     86
          5                Entire home/apt      5
          3                Private room         3
          4                Private room         3
          6                Entire home/apt      3
Name: count, dtype: int64

# Model Training and Selection

In [14]:
columns_to_drop = [
    'attr_index',
    'rest_index',
    'lat',
    'lng',
    'room_shared',
    'room_private',
    'bedrooms'
]

In [15]:
encoder = compose.make_column_transformer(
    (
        'scaler',
        X_train.select_dtypes('number').columns
    ),
    (
        preprocessing.OneHotEncoder(sparse_output=False, drop='first'),
        X_train.select_dtypes('category').columns
    ),
    (
        preprocessing.OneHotEncoder(sparse_output=False, drop='first'),
        X_train.select_dtypes('bool').columns
    ),
    (
        'drop',
        columns_to_drop
    ),
    remainder='passthrough'
)

In [16]:
model = pipeline.Pipeline([
    ('encoder', encoder),
    ('feat_select', None),
    ('regressor', None),
])

In [17]:
parameters = [
    {
        "encoder__scaler": [preprocessing.StandardScaler()],
        
        "feat_select": [feature_selection.GenericUnivariateSelect()],
        "feat_select__score_func": [feature_selection.mutual_info_regression],
        "feat_select__mode": ["k_best"],
        "feat_select__param": [4, 7, 10, 15, 18, 'all'],
        
        "regressor": [linear_model.Ridge()],
        "regressor__alpha": [0.1, 0.5, 1.0]
    },
    {
        "encoder__scaler": [preprocessing.StandardScaler()],
        
        "feat_select": [feature_selection.GenericUnivariateSelect()],
        "feat_select__score_func": [feature_selection.mutual_info_regression],
        "feat_select__mode": ["k_best"],
        "feat_select__param": [4, 7, 10, 15, 18, 'all'],
        
        "regressor": [ensemble.RandomForestRegressor()],
        "regressor__n_estimators": [10, 50, 100],
        "regressor__min_samples_leaf": [1, 5, 10],
        "regressor__max_features": [1, 'sqrt', 'log2'],
    },
    # {
    #     "encoder__scaler": [preprocessing.StandardScaler()],
        
    #     "feat_select": [feature_selection.GenericUnivariateSelect()],
    #     "feat_select__score_func": [feature_selection.mutual_info_regression],
    #     "feat_select__mode": ["k_best"],
    #     "feat_select__param": [4, 7, 10, 15, 18, 'all'],
        
    #     "regressor": [neural_network.MLPRegressor()],
    #     "regressor__max_iter": [1_000],
    #     "regressor__alpha": [0.001],
    #     "regressor__learning_rate_init": [0.01],
    #     "regressor__hidden_layer_sizes": [(2,), (8,), (64,), (256,), (2, 2), (4, 4)]
        
    # }
]

In [18]:
%%time
clf = model_selection.GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=5, verbose=3, n_jobs=-1, error_score='raise')  # Stratified by default
clf.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 5/5] END encoder__scaler=StandardScaler(), feat_select=GenericUnivariateSelect(), feat_select__mode=k_best, feat_select__param=4, feat_select__score_func=<function mutual_info_regression at 0x7fcc7aae4680>, regressor=Ridge(), regressor__alpha=0.5;, score=-114571.759 total time=   0.6s
[CV 2/5] END encoder__scaler=StandardScaler(), feat_select=GenericUnivariateSelect(), feat_select__mode=k_best, feat_select__param=10, feat_select__score_func=<function mutual_info_regression at 0x7fcc7aae4680>, regressor=Ridge(), regressor__alpha=0.1;, score=-59315.366 total time=   0.6s
[CV 2/5] END encoder__scaler=StandardScaler(), feat_select=GenericUnivariateSelect(), feat_select__mode=k_best, feat_select__param=15, feat_select__score_func=<function mutual_info_regression at 0x7fcc7aae4680>, regressor=Ridge(), regressor__alpha=0.5;, score=-55800.263 total time=   0.6s
[CV 1/5] END encoder__scaler=StandardScaler(), feat_select=GenericU

In [19]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_encoder__scaler,param_feat_select,param_feat_select__mode,param_feat_select__param,param_feat_select__score_func,param_regressor,...,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.609972,0.053786,0.006944,0.000994,StandardScaler(),GenericUnivariateSelect(),k_best,4,<function mutual_info_regression at 0x7fa120e6...,Ridge(),...,,"{'encoder__scaler': StandardScaler(), 'feat_se...",-54908.773714,-83848.573272,-272323.449619,-90808.903871,-114629.404422,-123303.820980,76905.934651,172
1,0.611109,0.030713,0.007241,0.001273,StandardScaler(),GenericUnivariateSelect(),k_best,4,<function mutual_info_regression at 0x7fa120e6...,Ridge(),...,,"{'encoder__scaler': StandardScaler(), 'feat_se...",-54881.671743,-83829.225004,-272277.980472,-90864.692595,-114571.759233,-123285.065809,76891.714279,171
2,0.621301,0.043467,0.007609,0.001264,StandardScaler(),GenericUnivariateSelect(),k_best,4,<function mutual_info_regression at 0x7fa120e6...,Ridge(),...,,"{'encoder__scaler': StandardScaler(), 'feat_se...",-54872.009978,-83824.681348,-272239.175674,-90925.065099,-114536.288789,-123279.444178,76874.580432,170
3,0.608686,0.045506,0.007767,0.001308,StandardScaler(),GenericUnivariateSelect(),k_best,7,<function mutual_info_regression at 0x7fa120e6...,Ridge(),...,,"{'encoder__scaler': StandardScaler(), 'feat_se...",-54713.220750,-84284.493408,-256689.444428,-90640.245552,-115473.862657,-120360.253359,70857.317706,165
4,0.554496,0.009460,0.005846,0.000050,StandardScaler(),GenericUnivariateSelect(),k_best,7,<function mutual_info_regression at 0x7fa120e6...,Ridge(),...,,"{'encoder__scaler': StandardScaler(), 'feat_se...",-38285.036805,-84341.000222,-271814.061365,-90834.551865,-89969.595121,-115048.849076,80779.066357,135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,1.258477,0.046169,0.021433,0.004597,StandardScaler(),GenericUnivariateSelect(),k_best,all,<function mutual_info_regression at 0x7fa120e6...,RandomForestRegressor(),...,50,"{'encoder__scaler': StandardScaler(), 'feat_se...",-24485.301517,-44847.958460,-238532.795179,-51259.720168,-72855.075829,-86396.170231,77617.902030,15
176,1.595555,0.055324,0.018962,0.002037,StandardScaler(),GenericUnivariateSelect(),k_best,all,<function mutual_info_regression at 0x7fa120e6...,RandomForestRegressor(),...,100,"{'encoder__scaler': StandardScaler(), 'feat_se...",-23667.213267,-44449.068586,-237755.675128,-50997.382059,-72176.015661,-85809.070941,77534.179138,4
177,0.830764,0.020187,0.010649,0.001462,StandardScaler(),GenericUnivariateSelect(),k_best,all,<function mutual_info_regression at 0x7fa120e6...,RandomForestRegressor(),...,10,"{'encoder__scaler': StandardScaler(), 'feat_se...",-28452.387073,-47896.896648,-238728.754002,-55072.128483,-79995.499312,-90029.133104,76159.128455,54
178,1.005562,0.047131,0.010640,0.000124,StandardScaler(),GenericUnivariateSelect(),k_best,all,<function mutual_info_regression at 0x7fa120e6...,RandomForestRegressor(),...,50,"{'encoder__scaler': StandardScaler(), 'feat_se...",-24860.288356,-46239.612991,-238682.341267,-52351.137676,-76265.379851,-87679.752028,77257.686971,27


In [20]:
clf.best_estimator_

In [21]:
clf.best_estimator_.get_params()

{'memory': None,
 'steps': [('encoder',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('scaler', StandardScaler(),
                                    Index(['person_capacity', 'cleanliness_rating', 'guest_satisfaction_overall',
          'bedrooms', 'dist', 'metro_dist', 'attr_index', 'attr_index_norm',
          'rest_index', 'rest_index_norm', 'lng', 'lat'],
         dtype='object')),
                                   ('onehotencoder-1',
                                    OneHotEncoder(drop='first',
                                                  sparse_output=False),
                                    Index(['room_type', 'city'], dtype='object')),
                                   ('onehotencoder-2',
                                    OneHotEncoder(drop='first',
                                                  sparse_output=False),
                                    Index(['room_shared', 'room_private', 'host_is_superhost', 'multi', 'biz'

In [22]:
clf.best_estimator_.score(X_train, y_train)

0.9030886394983859

In [23]:
pd.DataFrame(y_train)

Unnamed: 0,realSum
5367,339.500419
5355,324.354553
5607,316.199087
5239,486.531830
4545,669.913319
...,...
1859,276.287114
7862,227.654022
6599,356.976419
1913,448.527172


In [24]:
pd.DataFrame(clf.predict(X_train))

Unnamed: 0,0
0,340.576941
1,325.421754
2,355.573679
3,508.854506
4,638.079038
...,...
6132,276.605816
6133,244.822444
6134,331.992730
6135,484.770229


In [25]:
pd.concat([pd.DataFrame(clf.predict(X_train)), y_train.reset_index(drop=True), X_train[['city', 'is_weekend']].reset_index(drop=True)], axis=1)

Unnamed: 0,0,realSum,city,is_weekend
0,340.576941,339.500419,paris,True
1,325.421754,324.354553,paris,True
2,355.573679,316.199087,paris,True
3,508.854506,486.531830,paris,True
4,638.079038,669.913319,paris,False
...,...,...,...,...
6132,276.605816,276.287114,amsterdam,True
6133,244.822444,227.654022,paris,True
6134,331.992730,356.976419,paris,True
6135,484.770229,448.527172,amsterdam,True
