## Machine Learning Analysis

### Environment setup

In [1]:
! pip install -r requiriments.txt --quiet

In [2]:
import math
import sys
import numpy  as np  
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [3]:
# to load custom libraries
LIBRARY_PATH = "."
sys.path.append(LIBRARY_PATH)

# load custom libraries
from utilsDataProc import data_proc

### Load data

In [4]:
# load data
airbnb_data = pd.read_csv("../data/airbnb-listings-extract.csv", delimiter = ';') # load data
print(f'Number of samples: {airbnb_data.shape[0]}\nNumber of features: {airbnb_data.shape[1]}')

Number of samples: 14780
Number of features: 89


#### Split data into train and test datasets

In [5]:
# split train and test
df_train, df_test = train_test_split(airbnb_data, test_size = 0.3, shuffle = True, random_state = 0)
print(f'Samples in train dataset: {df_train.shape[0]}')
print(f'Samples in test dataset: {df_test.shape[0]}')

Samples in train dataset: 10346
Samples in test dataset: 4434


### Data processing

In [6]:
# process data
features = {
    'standarizate': ['Beds', 'Guests Included', 'Bathrooms'],
    'onehotencode': ['Room Type'],
    'targetencode': ['Room Type', 'Country Code'],
    'target': 'Price',
    'to_drop': ['translation missing: en.hosting_amenity_50', 'translation missing: en.hosting_amenity_49']
}
X_train, X_test, y_train, y_test = data_proc(df_train, df_test, features)



This warning appears because some amenities present in the training dataset are missing in the test dataset.

### Train model

In [7]:
# features for Room Type One Hot Encoding 
room_type_ohe = [col for col in X_train.columns if 'Room Type_' in col]

# selected features for Amenities
amenities_sel = ['dryer', 'indoor fireplace', 'no services', 'air conditioning', 'cable tv', 
                 'family/kid friendly', 'pool', 'tv', 'free parking on premises', 'hot tub', 
                 'lock on bedroom door', 'pets live on this property', 'wireless internet', 
                 'suitable for events', 'cat(s)']

# combinations of features to check 
dataproc = [features['standarizate'],
features['standarizate'] + room_type_ohe,
features['standarizate'] + room_type_ohe,
features['standarizate'] + ['Room Type TE'],
features['standarizate'] + room_type_ohe + ['Country Code TE'],
features['standarizate'] + ['Room Type TE', 'Country Code TE'],
features['standarizate'] + room_type_ohe + ['Country Code TE'] + amenities_sel,
features['standarizate'] + ['Room Type TE', 'Country Code TE'] + amenities_sel]

In [None]:
# combinations of model to check (algorithm + parameters)
experiments = {
    'algorithms': [KNeighborsRegressor(), Ridge(), RandomForestRegressor(random_state = 42), SVR()],
    'parameters': [{'n_neighbors': [6, 7, 8], 'weights': ['uniform', 'distance']},
                    {'alpha': [0.2, 0.4, 0.6]},
                    {'n_estimators': [50, 100], 'max_depth': [5, 10, 15]},
                    {'kernel': ['linear', 'rbf']}
                  ]
}

# run experiments: features + algorithm + parameters
results = []
for dp in dataproc:
    X_train_sel = X_train[dp]    
    for index, algorithm in enumerate(experiments['algorithms']):
        params = experiments['parameters'][index]
        grid = GridSearchCV(algorithm, param_grid = params, cv = 5, 
                            scoring= 'neg_mean_squared_error', return_train_score = True)
        grid.fit(X_train_sel, y_train)

        lst = list(grid.get_params()['param_grid'].values())
        for params, mse in zip(itertools.product(*lst),-grid.cv_results_['mean_test_score']):
            params_dict = {param: value for param, value in zip(grid.get_params()['param_grid'].keys(), params)}
            result = {'features': dp, 'algorithm': algorithm, 'parameters': params_dict,
                  'mse_val': mse}
            results.append(result)

# TODO show mean_train_score, mean_fit_time (grid.cv_results_)

In [None]:
best_model = pd.DataFrame(results).sort_values('mse_val').iloc[0,:].to_dict()
for key, value in best_model.items():
    print(f'{key}: {value}')

### Train/Test best model

In [None]:
model = RandomForestRegressor(random_state = 42, n_estimators = 100, max_depth = 5)
X_train_sel = X_train[best_model['features']]
model.fit(X_train_sel, y_train)
y_train_predict = model.predict(X_train_sel)
train_mse = mean_squared_error(y_train, y_train_predict)
print(f'MSE train: {train_mse}')

X_test_sel = X_test[best_model['features']]
y_test_predict = model.predict(X_test_sel)

test_mse = mean_squared_error(y_test, y_test_predict)
test_rmse = math.sqrt(test_mse)
test_r2 = model.score(X_test_sel, y_test)
print(f'\nMSE test: {test_mse}')
print(f'RMSE test: {test_rmse}')
print(f'R2 test: {test_r2}')

The best-performing model uses all available features, with improved results when applying target encoding to the 'Room Type' and 'Country Code' features. The top-performing algorithm is a RandomForestRegressor configured with 100 estimators and a maximum depth of 5. The MSE (Mean Squared Error) values are similar for both the train and test datasets, indicating no signs of overfitting.