# Machine Learning Modeling
- Name: Minh T. Nguyen
- Date: 11/24/2023
- About:
    - Renting price prediction with classical ML model on the dataset (without image feature extraction)
    - For the sake of simplicity, we will only split the dataset into train/test. No need for grid-search, validation set, or k-fold validation for this project. For model comparison, we will only check for test accuracy for simplicity.
    - Models to consider:
        - KNN
        - Neural Networks
        - SVM
        - Decision Tree
        - Random Forests

In [1]:
!ls ../data

final_dataset_image.json     sentimental_extraction_kaggle.csv
final_dataset_no_image.json  sentimental_extraction_kaggle.json
images_sample		     sentimental_extraction_sample.csv
Kaggle-renthop.torrent	     train.json


**Note:** The datasets can be found [here]((https://www.kaggle.com/competitions/two-sigma-connect-rental-listing-inquiries/data?select=train.json.zip)).
- train.json: the training set.
- images_sample.zip: listing images organized by listing_id (a sample of 100 listings)
- Kaggle-renthop.7z: listing images organized by listing_id. Total size: 78.5 GB compressed.

In [2]:
# import libraries
import numpy as np
import pandas as pd
from collections import Counter
import re
import os
import joblib

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## 1. Import dataset

In [3]:
# import the dataset (this dataset has already remove outlier)
df = pd.read_json("../data/final_dataset_image.json")
df.head(5)

Unnamed: 0,bathrooms,bedrooms,price,sentiment_label,feature_laundry in building,feature_dishwasher,feature_hardwood floors,feature_dogs allowed,feature_cats allowed,feature_doorman,...,sink_count,oven_count,refrigerator_count,toilet_count,person_count,potted plant_count,microwave_count,bottle_count,tv_count,interest_level
83181,1,1,3807,1,0,0,0,1,1,1,...,2,2,1,1,0,0,0,0,0,-1
84383,1,1,2875,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
88044,1,1,3535,1,0,0,0,1,1,1,...,0,0,0,0,14,1,0,0,0,-1
88259,1,3,5595,1,0,0,0,0,0,0,...,2,1,1,1,0,0,0,0,0,-1
88512,1,0,1875,1,0,0,0,1,1,0,...,2,1,1,0,0,1,0,2,2,-1


In [4]:
print(f"There are {len(df)} samples.")

There are 42 samples.


## 2. Data Processing

In [5]:
df = df.drop('interest_level', axis=1)

In [6]:
# split dataset into training set and test set
X = df.drop('price', axis=1)
y = df['price'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)

In [7]:
# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 3. KNN

In [8]:
# create KNN classifier
knn = KNeighborsRegressor(n_neighbors=5)

# train the classifier
knn.fit(X_train, y_train)

# predict the response for test dataset
y_pred = knn.predict(X_test)

# evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f4439b309d0>
Traceback (most recent call last):
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


RMSE: 1684.0738305272328


In [9]:
# save model
joblib.dump(knn, 'models/knn_model_generic_img_reg.pkl')

['models/knn_model_generic_img_reg.pkl']

In [10]:
# quick test
knn_loaded = joblib.load('models/knn_model_generic_img_reg.pkl')
y_pred = knn_loaded.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')
actual_first_price = y_test.iloc[0]
print(f'Actual first price: {actual_first_price}, Predicted first price: {y_pred[0]}')

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f4439d4c1f0>
Traceback (most recent call last):
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


RMSE: 1684.0738305272328
Actual first price: 5595, Predicted first price: 2784.0


## 4. Decision Trees

In [11]:
# create Decision Tree classifier
dt = DecisionTreeRegressor()

# train the classifier
dt.fit(X_train, y_train)

# predict the response for test dataset
y_pred = dt.predict(X_test)

# evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

RMSE: 1897.775013008655


In [12]:
# save model
joblib.dump(dt, 'models/dt_model_generic_img_reg.pkl')

['models/dt_model_generic_img_reg.pkl']

In [13]:
# quick test
dt_loaded = joblib.load('models/dt_model_generic_img_reg.pkl')
y_pred = dt_loaded.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')
actual_first_price = y_test.iloc[0]
print(f'Actual first price: {actual_first_price}, Predicted first price: {y_pred[0]}')

RMSE: 1897.775013008655
Actual first price: 5595, Predicted first price: 2450.0


## 5. SVM

In [14]:
# create SVM classifier
svm = SVR()

# train the classifier
svm.fit(X_train, y_train)

# predict the response for test dataset
y_pred = svm.predict(X_test)

# evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

RMSE: 1793.7383441555658


In [15]:
# save model
joblib.dump(svm, 'models/svm_model_generic_img_reg.pkl')

['models/svm_model_generic_img_reg.pkl']

In [16]:
# quick test
svm_loaded = joblib.load('models/svm_model_generic_img_reg.pkl')
y_pred = svm_loaded.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')
actual_first_price = y_test.iloc[0]
print(f'Actual first price: {actual_first_price}, Predicted first price: {y_pred[0]}')

RMSE: 1793.7383441555658
Actual first price: 5595, Predicted first price: 2894.3630602069325


## 6. Multi-layer Perceptron

In [17]:
# create MLP classifier
mlp = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

# train the classifier
mlp.fit(X_train, y_train)

# predict the response for test dataset
y_pred = mlp.predict(X_test)

# evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

RMSE: 3659.6791975995716


In [18]:
# save model
joblib.dump(mlp, 'models/mlp_model_generic_img_reg.pkl')

['models/mlp_model_generic_img_reg.pkl']

In [19]:
# quick test
mlp_loaded = joblib.load('models/mlp_model_generic_img_reg.pkl')
y_pred = mlp_loaded.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')
actual_first_price = y_test.iloc[0]
print(f'Actual first price: {actual_first_price}, Predicted first price: {y_pred[0]}')

RMSE: 3659.6791975995716
Actual first price: 5595, Predicted first price: 3774.1122548780554


## 7. Random Forest

In [20]:
# create Random Forest classifier
rf = RandomForestRegressor(n_estimators=10)

# train the classifier
rf.fit(X_train, y_train)

# predict the response for test dataset
y_pred = rf.predict(X_test)

# evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')

RMSE: 1498.7767512208081


In [21]:
# save model
joblib.dump(mlp, 'models/rf_model_generic_img_reg.pkl')

['models/rf_model_generic_img_reg.pkl']

In [22]:
# quick test
rf_loaded = joblib.load('models/rf_model_generic_img_reg.pkl')
y_pred = rf_loaded.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'RMSE: {rmse}')
actual_first_price = y_test.iloc[0]
print(f'Actual first price: {actual_first_price}, Predicted first price: {y_pred[0]}')

RMSE: 3659.6791975995716
Actual first price: 5595, Predicted first price: 3774.1122548780554
