# Machine Learning Modeling
- Name: Minh T. Nguyen
- Date: 11/24/2023
- About:
    - Interest level prediction with classical ML model on the dataset with image feature extraction
    - For the sake of simplicity, we will only split the dataset into train/test. No need for grid-search, validation set, or k-fold validation for this project. For model comparison, we will only check for test accuracy for simplicity.
    - Note that the models' results are not reliable as the dataset with image features are heavy skewed to "high" interest levels compared to "low" and "normal".
    - Models to consider:
        - KNN
        - Neural Networks
        - SVM
        - Decision Tree
        - Random Forests

In [1]:
!ls ../data

final_dataset_image.json     sentimental_extraction_kaggle.csv
final_dataset_no_image.json  sentimental_extraction_kaggle.json
images_sample		     sentimental_extraction_sample.csv
Kaggle-renthop.torrent	     train.json


**Note:** The datasets can be found [here]((https://www.kaggle.com/competitions/two-sigma-connect-rental-listing-inquiries/data?select=train.json.zip)).
- train.json: the training set.
- images_sample.zip: listing images organized by listing_id (a sample of 100 listings)
- Kaggle-renthop.7z: listing images organized by listing_id. Total size: 78.5 GB compressed.

In [2]:
# import libraries
import numpy as np
import pandas as pd
from collections import Counter
import re
import os
import joblib

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## 1. Import dataset

In [3]:
# import the dataset (this dataset has already remove outlier)
df = pd.read_json("../data/final_dataset_image.json")
df.head(5)

Unnamed: 0,bathrooms,bedrooms,price,sentiment_label,feature_laundry in building,feature_dishwasher,feature_hardwood floors,feature_dogs allowed,feature_cats allowed,feature_doorman,...,sink_count,oven_count,refrigerator_count,toilet_count,person_count,potted plant_count,microwave_count,bottle_count,tv_count,interest_level
83181,1,1,3807,1,0,0,0,1,1,1,...,2,2,1,1,0,0,0,0,0,-1
84383,1,1,2875,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,-1
88044,1,1,3535,1,0,0,0,1,1,1,...,0,0,0,0,14,1,0,0,0,-1
88259,1,3,5595,1,0,0,0,0,0,0,...,2,1,1,1,0,0,0,0,0,-1
88512,1,0,1875,1,0,0,0,1,1,0,...,2,1,1,0,0,1,0,2,2,-1


In [4]:
print(f"There are {len(df)} samples.")

There are 42 samples.


## 2. Data Processing

In [5]:
# split dataset into training set and test set
X = df.drop('interest_level', axis=1)
y = df['interest_level'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=1)

In [6]:
# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 3. KNN

In [7]:
# create KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# train the classifier
knn.fit(X_train, y_train)

# predict the response for test dataset
y_pred = knn.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f84f9b0e670>
Traceback (most recent call last):
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


Test Accuracy: 1.0
Full Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [8]:
# save model
joblib.dump(knn, 'models/knn_model_generic_img_cls.pkl')

['models/knn_model_generic_img_cls.pkl']

In [9]:
# quick test
knn_loaded = joblib.load('models/knn_model_generic_img_cls.pkl')
y_pred = knn_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f84f9b34af0>
Traceback (most recent call last):
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/mnguyen0226/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


Accuracy: 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## 4. Decision Trees

In [10]:
# create Decision Tree classifier
dt = DecisionTreeClassifier()

# train the classifier
dt.fit(X_train, y_train)

# predict the response for test dataset
y_pred = dt.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6666666666666666
Full Report:
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80         3
           0       0.00      0.00      0.00         0

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3



In [11]:
# save model
joblib.dump(dt, 'models/dt_model_generic_img_cls.pkl')

['models/dt_model_generic_img_cls.pkl']

In [12]:
# quick test
dt_loaded = joblib.load('models/dt_model_generic_img_cls.pkl')
y_pred = dt_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80         3
           0       0.00      0.00      0.00         0

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3



## 5. SVM

In [13]:
# create SVM classifier
svm = SVC()

# train the classifier
svm.fit(X_train, y_train)

# predict the response for test dataset
y_pred = svm.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 1.0
Full Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [14]:
# save model
joblib.dump(svm, 'models/svm_model_generic_img_cls.pkl')

['models/svm_model_generic_img_cls.pkl']

In [15]:
# quick test
svm_loaded = joblib.load('models/svm_model_generic_img_cls.pkl')
y_pred = svm_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



## 6. Multi-layer Perceptron

In [16]:
# create MLP classifier
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

# train the classifier
mlp.fit(X_train, y_train)

# predict the response for test dataset
y_pred = mlp.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6666666666666666
Full Report:
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80         3
           0       0.00      0.00      0.00         0

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3



In [17]:
# save model
joblib.dump(mlp, 'models/mlp_model_generic_img_cls.pkl')

['models/mlp_model_generic_img_cls.pkl']

In [18]:
# quick test
mlp_loaded = joblib.load('models/mlp_model_generic_img_cls.pkl')
y_pred = mlp_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80         3
           0       0.00      0.00      0.00         0

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3



## 7. Random Forest

In [19]:
# create Random Forest classifier
rf = RandomForestClassifier(n_estimators=10)

# train the classifier
rf.fit(X_train, y_train)

# predict the response for test dataset
y_pred = rf.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 1.0
Full Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00         3

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



In [20]:
# save model
joblib.dump(mlp, 'models/rf_model_generic_img_cls.pkl')

['models/rf_model_generic_img_cls.pkl']

In [21]:
# quick test
rf_loaded = joblib.load('models/rf_model_generic_img_cls.pkl')
y_pred = rf_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

          -1       1.00      0.67      0.80         3
           0       0.00      0.00      0.00         0

    accuracy                           0.67         3
   macro avg       0.50      0.33      0.40         3
weighted avg       1.00      0.67      0.80         3

