# Machine Learning Modeling
- Name: Minh T. Nguyen
- Date: 11/24/2023
- About:
    - Interest level prediction with classical ML model on the dataset (without image feature extraction)
    - For the sake of simplicity, we will only split the dataset into train/test. No need for grid-search, validation set, or k-fold validation for this project. For model comparison, we will only check for test accuracy for simplicity.
    - Models to consider:
        - KNN
        - Neural Networks
        - SVM
        - Decision Tree
        - Random Forests

In [1]:
!ls ../data

final_dataset_image.json     sentimental_extraction_kaggle.csv
final_dataset_no_image.json  sentimental_extraction_kaggle.json
images_sample		     sentimental_extraction_sample.csv
Kaggle-renthop.torrent	     train.json


**Note:** The datasets can be found [here]((https://www.kaggle.com/competitions/two-sigma-connect-rental-listing-inquiries/data?select=train.json.zip)).
- train.json: the training set.
- images_sample.zip: listing images organized by listing_id (a sample of 100 listings)
- Kaggle-renthop.7z: listing images organized by listing_id. Total size: 78.5 GB compressed.

In [2]:
# import libraries
import numpy as np
import pandas as pd
from collections import Counter
import re
import os
import joblib

# sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

import warnings
warnings.filterwarnings('ignore')

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


## 1. Import dataset

In [3]:
# import the dataset (this dataset has already remove outlier)
df = pd.read_json("../data/final_dataset_no_image.json")
df.head(5)

Unnamed: 0,bathrooms,bedrooms,price,sentiment_label,feature_laundry in building,feature_dishwasher,feature_hardwood floors,feature_dogs allowed,feature_cats allowed,feature_doorman,feature_elevator,feature_no fee,feature_fitness center,interest_level
4,1.0,1,2400,1,1,1,1,1,1,0,0,0,0,0
6,1.0,2,3800,1,1,1,1,0,0,1,1,1,0,-1
9,1.0,2,3495,1,1,1,1,0,0,1,1,0,0,0
10,1.5,3,3000,0,0,0,0,0,0,0,0,0,0,0
15,1.0,0,2795,0,1,0,0,0,0,1,1,0,1,-1


In [4]:
print(f"There are {len(df)} samples.")

There are 48871 samples.


## 2. Data Processing

In [5]:
# split dataset into training set and test set
X = df.drop('interest_level', axis=1)
y = df['interest_level'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)  # 80% training and 20% test

In [6]:
# feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## 3. KNN

In [7]:
# create KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# train the classifier
knn.fit(X_train, y_train)

# predict the response for test dataset
y_pred = knn.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6814895648615469
Full Report:
              precision    recall  f1-score   support

          -1       0.75      0.88      0.81      5082
           0       0.38      0.26      0.31      1690
           1       0.40      0.13      0.19       559

    accuracy                           0.68      7331
   macro avg       0.51      0.42      0.44      7331
weighted avg       0.64      0.68      0.65      7331



In [8]:
# save model
joblib.dump(knn, 'models/knn_model_generic_cls.pkl')

['models/knn_model_generic_cls.pkl']

In [9]:
# quick test
knn_loaded = joblib.load('models/knn_model_generic_cls.pkl')
y_pred = knn_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6814895648615469
              precision    recall  f1-score   support

          -1       0.75      0.88      0.81      5082
           0       0.38      0.26      0.31      1690
           1       0.40      0.13      0.19       559

    accuracy                           0.68      7331
   macro avg       0.51      0.42      0.44      7331
weighted avg       0.64      0.68      0.65      7331



## 4. Decision Trees

In [10]:
# create Decision Tree classifier
dt = DecisionTreeClassifier()

# train the classifier
dt.fit(X_train, y_train)

# predict the response for test dataset
y_pred = dt.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6655299413449735
Full Report:
              precision    recall  f1-score   support

          -1       0.77      0.84      0.80      5082
           0       0.36      0.30      0.33      1690
           1       0.29      0.20      0.23       559

    accuracy                           0.67      7331
   macro avg       0.47      0.45      0.45      7331
weighted avg       0.64      0.67      0.65      7331



In [11]:
# save model
joblib.dump(dt, 'models/dt_model_generic_cls.pkl')

['models/dt_model_generic_cls.pkl']

In [12]:
# quick test
dt_loaded = joblib.load('models/dt_model_generic_cls.pkl')
y_pred = dt_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6655299413449735
              precision    recall  f1-score   support

          -1       0.77      0.84      0.80      5082
           0       0.36      0.30      0.33      1690
           1       0.29      0.20      0.23       559

    accuracy                           0.67      7331
   macro avg       0.47      0.45      0.45      7331
weighted avg       0.64      0.67      0.65      7331



## 5. SVM

In [13]:
# create SVM classifier
svm = SVC()

# train the classifier
svm.fit(X_train, y_train)

# predict the response for test dataset
y_pred = svm.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6939026053744374
Full Report:
              precision    recall  f1-score   support

          -1       0.70      0.99      0.82      5082
           0       0.39      0.04      0.08      1690
           1       0.00      0.00      0.00       559

    accuracy                           0.69      7331
   macro avg       0.36      0.34      0.30      7331
weighted avg       0.58      0.69      0.59      7331



In [14]:
# save model
joblib.dump(svm, 'models/svm_model_generic_cls.pkl')

['models/svm_model_generic_cls.pkl']

In [15]:
# quick test
svm_loaded = joblib.load('models/svm_model_generic_cls.pkl')
y_pred = svm_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6939026053744374
              precision    recall  f1-score   support

          -1       0.70      0.99      0.82      5082
           0       0.39      0.04      0.08      1690
           1       0.00      0.00      0.00       559

    accuracy                           0.69      7331
   macro avg       0.36      0.34      0.30      7331
weighted avg       0.58      0.69      0.59      7331



## 6. Multi-layer Perceptron

In [16]:
# create MLP classifier
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

# train the classifier
mlp.fit(X_train, y_train)

# predict the response for test dataset
y_pred = mlp.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.7031782839994544
Full Report:
              precision    recall  f1-score   support

          -1       0.74      0.95      0.83      5082
           0       0.40      0.14      0.21      1690
           1       0.48      0.13      0.20       559

    accuracy                           0.70      7331
   macro avg       0.54      0.41      0.41      7331
weighted avg       0.64      0.70      0.64      7331



In [17]:
# save model
joblib.dump(mlp, 'models/mlp_model_generic_cls.pkl')

['models/mlp_model_generic_cls.pkl']

In [18]:
# quick test
mlp_loaded = joblib.load('models/mlp_model_generic_cls.pkl')
y_pred = mlp_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7031782839994544
              precision    recall  f1-score   support

          -1       0.74      0.95      0.83      5082
           0       0.40      0.14      0.21      1690
           1       0.48      0.13      0.20       559

    accuracy                           0.70      7331
   macro avg       0.54      0.41      0.41      7331
weighted avg       0.64      0.70      0.64      7331



## 7. Random Forest

In [19]:
# create Random Forest classifier
rf = RandomForestClassifier(n_estimators=10)

# train the classifier
rf.fit(X_train, y_train)

# predict the response for test dataset
y_pred = rf.predict(X_test)

# evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Full Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.6738507706997681
Full Report:
              precision    recall  f1-score   support

          -1       0.77      0.85      0.81      5082
           0       0.38      0.31      0.34      1690
           1       0.33      0.23      0.27       559

    accuracy                           0.67      7331
   macro avg       0.49      0.46      0.47      7331
weighted avg       0.65      0.67      0.66      7331



In [20]:
# save model
joblib.dump(mlp, 'models/rf_model_generic_cls.pkl')

['models/rf_model_generic_cls.pkl']

In [21]:
# quick test
rf_loaded = joblib.load('models/rf_model_generic_cls.pkl')
y_pred = rf_loaded.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7031782839994544
              precision    recall  f1-score   support

          -1       0.74      0.95      0.83      5082
           0       0.40      0.14      0.21      1690
           1       0.48      0.13      0.20       559

    accuracy                           0.70      7331
   macro avg       0.54      0.41      0.41      7331
weighted avg       0.64      0.70      0.64      7331

