# Model Hyperparameter Optimization - Airbnb Reviews

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.utils.utility import standardizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_curve, roc_auc_score
from sklearn.decomposition import PCA

# Show full cell contents in DataFrame output instead of truncating them.
pd.options.display.max_colwidth = None

In [None]:
# Number of rows read from each CSV file (20% of data).
samples = 50_782
# Value handed to every `n_jobs` argument in this notebook.
threads = -1
# Value handed to the `contamination` argument of the pyod detectors.
contamination = 0.4

In [None]:
# Read at most `samples` rows of the semicolon-separated feature file.
filename = "reviews_named_entities.csv"
df_pred = pd.read_csv(
    filepath_or_buffer=filename,
    sep=";",
    nrows=samples,
)

In [None]:
# Read at most `samples` rows of the semicolon-separated labelled file;
# its 'label' column is used as ground truth further down.
filename = "reviews_sample_labelled.csv"
df_label = pd.read_csv(
    filepath_or_buffer=filename,
    sep=";",
    nrows=samples,
)

In [None]:
# Columns of `df_pred` that are used as model features.
new_columns = [
    'CARDINAL', 'DATE', 'EMAIL', 'EVENT', 'FAC',
    'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY',
    'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON',
    'PHONE', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART',
]
# Preview the first rows of the selected feature columns.
df_pred.loc[:, new_columns].head()

In [None]:
# Project the feature columns onto the principal components that together
# explain 95% of the variance.
pca = PCA(n_components=0.95)
pc = pca.fit_transform(df_pred[new_columns])

# Put the components in a DataFrame and append the 'label' column next to
# them (rows are aligned on the index).
pc_df = pd.concat(
    [pd.DataFrame(data=pc), df_label['label']],
    axis="columns",
)
pc_df.head()

In [None]:
# Standardize the principal components only; the 'label' column is ground
# truth and must not be part of the model input.
feature_matrix = standardizer(pc_df.drop(columns='label'))

## Local Outlier Factor (LOF)

In [None]:
# Grid-search the LOF neighbourhood size at the shared contamination rate,
# ranking the candidates by ROC AUC against the labelled column.
parameters = dict(
    n_neighbors=np.arange(900, 2000, 50),
    contamination=[contamination],
)
clf = GridSearchCV(
    estimator=LOF(),
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=threads,
    verbose=4,
)
clf.fit(feature_matrix, df_label["label"])
print(clf.best_params_)

# Fit a fresh detector with the winning parameters on the whole feature
# matrix and evaluate its outlier scores against the labels.
lof = LOF(n_jobs=threads, **clf.best_params_)
lof_pred = lof.fit_predict(feature_matrix)
lof_score = lof.decision_scores_
lof_auc = roc_auc_score(y_true=df_label["label"], y_score=lof_score)
print(f"ROC AUC: {lof_auc:.3f}, Threshold: {lof.threshold_:.5f}")

# Fitting 5 folds for each of 22 candidates, totalling 110 fits
# {'contamination': 0.4, 'n_neighbors': 1400}
# ROC AUC: 0.809, Threshold: 1.03282

## DBSCAN

In [None]:
# DBSCAN is a clustering estimator: it offers no `predict`,
# `decision_function` or `predict_proba`, so GridSearchCV cannot compute the
# 'roc_auc' score for it (the previously recorded "best" parameters were
# simply the first entry of the grid). The grid is therefore searched
# manually: every candidate is fitted on the full feature matrix and scored
# by treating DBSCAN noise points (cluster label -1) as predicted outliers.
parameters = {
    'eps': np.arange(0.1, 1.0, 0.1),
    'min_samples': np.arange(3, 11, 1)
}

best_params, best_auc = None, -np.inf
for eps in parameters['eps']:
    for min_samples in parameters['min_samples']:
        # Cast the NumPy scalars so the printed parameters stay readable.
        candidate = {'eps': float(eps), 'min_samples': int(min_samples)}
        labels = DBSCAN(**candidate, n_jobs=threads).fit_predict(feature_matrix)
        # Binary outlier flag: 1 for noise points, 0 for clustered points.
        candidate_auc = roc_auc_score(df_label["label"], (labels == -1).astype(int))
        if candidate_auc > best_auc:
            best_params, best_auc = candidate, candidate_auc
print(best_params)

dbscan = DBSCAN(**best_params, n_jobs=threads)
# `dbscan_pred` keeps the raw cluster labels (-1 = noise, 0..k = cluster id).
dbscan_pred = dbscan.fit_predict(feature_matrix)
# Cluster ids are not an outlier ranking, so the AUC is computed on the
# binary noise flag rather than on the raw labels.
dbscan_score = (dbscan_pred == -1).astype(int)
dbscan_auc = roc_auc_score(df_label["label"], dbscan_score)
print(f"ROC AUC: {dbscan_auc:.3f}")

# Output recorded from the earlier GridSearchCV-based version of this cell
# (stale; re-run the cell to refresh it):
# Fitting 5 folds for each of 72 candidates, totalling 360 fits
# {'eps': 0.1, 'min_samples': 3}
# ROC AUC: 0.684

## iForest

In [None]:
# Grid-search the isolation forest over ensemble size, per-tree sample
# fraction and per-tree feature fraction at the shared contamination rate,
# ranking the candidates by ROC AUC against the labelled column.
parameters = dict(
    n_estimators=np.arange(50, 201, 25),
    max_samples=[0.25, 0.5, 0.75, 1.0],
    max_features=[0.05, 0.15, 0.25, 0.5, 0.75, 1.0],
    contamination=[contamination],
)
clf = GridSearchCV(
    estimator=IForest(),
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=threads,
    verbose=4,
)
clf.fit(feature_matrix, df_label["label"])
print(clf.best_params_)

# Fit a fresh detector with the winning parameters on the whole feature
# matrix and evaluate its outlier scores against the labels.
iforest = IForest(n_jobs=threads, **clf.best_params_)
iforest_pred = iforest.fit_predict(feature_matrix)
iforest_score = iforest.decision_scores_
iforest_auc = roc_auc_score(y_true=df_label["label"], y_score=iforest_score)
print(f"ROC AUC: {iforest_auc:.3f}, Threshold: {iforest.threshold_:.5f}")

# Fitting 5 folds for each of 168 candidates, totalling 840 fits
# {'contamination': 0.4, 'max_features': 0.05, 'max_samples': 0.75, 'n_estimators': 100}
# ROC AUC: 0.880, Threshold: -0.00000

## OCSVM

In [None]:
# Compare the linear and RBF kernels of the one-class SVM, ranking them by
# ROC AUC against the labelled column. The contamination rate comes from the
# shared `contamination` setting so this cell stays in sync with the LOF and
# iForest searches.
parameters = {
    'kernel': ['linear', 'rbf'],
    'cache_size': [2048],
    'contamination': [contamination]
}
clf = GridSearchCV(OCSVM(), parameters, scoring='roc_auc', n_jobs=threads, verbose=4)
clf.fit(feature_matrix, df_label["label"])
print(clf.best_params_)

# pyod's OCSVM constructor has no `n_jobs` parameter (unlike LOF and
# IForest); passing it raises a TypeError, so only the tuned parameters are
# forwarded here.
ocsvm = OCSVM(**clf.best_params_)
ocsvm_pred = ocsvm.fit_predict(feature_matrix)
ocsvm_score = ocsvm.decision_scores_
ocsvm_auc = roc_auc_score(df_label["label"], ocsvm_score)
print(f"ROC AUC: {ocsvm_auc:.3f}, Threshold: {ocsvm.threshold_:.5f}")