# Predict Outliers - Airbnb Reviews

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.utils.utility import standardizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Show the full text of every cell when a DataFrame is displayed
# (None removes the column-width limit), so long review text is not truncated.
pd.options.display.max_colwidth = None

In [None]:
# Run configuration shared by the cells below.
# Number of rows read from the named-entities CSV (20% of data).
samples = 50_782
# Passed as n_jobs to LOF, DBSCAN, IForest and SUOD.
threads = 1
# Passed as the `contamination` argument of every PyOD detector.
contamination = 0.4

In [None]:
# Load the dataset containing named entities.
# Only the first `samples` rows of the semicolon-separated file are read.
filename = "reviews_named_entities.csv"
pred_df = pd.read_csv(
    filename,
    sep=";",
    nrows=samples,
)

In [None]:
# Load the labelled review sample (semicolon-separated).
# Its `label` column is joined onto the PCA output in a later cell.
filename = "reviews_sample_labelled.csv"
label_df = pd.read_csv(
    filename,
    sep=";",
)

In [None]:
# Columns of pred_df used as input features for PCA and outlier detection.
# NOTE(review): presumably one numeric column per named-entity type
# (e.g. a per-review count) — confirm against the file that produces them.
new_columns = [
    'CARDINAL', 'DATE', 'EMAIL', 'EVENT',
    'FAC', 'GPE', 'LANGUAGE', 'LAW',
    'LOC', 'MONEY', 'NORP', 'ORDINAL',
    'ORG', 'PERCENT', 'PERSON', 'PHONE',
    'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART',
]
# Preview the feature columns.
pred_df[new_columns].head()

## Dimension Reduction

In [None]:
# Reduce the entity features with PCA. A float n_components in (0, 1) asks
# scikit-learn to keep as many components as needed to explain that share
# of the variance (here 95%).
pca = PCA(n_components=0.95)
pc = pca.fit_transform(pred_df[new_columns])

# Attach the labels next to the principal components.
# NOTE(review): concat(axis=1) aligns on the index, so this assumes row i of
# label_df describes row i of pred_df; rows without a partner get NaN.
# Confirm that both files share the same row order and length.
pc_df = pd.concat(
    [pd.DataFrame(data=pc), label_df[['label']]],
    axis=1,
)
pc_df.head()

In [None]:
# fig = plt.figure(figsize=(8,8))
# ax = fig.add_subplot(1,1,1) 
# ax.set_xlabel('PC1', fontsize = 15)
# ax.set_ylabel('PC2', fontsize = 15)
# ax.set_title('2 Component PCA', fontsize=14)
# targets = [1, 0]
# colors = ['r', 'g']
# for target, color in zip(targets, colors):
#     indices = pc_df['label'] == target
#     ax.scatter(pc_df.loc[indices, 0], pc_df.loc[indices, 1], c=color, s=50)
# ax.legend(targets)
# ax.grid()

## 5. Detect Outliers

In [None]:
# Standardize the principal components with PyOD's standardizer; the label
# column is dropped first so that only the features are scaled.
feature_matrix = standardizer(pc_df.drop(columns='label'))

In [None]:
# Alias for the standardized matrix; every detector cell below fits on `features`.
features = feature_matrix

### 5.1 Local Outlier Factor (LOF)

In [None]:
%%timeit -r 3 -n 1
# Local Outlier Factor on the standardized features.
# NOTE: the %%timeit magic on this cell re-runs everything below on each
# repetition, so the model is refit every time.
lof = LOF(n_neighbors=20, contamination=contamination, n_jobs=threads)
lof.fit(features)
# fit_predict() is deprecated in PyOD; it simply calls fit() and returns
# labels_, so reading labels_ directly gives the same labels.
pred_df["lof"] = lof.labels_
pred_df["lof_score"] = lof.decision_scores_
print(f"Threshold: {lof.threshold_:.5f}")

### 5.2 DBSCAN

In [None]:
%%timeit -r 3 -n 1
# Density-based clustering used as an outlier detector.
# NOTE: the %%timeit magic on this cell re-runs everything below on each
# repetition, so the clustering is recomputed every time.
dbscan = DBSCAN(eps=0.5, min_samples=5, n_jobs=threads).fit(features)

# DBSCAN marks noise points with the label -1 and cluster members with
# 0..k-1. Map that to the binary convention of the other detectors
# (1 = outlier, 0 = inlier). A new array is built so the fitted estimator's
# labels_ are not overwritten, and the original integer dtype is kept.
labels = (dbscan.labels_ == -1).astype(dbscan.labels_.dtype)
pred_df["dbscan"] = labels

### 5.3 Isolation Forest (iForest)

In [None]:
%%timeit -r 3 -n 1
# Isolation Forest on the standardized features.
# NOTE: the %%timeit magic on this cell re-runs everything below on each
# repetition, so the model is refit every time.
iforest = IForest(
    n_estimators=100,
    contamination=contamination,
    max_samples="auto",
    max_features=1.0,
    n_jobs=threads,
)
iforest.fit(features)
# fit() already stores the training labels (decision_scores_ > threshold_)
# in labels_. predict(features) would score the same data a second time
# only to reach the same labels, so read labels_ directly, as the SUOD
# cell does.
pred_df["iforest"] = iforest.labels_
pred_df["iforest_score"] = iforest.decision_scores_
print(f"Threshold: {iforest.threshold_:.5f}")

### 5.4 One Class SVM (OCSVM)

In [None]:
%%timeit -r 3 -n 1
# One-Class SVM on the standardized features (no n_jobs is passed here).
# NOTE: the %%timeit magic on this cell re-runs everything below on each
# repetition, so the model is refit every time.
ocsvm = OCSVM(gamma="auto", contamination=contamination, cache_size=2048)
ocsvm.fit(features)
# fit() already stores the training labels (decision_scores_ > threshold_)
# in labels_. predict(features) would evaluate the decision function over
# the whole training set again only to reach the same labels, so read
# labels_ directly, as the SUOD cell does.
pred_df["ocsvm"] = ocsvm.labels_
pred_df["ocsvm_score"] = ocsvm.decision_scores_
print(f"Threshold: {ocsvm.threshold_:.5f}")

### 5.5 SUOD

In [None]:
%%timeit -r 3 -n 1
# SUOD ensemble over the same three detector types used in the cells above.
# The base detectors are created without n_jobs; the thread count is given
# to SUOD itself.
# NOTE: the %%timeit magic on this cell re-runs everything below on each
# repetition, so the ensemble is refit every time.
detector_list = [
    LOF(n_neighbors=20, contamination=contamination),
    IForest(
        n_estimators=100,
        contamination=contamination,
        max_samples="auto",
        max_features=1.0,
    ),
    OCSVM(gamma="auto", contamination=contamination, cache_size=2048),
]
suod = SUOD(
    base_estimators=detector_list,
    contamination=contamination,
    n_jobs=threads,
    combination="average",
    verbose=False,
)
suod.fit(features)

# Store the training labels and scores produced by fit().
pred_df["suod_score"] = suod.decision_scores_
pred_df["suod"] = suod.labels_
print(f"Threshold: {suod.threshold_:.5f}")

In [None]:
# Write pred_df, including the detector columns added above, to a
# semicolon-separated CSV without the index.
# NOTE(review): "predtions" in the file name looks like a typo for
# "predictions"; it is kept as-is because other files may read this name.
filename_pred = "reviews_outlier_predtions.csv"
pred_df.to_csv(
    filename_pred,
    sep=";",
    index=False,
)

In [None]:
# All threads
# 2.32 s ± 92.9 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 3.74 s ± 346 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 4.2 s ± 109 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 7min 2s ± 24.8 s per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 4min 46s ± 16.6 s per loop (mean ± std. dev. of 3 runs, 1 loop each)

# Single thread
# 3.51 s ± 274 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 8.54 s ± 799 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 5.23 s ± 485 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 7min 23s ± 8.99 s per loop (mean ± std. dev. of 3 runs, 1 loop each)
# 5min 32s ± 1min 19s per loop (mean ± std. dev. of 3 runs, 1 loop each)

In [None]:
# Show the first 30 rows of the `ents` column next to each detector's
# binary label column.
pred_df.loc[
    :,
    ["ents", "lof", "dbscan", "iforest", "ocsvm", "suod"],
].head(30)