In [1]:
import os
import sys
# Add the parent directory to the import path.
# NOTE(review): presumably this is what lets the local `credible` package import below — confirm.
sys.path.append(os.pardir)

In [2]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from credible import connectors

In [3]:
# Render floats with thousands separators and two decimals in displayed frames.
pd.set_option('display.float_format', '{:,.2f}'.format)

### Parameters

In [4]:
# Database handle passed to every pd.read_sql_table call in this notebook.
# NOTE(review): connection target and settings live in credible.connectors — not visible here.
engine = connectors.connect_to_sqlite()

In [5]:
%%time
# Read each table fully into memory (the recorded wall time for this cell is several minutes).
businesses = pd.read_sql_table(table_name='businesses', con=engine)
users = pd.read_sql_table(table_name='users', con=engine)
reviews = pd.read_sql_table(table_name='reviews', con=engine)
users_meta = pd.read_sql_table(table_name='users_meta', con=engine)
reviews_meta = pd.read_sql_table(table_name='reviews_meta', con=engine)

CPU times: user 1min 56s, sys: 36.8 s, total: 2min 33s
Wall time: 3min 46s


### Dataframe

In [6]:
# Enrich each review with its per-review metadata, then with its author's metadata.
# Left joins keep every review row even when metadata is missing.
df = (
    reviews
    .merge(reviews_meta, on='review_id', how='left')
    .merge(users_meta, on='user_id', how='left')
)

In [7]:
# Work on the first 50,000 rows only. Take an explicit copy: later cells add columns
# with `df[...] = ...`, which on a bare iloc slice raises SettingWithCopyWarning and
# leaves it ambiguous whether the merged frame is being modified.
# (The 'text' column is intentionally kept; it is shown when inspecting flagged rows.)
df = df.iloc[:50000, :].copy()

In [8]:
# Row/column count after truncation.
df.shape

(50000, 15)

In [9]:
# Columns that will be min-max scaled.
# NOTE(review): 'stars' and 'text_length_category' are also in the categorical set
# below, so they enter the final matrix both scaled and one-hot encoded — confirm intended.
features_continous = df[[
    'stars', 'days_past', 'text_length', 'useful', 'funny', 'cool',
    'text_length_category', 'num_of_friends',
]]
# Columns that will be one-hot encoded.
features_categorical = df[['stars', 'text_length_category']]

In [10]:
# Spot-check two random rows of the categorical inputs.
features_categorical.sample(n=2)

Unnamed: 0,stars,text_length_category
49474,3,3
28212,4,5


In [11]:
# Spot-check two random rows of the inputs that will be scaled.
features_continous.sample(n=2)

Unnamed: 0,stars,days_past,text_length,useful,funny,cool,text_length_category,num_of_friends
29829,4,1032,685,0,0,0,7,1
21436,1,3123,313,2,0,0,3,1


## Preprocessing

In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Human-readable labels for the 15 one-hot columns (5 star levels + 10 length buckets).
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
category_list = [
    *(f'stars_{i}' for i in range(1, 6)),
    *(f'textlen_{i}' for i in range(1, 11)),
]

onehot = OneHotEncoder(categories='auto', sparse=False)
scaler = MinMaxScaler()

# One-hot encode the categorical columns; rescale the others to [0, 1].
values_categorical = onehot.fit_transform(features_categorical)
values_continuous = scaler.fit_transform(features_continous)

values_continuous.shape, values_categorical.shape

((50000, 8), (50000, 15))

In [13]:
# Final feature matrix: scaled columns first, one-hot columns after them.
X = np.concatenate([values_continuous, values_categorical], axis=1)
X.shape

(50000, 23)

In [14]:
# Show the first feature row (kept 2-D) to eyeball the scaled and one-hot values.
X[:1]

array([[0.        , 0.40681727, 0.04060812, 0.05940594, 0.02173913,
        0.        , 0.11111111, 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

### LocalOutlierFactor

In [73]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor over a 1000-neighbour neighbourhood with Manhattan distance.
lof = LocalOutlierFactor(
    n_neighbors=1000,
    metric='manhattan',
    p=1,
    contamination='auto',
)
# fit_predict labels each row: 1 = inlier, -1 = outlier.
y_pred = lof.fit_predict(X)

In [74]:
# Per-row LOF labels (later cells select the rows labelled -1).
y_pred

array([1, 1, 1, ..., 1, 1, 1])

In [75]:
# The fitted model exposes its per-row factors negated.
scores = lof.negative_outlier_factor_
# Flip the sign back for display, so values are positive (around 1 in the recorded output).
-scores

array([1.0147907 , 0.99409215, 0.95056868, ..., 1.00107212, 0.9980287 ,
       1.20111718])

In [76]:
# Attach LOF results to the review rows; `X` was built from `df`, so row order matches.
df['lof_scores'] = -scores  # positive score
df['lof_labels'] = y_pred   # -1 marks rows flagged as outliers

In [116]:
# Text lengths of the rows LOF flagged as outliers.
df.loc[df['lof_labels'] == -1, 'text_length']

9          97
21        742
75       1012
80        149
88        704
         ... 
49922     120
49945     140
49961     979
49965      66
49981     451
Name: text_length, Length: 2868, dtype: int64

In [89]:
# Two random rows among those LOF flagged, for manual inspection.
df.loc[df['lof_labels'] == -1].sample(n=2)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,text,useful,funny,cool,newest_review_date,days_past,text_length,text_length_category,num_of_friends,lof_scores,lof_labels
24888,24889,vTmiupao21l0UQWZwpm2aw,ddplAthoA2pGLoSqhHpVkA,keBv05MsMFBd0Hu98vXThQ,5,2018-08-05 16:26:16,This place is awesome. I've come here several...,6,2,5,2018-10-13 03:50:14,68,263,2,518,2.84,-1
36816,36817,AzVLSxnVzO9Ftnp_y4SywQ,JrrY4v21k7_qYxIidhi-aw,nmbtxCvYfnqH0_ptlvoPaw,5,2007-12-18 19:36:20,I met up with 2 friends at Kellys on a Monday ...,1,0,0,2018-11-13 02:49:47,3982,285,3,37,3.54,-1


## Ensemble - IsolationForest

In [92]:
from sklearn.ensemble import IsolationForest

# Isolation forest; each tree is built on 100 sampled rows, seeded for repeatability.
# NOTE(review): the `behaviour` argument depends on the installed scikit-learn
# version — confirm before upgrading.
clf = IsolationForest(
    behaviour='new',
    contamination='auto',
    max_samples=100,
    random_state=0,
)
clf.fit(X)

IsolationForest(behaviour='new', bootstrap=False, contamination='auto',
                max_features=1.0, max_samples=100, n_estimators=100,
                n_jobs=None, random_state=0, verbose=0, warm_start=False)

In [95]:
# Store the isolation-forest label per review (the next cell shows counts for 1 and -1).
df['iso_labels'] = clf.predict(X)

In [96]:
# How many rows the isolation forest kept (1) versus flagged (-1).
df['iso_labels'].value_counts()

 1    43745
-1     6255
Name: iso_labels, dtype: int64

## SVM - OneClassSVM

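OneClassSVM did not work well on this data. This is consistent with the scikit-learn [outlier detection guide](https://scikit-learn.org/stable/modules/outlier_detection.html#overview-of-outlier-detection-methods), which notes that OneClassSVM is sensitive to outliers and therefore does not perform very well for outlier detection.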

In [113]:
from sklearn.svm import OneClassSVM

# One-class SVM with a degree-5 polynomial kernel.
# NOTE(review): this rebinds `clf`, which previously held the isolation forest.
clf = OneClassSVM(
    kernel="poly",
    degree=5,
    coef0=0.8,
    gamma='auto',
    nu=0.1,
    tol=0.005,
)
clf.fit(X)

OneClassSVM(cache_size=200, coef0=0.8, degree=5, gamma='auto', kernel='poly',
            max_iter=-1, nu=0.1, random_state=None, shrinking=True, tol=0.005,
            verbose=False)

In [114]:
# Store the one-class SVM label and raw score for each review.
df['svm_labels'] = clf.predict(X)
df['svm_scores'] = clf.score_samples(X)

In [115]:
# How many rows the one-class SVM kept (1) versus flagged (-1).
df['svm_labels'].value_counts()

 1    44996
-1     5004
Name: svm_labels, dtype: int64

## EllipticEnvelope

In [123]:
from sklearn.covariance import EllipticEnvelope

# Elliptic envelope with a fixed seed; per the repr below, contamination is left at 0.1.
cov = EllipticEnvelope(random_state=0)
cov.fit(X)



EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=0,
                 store_precision=True, support_fraction=None)

In [124]:
# Store the elliptic-envelope label and raw score for each review.
df['ell_labels'] = cov.predict(X)
df['ell_scores'] = cov.score_samples(X)

In [125]:
# How many rows the elliptic envelope kept (1) versus flagged (-1).
df['ell_labels'].value_counts()

 1    45000
-1     5000
Name: ell_labels, dtype: int64

## Nearest Neighbors

In [44]:
from sklearn.neighbors import NearestNeighbors

# Neighbour index with default settings (the repr below shows n_neighbors=5, minkowski with p=2).
neigh = NearestNeighbors()
neigh.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [56]:
# Returns (distances, indices) for every row. Because the query set is the training
# set, the first neighbour of each row is the row itself at distance 0 (see output).
neigh.kneighbors(X)

(array([[0.        , 0.01507347, 0.02487194, 0.03868104, 0.04257707],
        [0.        , 0.00832885, 0.01092536, 0.01340308, 0.01555025],
        [0.        , 0.01124959, 0.01391204, 0.01419293, 0.01486884],
        ...,
        [0.        , 0.01234633, 0.01265754, 0.01648485, 0.01763921],
        [0.        , 0.00747856, 0.01000672, 0.01046258, 0.01093983],
        [0.        , 0.00196199, 0.00272172, 0.00284972, 0.00364437]]),
 array([[    0, 39253,   310,  7798,  3322],
        [    1, 48975, 27566, 24643, 40438],
        [    2, 42182, 41906, 22627, 14482],
        ...,
        [49997, 26985,  6424, 28345, 22257],
        [49998, 16169, 10141, 10722, 32570],
        [49999, 37360,  4881, 49647, 17444]]))

In [57]:
# Sparse neighbour graph: one row per sample, with entries only for its nearest neighbours.
A = neigh.kneighbors_graph(X)
# Read the shape straight from the sparse matrix. Calling .toarray() first would
# allocate a dense 50000 x 50000 array (about 20 GB) just to report the same tuple.
A.shape

(50000, 50000)

### KMeans

In [120]:
from sklearn.cluster import KMeans, MiniBatchKMeans

# Mini-batch k-means with 8 clusters. random_state is pinned, like the other
# estimators in this notebook, so cluster sizes are reproducible across runs.
mbkm = MiniBatchKMeans(n_clusters=8, init='k-means++', n_init=1,
                       init_size=1000, batch_size=1000, verbose=0,
                       random_state=0)
mbkm.fit(X)
# Number of rows assigned to each cluster.
np.bincount(mbkm.labels_)

array([ 5467,  6568, 11062,  6007,  3955,  7245,  5338,  4358])

In [122]:
# Full-batch k-means with the same cluster count, for comparison with the
# mini-batch run. random_state is pinned so cluster sizes are reproducible.
km = KMeans(n_clusters=8, init='k-means++', max_iter=100, n_init=1,
            verbose=0, random_state=0)
km.fit(X)
# Number of rows assigned to each cluster.
np.bincount(km.labels_)

array([16264,  7245,  4003,  3955,  5467,  3542,  4051,  5473])

## Mean Shift

In [128]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Estimate the kernel bandwidth from a 500-row subsample.
bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
# Pass bandwidth by keyword: newer scikit-learn releases reject positional
# estimator arguments, and the keyword form works on older releases too.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)

MeanShift(bandwidth=1.6114435779092968, bin_seeding=True, cluster_all=True,
          min_bin_freq=1, n_jobs=None, seeds=None)

In [None]:
# Assign every row to a mean-shift cluster and count the rows per cluster.
clusters = ms.predict(X)
np.bincount(clusters)

In [None]:
import matplotlib.pyplot as plt
from itertools import cycle

# Cluster assignments and centres from the fitted mean-shift model.
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)

plt.figure(1)
plt.clf()

# Cycling the seven colour codes once yields the same endless sequence as
# cycling the string repeated several times.
colors = cycle('bgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    member_mask = labels == k
    center = cluster_centers[k]
    # Only the first two feature columns are drawn: members as dots, centre as a large marker.
    plt.plot(X[member_mask, 0], X[member_mask, 1], f'{col}.')
    plt.plot(
        center[0],
        center[1],
        'o',
        markerfacecolor=col,
        markeredgecolor='k',
        markersize=14,
    )
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

## PCA

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its first two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)

# Wrap the projection in a labelled frame for plotting.
principalDf = pd.DataFrame(
    principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Share of total variance captured by each of the two retained components.
pca.explained_variance_ratio_

In [None]:
# Absolute variance along each retained component.
pca.explained_variance_

In [None]:
# Shape of the component loadings — presumably (2, 23) given n_components=2 and X's 23 columns; confirm on run.
pca.components_.shape

In [None]:
# Spot-check two random projected rows.
principalDf.sample(2)

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow from point ``v0`` to point ``v1``.

    Parameters
    ----------
    v0 : start coordinates of the arrow (tail).
    v1 : end coordinates of the arrow (head).
    ax : axes to draw on; when omitted (or falsy) the current pyplot axes are used.
    """
    if not ax:
        ax = plt.gca()
    arrow_style = {
        'arrowstyle': '->',
        'linewidth': 2,
        'shrinkA': 0,
        'shrinkB': 0,
    }
    # An annotation with empty text leaves only the arrow visible.
    ax.annotate('', v1, v0, arrowprops=arrow_style)

# Scatter of the two principal components on a single square figure.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.set_title('2 component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

ax.scatter(
    principalDf['principal component 1'],
    principalDf['principal component 2'],
    c='b',
    s=50,
)

ax.grid()

## TSNE

In [None]:
from sklearn.manifold import TSNE
# t-SNE embedding of the full feature matrix, with a fixed seed.
# NOTE(review): this runs on all 50,000 rows and is likely slow — consider subsampling.
tsne = TSNE(random_state=17)

X_tsne = tsne.fit_transform(X)

In [None]:
# Scatter of the t-SNE embedding. Every point uses the same fixed colour, so there
# is nothing to colour-map: the previous `cmap` argument was ignored and the
# colorbar had no data behind it. Both are removed. The title now describes this
# data set rather than MNIST.
plt.figure(figsize=(12, 10))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c='b',
            edgecolor='none', alpha=0.7, s=40)
plt.title('t-SNE projection of review features');

## Data Distribution

In [None]:
# Frequency of each text length. value_counts() orders rows by frequency, so the
# index is sorted first to make the line run left-to-right instead of zig-zagging.
df.text_length.value_counts().sort_index().plot()

In [None]:
# Frequency of each days_past value. The index is sorted because value_counts()
# orders by frequency, which would make the line zig-zag across the x axis.
df.days_past.value_counts().sort_index().plot(ylim=(0,10000))

In [None]:
# Summary statistics for days_past, shown as a one-column frame.
df['days_past'].describe().to_frame()

In [None]:
# Frequency of each friend count. The index is sorted because value_counts()
# orders by frequency, which would make the line zig-zag across the x axis.
df.num_of_friends.value_counts().sort_index().plot(ylim=(0,500))

In [None]:
# Frequency of each text length, with the index sorted so the line runs left-to-right.
# NOTE(review): this cell repeats the first plot of this section — consider removing one.
df.text_length.value_counts().sort_index().plot()

In [None]:
# Frequency of each 'useful' vote count. The index is sorted because value_counts()
# orders by frequency, which would make the line zig-zag across the x axis.
df.useful.value_counts().sort_index().plot(ylim=(0,100))

In [None]:
# Frequency of each 'funny' vote count. The index is sorted because value_counts()
# orders by frequency, which would make the line zig-zag across the x axis.
df.funny.value_counts().sort_index().plot(ylim=(0,100))

In [None]:
# Frequency of each 'cool' vote count. The index is sorted because value_counts()
# orders by frequency, which would make the line zig-zag across the x axis.
df.cool.value_counts().sort_index().plot(ylim=(0,100))

In [None]:
from sklearn.cluster import KMeans
# Two-cluster k-means with a fixed seed.
# NOTE(review): this rebinds `neigh`, which earlier held the NearestNeighbors model.
neigh = KMeans(n_clusters=2, random_state=0).fit(X)

In [None]:
# Cluster index for every row; used to colour the scatter plot in the final cell.
y_kmeans = neigh.predict(X)

In [None]:
# Cluster centres of the two-cluster k-means model. That model is bound to `neigh`;
# no variable called `kmeans` exists in this notebook, so the old name raised NameError.
centers = neigh.cluster_centers_

In [None]:
# Column 6 of the feature matrix. X is a NumPy array (built with np.concatenate),
# so it is indexed directly; it has no `.values` attribute.
X[:, 6]

In [None]:
# Feature columns 6 and 7, coloured by k-means cluster. X is a NumPy array, so it
# is indexed directly (it has no `.values`).
plt.scatter(X[:, 6], X[:, 7], c=y_kmeans, s=50)

# Overlay the cluster centres using the same two feature columns as the points,
# so they land in the same coordinate space.
plt.scatter(centers[:, 6], centers[:, 7], c='black', s=200, alpha=0.5);