In [1]:
import os
import sys
sys.path.append(os.pardir)

In [2]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from credible import connectors

In [3]:
pd.options.display.float_format = '{:,.2f}'.format

### Parameters

In [4]:
engine = connectors.connect_to_sqlite()

In [5]:
%%time
businesses = pd.read_sql_table('businesses', engine)
users = pd.read_sql_table('users', engine)
reviews = pd.read_sql_table('reviews', engine)
users_meta = pd.read_sql_table('users_meta', engine)
reviews_meta = pd.read_sql_table('reviews_meta', engine)

CPU times: user 2min 7s, sys: 41.5 s, total: 2min 49s
Wall time: 3min 57s


### Dataframe

In [6]:
df = reviews.merge(
    reviews_meta, how='left', on='review_id').merge(
        users_meta, how='left', on='user_id')

### Creating A Label

In [7]:
df = df[df.useful != 0]
df.shape

(3115447, 15)

In [10]:
# df.drop('text', axis=1, inplace=True)
df = df.iloc[:50000,:]
df.shape

(50000, 15)

### Creating a Label

In [11]:
df.head(2)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,text,useful,funny,cool,newest_review_date,days_past,text_length,text_length_category,num_of_friends
0,1,Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,hG7b0MtEbXx5QzbzE6C_VA,1,2013-05-07 04:34:36,Total bill for this horrible service? Over $8G...,6,1,0,2018-11-14 06:12:10,2017,204,1,1
2,3,2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,n6-Gk65cPZL6Uz8qRm3NYw,5,2016-11-09 20:09:03,I have to say that this office really has it t...,3,0,0,2018-09-11 20:29:15,671,615,6,2


In [12]:
df['fake_potential'] = df.useful < 5

In [13]:
df.fake_potential.value_counts()

True     42630
False     7370
Name: fake_potential, dtype: int64

## Preprocessing

In [14]:
features_continous = df.loc[:, ['stars', 'days_past', 'text_length', 'useful', 'funny', 'cool',
       'text_length_category', 'num_of_friends']]
features_categorical = df.loc[:, ['stars', 'text_length_category']]

In [15]:
features_categorical.sample(2)

Unnamed: 0,stars,text_length_category
29087,4,9
2094,5,3


In [16]:
features_continous.sample(2)

Unnamed: 0,stars,days_past,text_length,useful,funny,cool,text_length_category,num_of_friends
82534,4,2717,723,8,2,2,7,1092
106061,5,445,1015,1,1,0,8,158


In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

category_list = [f'stars_{i}' for i in range(1, 6)] + [f'textlen_{i}' for i in range(1, 11)]

scaler = MinMaxScaler()
onehot = OneHotEncoder(categories='auto', sparse=False)

values_continuous = scaler.fit_transform(features_continous)
values_categorical = onehot.fit_transform(features_categorical)

values_continuous.shape, values_categorical.shape

((50000, 8), (50000, 15))

### X and y

In [18]:
X = np.concatenate((values_continuous, values_categorical), axis=1)
X.shape

(50000, 23)

In [19]:
y = df.fake_potential
y.shape

(50000,)

### Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((33500, 23), (16500, 23))

## Supervised

In [22]:
from sklearn.svm import SVC

clf = SVC(gamma='auto')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
y_pred = clf.predict(X_test)

In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(classification_report(
    y_test, y_pred, target_names=('fake', 'true')))

              precision    recall  f1-score   support

        fake       0.99      0.33      0.50      2430
        true       0.90      1.00      0.95     14070

    accuracy                           0.90     16500
   macro avg       0.94      0.67      0.72     16500
weighted avg       0.91      0.90      0.88     16500



In [30]:
confusion_matrix(y_test, y_pred)

array([[  811,  1619],
       [    6, 14064]])