In [1]:
import os
import sys
sys.path.append(os.pardir)

In [2]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from credible import connectors

In [3]:
pd.options.display.float_format = '{:,.2f}'.format

### Parameters

In [4]:
engine = connectors.connect_to_sqlite()

In [5]:
%%time
businesses = pd.read_sql_table('businesses', engine)
users = pd.read_sql_table('users', engine)
reviews = pd.read_sql_table('reviews', engine)
users_meta = pd.read_sql_table('users_meta', engine)
reviews_meta = pd.read_sql_table('reviews_meta', engine)

CPU times: user 1min 53s, sys: 37.7 s, total: 2min 30s
Wall time: 3min 56s


### Dataframe

In [34]:
df = reviews.merge(
    reviews_meta, how='left', on='review_id').merge(
        users_meta, how='left', on='user_id')

In [39]:
df.drop('text', axis=1, inplace=True)
df = df[df.useful != 0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [40]:
df.shape

(3115447, 14)

In [41]:
df.head(2)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,useful,funny,cool,newest_review_date,days_past,text_length,text_length_category,num_of_friends
0,1,Q1sbwvVQXV2734tPgoKj4Q,ujmEBvifdJM6h6RLv4wQIg,hG7b0MtEbXx5QzbzE6C_VA,1,2013-05-07 04:34:36,6,1,0,2018-11-14 06:12:10,2017,204,1,1
2,3,2TzJjDVDEuAW6MR5Vuc1ug,WTqjgwHlXbSFevF32_DJVw,n6-Gk65cPZL6Uz8qRm3NYw,5,2016-11-09 20:09:03,3,0,0,2018-09-11 20:29:15,671,615,6,2


In [45]:
df['fake_potential'] = df.useful < 5

In [46]:
df.fake_potential.value_counts()

True     2651253
False     464194
Name: fake_potential, dtype: int64

## Preprocessing

In [9]:
features_continous = df.loc[:, ['stars', 'days_past', 'text_length', 'useful', 'funny', 'cool',
       'text_length_category', 'num_of_friends']]
features_categorical = df.loc[:, ['stars', 'text_length_category']]

In [10]:
features_categorical.sample(2)

Unnamed: 0,stars,text_length_category
730557,4,9
898712,5,6


In [11]:
features_continous.sample(2)

Unnamed: 0,stars,days_past,text_length,useful,funny,cool,text_length_category,num_of_friends
184971,5,984,279,0,0,0,3,2
907663,1,0,850,0,0,0,7,53


In [12]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

category_list = [f'stars_{i}' for i in range(1, 6)] + [f'textlen_{i}' for i in range(1, 11)]

scaler = MinMaxScaler()
onehot = OneHotEncoder(categories='auto', sparse=False)

values_continuous = scaler.fit_transform(features_continous)
values_categorical = onehot.fit_transform(features_categorical)

values_continuous.shape, values_categorical.shape

((1000000, 8), (1000000, 15))

In [13]:
X = np.concatenate((values_continuous, values_categorical), axis=1)
X.shape

(1000000, 23)

## Supervised