In [1]:
import os
import sys
# Put the parent directory (os.pardir, i.e. '..') on the import path.
# Presumably this is what lets the project-local `credible` package imported
# below resolve when the notebook runs from a subfolder — TODO confirm.
sys.path.append(os.pardir)

In [2]:
import datetime
import pandas as pd
from sklearn import preprocessing
from credible import connectors

In [3]:
# Database handle obtained from the project's connector helper; every pandas
# read and write in this notebook is given `engine` as its connection.
engine = connectors.connect_to_sqlite()

In [4]:
%%time
# Load the complete `users` and `reviews` tables into memory. This is the slow
# step (roughly three minutes of wall time in the recorded run), so the results
# are kept untouched in the *_raw frames and sampled from afterwards.
users_raw = pd.read_sql_table('users', engine)
reviews_raw = pd.read_sql_table('reviews', engine)

CPU times: user 1min 15s, sys: 31.7 s, total: 1min 47s
Wall time: 3min 10s


In [19]:
%%time
# for debug purposes
users = users_raw.sample(500000)
reviews = reviews_raw.sample(500000)

CPU times: user 1.9 s, sys: 732 ms, total: 2.63 s
Wall time: 2.63 s


In [20]:
%%time
reviews.date = pd.to_datetime(reviews.date)
reviews['_date'] = reviews.date.dt.date
reviews['_time'] = reviews.date.dt.time

CPU times: user 758 ms, sys: 19.1 ms, total: 777 ms
Wall time: 777 ms


## EDA on Reviews

In [21]:
# Peek at two randomly chosen review rows.
reviews.sample(2)

Unnamed: 0,_id,review_id,business_id,user_id,stars,date,text,useful,funny,cool,_date,_time
224652,224653,wP7T94jSLADqkl3ULdRVFA,2mdF-c66TY3eSyTVI4GhQg,-ZqPTpx9TyyvdAWbajdilA,4,2015-12-13 03:31:10,This was a much different store the last time ...,4,1,2,2015-12-13,03:31:10
6062553,6062554,kzL3pRiV3jtu8YvXo4GVvg,3kdSl5mo9dWC4clrQjEDGg,cctzuSXnQgHsv2KIVX8Qug,5,2016-11-10 19:07:00,Best Hawaiian burger steak in vegas! Loco moco...,0,0,0,2016-11-10,19:07:00


In [22]:
# List the review columns, including the derived `_date` / `_time`.
reviews.columns

Index(['_id', 'review_id', 'business_id', 'user_id', 'stars', 'date', 'text',
       'useful', 'funny', 'cool', '_date', '_time'],
      dtype='object')

## EDA on Users

In [23]:
# Peek at two randomly chosen user rows.
users.sample(2)

Unnamed: 0,_id,user_id,name,review_count,yelping_since,friends,useful,funny,cool,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
440585,440586,qwlsMuXSDuORY9Ep0K-fGQ,John,28,2011-02-22 21:30:30,,48,4,7,0,...,0,0,0,0,0,0,0,0,0,0
898730,898731,trbZPYf2ExgvsgnK5XhX9A,J,9,2010-01-08 17:52:23,"2SZlDB2BBvmuEoNRTz2qUQ, omBPSU97G8fSvsoQyMULwg...",17,2,8,2,...,0,0,0,0,0,1,0,0,0,0


In [24]:
# Full list of user columns (the sample display above elides the middle ones).
users.columns

Index(['_id', 'user_id', 'name', 'review_count', 'yelping_since', 'friends',
       'useful', 'funny', 'cool', 'fans', 'elite', 'average_stars',
       'compliment_hot', 'compliment_more', 'compliment_profile',
       'compliment_cute', 'compliment_list', 'compliment_note',
       'compliment_plain', 'compliment_cool', 'compliment_funny',
       'compliment_writer', 'compliment_photos'],
      dtype='object')

## Create User Features

In [25]:
# Feature table that is written to the database at the end of the notebook.
# It starts with only the user ids (keeping the index of `users`); derived
# columns are attached to it further down.
df = pd.DataFrame({'user_id': users['user_id']})
df.sample(2)

Unnamed: 0,user_id
120917,6m6a0Gd9x1mLkJv3MsSYkg
692368,Xed0Ea8d4-zbhcbgbzcNgw


### Number of Friends

In [26]:
def _count_friends(friends):
    """Return how many friend ids a comma-separated `friends` value holds.

    Missing values (None/NaN) and blank strings count as zero friends, and
    blank fragments between commas are ignored.
    """
    if not isinstance(friends, str):
        return 0
    # ''.split(',') is [''], so a plain len(split) reports one friend for a
    # user whose friends field is empty; count only the non-blank ids.
    return sum(1 for friend_id in friends.split(',') if friend_id.strip())


df['num_of_friends'] = users.friends.apply(_count_friends)

In [27]:
# Spot-check two rows of the feature table after adding `num_of_friends`.
df.sample(2)

Unnamed: 0,user_id,num_of_friends
702948,0WLejTMJftjhALddHD7fqw,12
1484714,MXl5XwQETCuWITY1lED8Bg,4


### Maximum Number of Reviews (In a day)

In [28]:
# One row per (user, calendar day): how many reviews were posted, plus the
# posting times, business ids and review ids collected into lists.
per_user_day_aggs = {
    '_id': 'count',
    '_time': list,
    'business_id': list,
    'review_id': list,
}
df_g_user_date = reviews.groupby(['user_id', '_date']).agg(per_user_day_aggs)

In [37]:
# Number of distinct businesses the user reviewed on that day.
df_g_user_date['count_business'] = df_g_user_date.business_id.apply(lambda x: len(set(x)))
# The tables displayed further down contain a `count_review` column that no
# remaining cell creates, so a fresh run could not reproduce them. It is rebuilt
# here by analogy with count_business (distinct review ids per user/day).
# NOTE(review): confirm this matches the cell that originally produced it.
df_g_user_date['count_review'] = df_g_user_date.review_id.apply(lambda x: len(set(x)))

In [68]:
# All per-day aggregates of the one reviewer singled out for a closer look.
is_target_user = df_g_user_date.index.get_level_values('user_id') == 'zzPvEodjvLPe-5NvV0U_cg'
df_g_user_date[is_target_user]

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,_time,business_id,review_id,count_business,count_review
user_id,_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zzPvEodjvLPe-5NvV0U_cg,2011-05-10,1,[04:40:04],[K7lWdNUhCbcnEvI0NhGewg],[lRLXvYVitpvJD6Eucipb4A],1,1
zzPvEodjvLPe-5NvV0U_cg,2011-08-05,2,"[01:43:47, 01:52:08]","[CgnHLwWbteqb8qxcZgBLyw, vl2IZrNJEA8npSjqXbdwxw]","[9-VyXjAcQNw3GVue13RJvg, hSLugH4SiJTh9SWCAGRMKw]",2,2
zzPvEodjvLPe-5NvV0U_cg,2011-09-24,1,[00:29:46],[I6u-OhmI_IUltexEOSCKzg],[ntDaumWdq7C9gDlbTURNPw],1,1
zzPvEodjvLPe-5NvV0U_cg,2012-02-01,1,[06:37:18],[Vs7gc9EE3k9wARuUcN9piA],[JNUobt24dhTIGgGZSoRQ5w],1,1
zzPvEodjvLPe-5NvV0U_cg,2012-03-09,1,[19:02:00],[DiCWSyoaBkIpOVUdIO7Jlg],[2RyvRH6FRniZFzv8kIAilg],1,1
zzPvEodjvLPe-5NvV0U_cg,2012-06-07,1,[07:28:32],[4qG3Gh8QyugC1XvoTlmgcQ],[T5COiFgzRi7lFPHkx9dQ3Q],1,1
zzPvEodjvLPe-5NvV0U_cg,2012-09-30,1,[08:36:07],[982dasWTyuKUPbGTifUlEA],[lD0A31B8uHO8VYBNZUR8kg],1,1
zzPvEodjvLPe-5NvV0U_cg,2012-12-06,1,[00:57:54],[w2g2f6D8dgsuTMFw8M_SsA],[tkkP6TN4swFdXKH-CPWrhQ],1,1
zzPvEodjvLPe-5NvV0U_cg,2013-01-11,1,[22:43:14],[_OX2t8sR3AERk-VcTL5JFw],[ZG8TKVlaP4XK8w5mfHdirQ],1,1
zzPvEodjvLPe-5NvV0U_cg,2013-02-07,1,[07:10:59],[SktLID5_ywSlgVE-h3NB3Q],[kJBE8FGCfSHWiWbdkE_z4w],1,1


In [66]:
# Narrow down to that reviewer's 2013-02-17 entry, keeping only days with
# more than one review.
user_level = df_g_user_date.index.get_level_values('user_id')
date_level = df_g_user_date.index.get_level_values('_date')
is_burst_day = (
    (df_g_user_date['_id'] > 1)
    & (user_level == 'zzPvEodjvLPe-5NvV0U_cg')
    & (date_level == datetime.date(2013, 2, 17))
)
df_g_user_date[is_burst_day]

Unnamed: 0_level_0,Unnamed: 1_level_0,_id,_time,business_id,review_id,count_business,count_review
user_id,_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
zzPvEodjvLPe-5NvV0U_cg,2013-02-17,5,"[19:26:58, 19:26:31, 19:26:11, 19:25:52, 19:26...","[-sjCxkxv6xU5rEVLFybAuA, I01C9Xg_SVyMrQSGIGeQ-...","[7sw5QUix_i1U52tII4qPRA, fZo5AHB5p86z440OhpZ6-...",5,5


In [70]:
# How this reviewer's star ratings are distributed across the sampled reviews.
is_target_user = reviews['user_id'] == 'zzPvEodjvLPe-5NvV0U_cg'
reviews.loc[is_target_user, 'stars'].value_counts()

4    7
3    7
2    5
1    2
Name: stars, dtype: int64

In [80]:
# Mean character length of this reviewer's review texts.
target_texts = reviews.loc[reviews['user_id'] == 'zzPvEodjvLPe-5NvV0U_cg', 'text']
target_texts.map(len).mean()

516.1904761904761

In [76]:
# The five reviews being inspected, looked up by review id.
burst_review_ids = [
    '7sw5QUix_i1U52tII4qPRA',
    'fZo5AHB5p86z440OhpZ6-w',
    'hb383dTOk0_03YUwYggLow',
    'Q3EaN1JOG6hIfhJKuLrbfQ',
    'oR1fpVanlm89wkN6AsZAEw',
]
# Columns from position 4 onward: skips the four id columns so the display
# starts at `stars`.
reviews[reviews['review_id'].isin(burst_review_ids)].iloc[:, 4:]

Unnamed: 0,stars,date,text,useful,funny,cool,_date,_time
470128,3,2013-02-17 19:26:58,Haven't eaten here in a long time...last time ...,2,1,1,2013-02-17,19:26:58
6408260,4,2013-02-17 19:26:31,Great deal on Valentine's! For $30 you get an ...,1,0,0,2013-02-17,19:26:31
5877845,2,2013-02-17 19:26:11,Whoa...what happened?!?! Place was exceptional...,3,0,0,2013-02-17,19:26:11
2448673,3,2013-02-17 19:25:52,Aw they were out of tres leche...that's what w...,1,0,0,2013-02-17,19:25:52
3724765,4,2013-02-17 19:26:44,Yum! We ordered the 4 for $5 deal....nicely pr...,1,1,0,2013-02-17,19:26:44


### Percentage of Positive Reviews

The majority of spammers have more than 80% of their reviews as 4-5 stars.

### Review Length

The average review length of spammers is generally short, because writing long reviews takes too much time. Reviewers whose reviews are shorter than 140 are generally spammers.

### Reviewer Deviation

Deviation from the general rating consensus. To measure a reviewer's deviation, we first compute the absolute rating deviation of each review from the other reviews of the same business. Then we compute the expected rating deviation of the reviewer over all of their reviews. On a 5-star scale, the deviation can range from 0 to 4; a value above 2.5 indicates a spammer.

### Maximum Content Similarity

The maximum cosine similarity between any two reviews by the same reviewer. The lower this value, the better: non-spammers generally score below 0.20, while scores above 0.30 generally indicate spammers.

### Multiple Reviews in a day and Business Locations

Writing multiple reviews for businesses in multiple cities on the same day is suspicious behavior for a reviewer; such reviewers are potential spammers.

## Create Friends Mapping

## Create the Table

### meta table

In [None]:
# Write the derived per-user features to table `users_meta`, leaving out the
# DataFrame index.
# NOTE(review): `df` is built from the 500000-row debug sample of users, and no
# `if_exists` is passed (pandas' documented default is to fail when the table
# already exists) — confirm both are intended before running this for real.
df.to_sql(name='users_meta', con=engine, index=False)

In [None]:
# Read the table back and show its first two rows as a check on the write.
pd.read_sql_table('users_meta', engine).head(2)