https://s3.amazonaws.com/amazon-reviews-pds/readme.html

```

DATA COLUMNS:
marketplace       - 2 letter country code of the marketplace where the review was written.
customer_id       - Random identifier that can be used to aggregate reviews written by a single author.
review_id         - The unique ID of the review.
product_id        - The unique Product ID the review pertains to. In the multilingual dataset the reviews
                    for the same product in different countries can be grouped by the same product_id.
product_parent    - Random identifier that can be used to aggregate reviews for the same product.
product_title     - Title of the product.
product_category  - Broad product category that can be used to group reviews 
                    (also used to group the dataset into coherent parts).
star_rating       - The 1-5 star rating of the review.
helpful_votes     - Number of helpful votes.
total_votes       - Number of total votes the review received.
vine              - Review was written as part of the Vine program.
verified_purchase - The review is on a verified purchase.
review_headline   - The title of the review.
review_body       - The review text.
review_date       - The date the review was written.
```

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('amazon_reviews_us_Grocery_v1_00.tsv.gz', 
                 nrows=40000, sep='\t', error_bad_lines=False)

df = df.drop(['marketplace', 'product_category'], axis=1)

df = df[~df['review_body'].isnull()]

df['review_body'] = df['review_headline'] + '. ' + df['review_body']

df = df[df['review_body'].apply(len) > 40]

b'Skipping line 1925: expected 15 fields, saw 22\nSkipping line 1977: expected 15 fields, saw 22\nSkipping line 35265: expected 15 fields, saw 22\n'


In [2]:
df.head(5)

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",5,0,0,N,Y,Using these for years - love them.,Using these for years - love them.. As a famil...,2015-08-31
1,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",5,0,0,N,Y,Wonderful,"Wonderful. My favorite nut. Creamy, crunchy, ...",2015-08-31
2,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,5,0,0,N,N,Five Stars,Five Stars. This green tea tastes so good! My ...,2015-08-31
3,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,5,0,0,N,Y,Five Stars,Five Stars. I love Melissa's brand but this is...,2015-08-31
6,32778285,R1S1XSG4ZCHDGS,B00BCNSTRA,578681693,"Larabar uber, 1.42 Ounce (Pack of 15)",5,1,1,N,Y,Five Stars,"Five Stars. Love these bars, but have to watch...",2015-08-31


- any user created >5 reviews in a single day
- any user created >=3 reviews with average star rating <=2
- any reviews appeared twice or more (reviews with same content)

# manipulated/useless reviews

In [3]:
bad_customer = []

In [4]:
customer_count = df.groupby(['customer_id', 'review_date'])['product_id'].count().reset_index()
customer_count.columns = ['customer_id', 'review_date', 'review_count']

customer_count = customer_count[customer_count['review_count'] > 5]
customer_count.sort_values(by='review_count')
# number of reviews created by each user

Unnamed: 0,customer_id,review_date,review_count
0,10805,2015-08-19,6
9985,20613913,2015-08-25,6
10787,22476158,2015-08-21,6
11026,22943881,2015-08-22,6
11169,23220887,2015-08-22,6
...,...,...,...
16492,36290808,2015-08-31,15
16911,37293783,2015-08-19,17
1093,1535682,2015-08-28,17
10027,20674418,2015-08-29,18


In [5]:
bad_customer += list(customer_count['customer_id'])
len(bad_customer)

118

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
review_tfidf = tfidf.fit_transform(df['review_body'])

In [7]:
IDX = 0

# 1000 reviews per time
for IDX_start in range(int(review_tfidf.shape[0]/1000) +1):
    print(IDX_start)
    
    idss = np.dot(
        review_tfidf[
            IDX_start*1000:
            (IDX_start+1)*1000
        ], 
        review_tfidf.T
    ).toarray()
    # 0， 1
    

    for ids in idss:
        IDX += 1

        if len(np.where(ids >= 0.99)[0])==1:
            continue
        
        # print(np.where(ids >= 0.99)[0])
        
        df.iloc[np.where(ids >= 0.99)[0]][['customer_id', 'review_id', 'star_rating', 'review_body']].sort_values(by='star_rating').to_csv('t.csv', index=None, header=None, mode='a')
        
        # bad_list.extend(list(np.where(ids >= 0.99)[0]))
        bad_customer += list(df.iloc[np.where(ids >= 0.99)[0]]['customer_id'])
        # A -> B
        # B -> A

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30


In [8]:
 df.iloc[np.where(ids >= 0.99)[0]][
     ['customer_id', 'review_id', 'review_body']].to_csv('t.csv', index=None, header=None, mode='a')

In [9]:
len(bad_customer)

2768

In [10]:
customer_rate_mean = df.groupby(['customer_id'])['star_rating'].mean()
customer_rate_count = df['customer_id'].value_counts()

customer_rate_mean = customer_rate_mean.reset_index()
customer_rate_mean.columns = ['customer_id', 'rating']

customer_rate_count = customer_rate_count.reset_index()
customer_rate_count.columns = ['customer_id', 'count']

In [11]:
customer_df = pd.merge(customer_rate_mean, customer_rate_count, on='customer_id')
customer_df.sort_values(by='count', ascending=False)

Unnamed: 0,customer_id,rating,count
8251,17162238,4.632653,49
15292,34247947,4.906250,32
15696,35178127,4.454545,22
9824,20674418,5.000000,18
16560,37293783,5.000000,17
...,...,...,...
9496,19967776,2.000000,1
9495,19963619,1.000000,1
9494,19963378,1.000000,1
9493,19962730,5.000000,1


In [12]:
customer_df[(customer_df['count'] >= 3) & (customer_df['rating'] <=2)]

Unnamed: 0,customer_id,rating,count
1797,3263580,1.0,3
1928,4020516,1.666667,3
3071,9927828,2.0,3
3787,10985996,1.0,3
3800,11004972,1.666667,3
5474,13291352,2.0,3
6498,14606512,1.0,6
6526,14646725,1.0,3
7356,15728302,1.0,3
7777,16355599,1.0,3


In [13]:
bad_customer += list(customer_df[(customer_df['count'] >= 3) & (customer_df['rating'] <=2)]['customer_id'])

In [14]:
select_customer = customer_df[(customer_df['count'] >= 3) & (customer_df['rating'] <=2)]['customer_id']

df[df['customer_id'].isin(select_customer)][['customer_id','review_headline', 'review_body']].sort_values(by='customer_id')

Unnamed: 0,customer_id,review_headline,review_body
18664,3263580,Refund,"Refund. Damaged, pass expire racin date, not f..."
17676,3263580,Refund,"Refund. It was bad no top , damaged, missing p..."
13721,3263580,REFUND NOW,REFUND NOW. I want a a refund Now IT WAS ALL G...
10355,4020516,Two Stars,Two Stars. Small bag the price taking into con...
9722,4020516,"A very small bag compared with the prize,","A very small bag compared with the prize,. A v..."
...,...,...,...
32933,49226840,Has sugar in it.,Has sugar in it.. Has sugar in it. According t...
33517,49226840,Too little for so much,Too little for so much. Well I tried it once. ...
10197,52940403,Wasted money. It is also very thin,Wasted money. It is also very thin. It has a f...
12780,52940403,Are you kidding me? 100 dollars for a small ...,Are you kidding me? 100 dollars for a small .....


# filter

In [15]:
df = df[~df['customer_id'].isin(bad_customer)]

In [16]:
import re
bad_words = ['contaminate','disease', 'disgusting',
             'eaten', 'hazardous', 'ill', 'moldy', 'mldew',
             'musty','reeking', 'smelly', 'stale', 'unhealthy']

In [17]:
df_bad = df[df['review_body'].apply(lambda x: bool(re.search('|'.join(bad_words), x.lower())))]

In [18]:
df_bad[df_bad['star_rating'] == 1]['customer_id'].value_counts()

22220300    2
47180254    2
16147806    2
41617771    2
21540019    2
           ..
24108725    1
29366966    1
26375863    1
14005945    1
5817935     1
Name: customer_id, Length: 560, dtype: int64

## sentiment

- positive/negative
- part of speech of words

In [22]:
from textblob import TextBlob

# https://textblob.readthedocs.io/en/dev/
text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

blob = TextBlob(text)

for sentence in blob.sentences:
    print(sentence, sentence.sentiment.polarity, sentence.sentiment.subjectivity)
    print('')


The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact. 0.06000000000000001 0.605

Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant. -0.34166666666666673 0.7666666666666666



In [23]:
# sentiment score <-0.5 with including the bad words -> related to hazardous product

bad_comment = []

from tqdm import tqdm
for line in tqdm(df[df['star_rating'] == 1]['review_body'].iloc[:]):
    line = line.replace('<br />', '')
    
    blob = TextBlob(line)
    for sentence in blob.sentences:
        # print(sentence, sentence.sentiment.polarity, sentence.sentiment.subjectivity)
        if sentence.sentiment.polarity < -0.5 and re.search('|'.join(bad_words), line.lower()):
            bad_comment.append(line)
            

100%|██████████| 2418/2418 [00:02<00:00, 967.93it/s] 


In [36]:
# sentiment score >0.5 without including the bad words -> not related to the hazardous product

good_comment = []

from tqdm import tqdm
for line in tqdm(df[df['star_rating'] == 5]['review_body'].iloc[:500]):
    line = line.replace('<br />', '')
    
    blob = TextBlob(line)
    for sentence in blob.sentences:
        # print(sentence, sentence.sentiment.polarity, sentence.sentiment.subjectivity)
        if sentence.sentiment.polarity > 0.5 and not re.search('|'.join(bad_words), line.lower()):
            good_comment.append(line)
            

100%|██████████| 500/500 [00:00<00:00, 1010.73it/s]


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [37]:
# 0-bad
label = [0] * len(bad_comment) + [1] * len(good_comment)
text = bad_comment + good_comment

label = np.array(label) 
# text = np.array(text)

tfidf = TfidfVectorizer().fit(text)
text_tfidf = tfidf.transform(text)

In [38]:
text_tfidf 

<753x3348 sparse matrix of type '<class 'numpy.float64'>'
	with 27429 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.model_selection import train_test_split
# 744 * 0.8
train_text, test_text, train_label, test_label = train_test_split(text_tfidf, label, train_size=0.8,stratify=label)

In [40]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(train_text, train_label)



In [41]:
clf.score(test_text, test_label)

0.9801324503311258

In [43]:
def predict_food_comment(s, show=True):
    s = np.array(s).reshape(1)
    s_tfidf = tfidf.transform(s)
    # [bad, good] 
    if clf.predict_proba(s_tfidf)[0, 0] > 0.4:
        if show:
            print(s, 'bad')
        return 'bad'
    else:
        if show:
            print(s, 'good')
        return 'good'

In [44]:
predict_food_comment('''This is the weirdest "soup" I ever tasted.''')

['This is the weirdest "soup" I ever tasted.'] bad


'bad'

In [45]:
predict_food_comment('''My stomach hurts after I eat it once''')

['My stomach hurts after I eat it once'] bad


'bad'

In [46]:
predict_food_comment('''Very delicious, very good comment''')

['Very delicious, very good comment'] good


'good'

In [47]:
predict_food_comment('''Using these for years - love them.. As a family allergic to wheat''')

['Using these for years - love them.. As a family allergic to wheat'] good


'good'

In [48]:
import codecs
for x in df[df['star_rating'] == 1]['review_body'].iloc[:]:
    if predict_food_comment(x, show=False) == 'bad':
        with codecs.open('bad.csv', 'a') as up:
            up.write(x+'\n')