Based on https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e

In [1]:
import pandas as pd

# Load the scraped recipe comments; the CSV is expected to sit next to the notebook.
reviews_df = pd.read_csv(filepath_or_buffer="salted-tahini-chocolate-chip-cookies.csv")

# Preview the first five rows as the cell output.
reviews_df.head(5)


Unnamed: 0,userDisplayName,commentBody
0,lmk,Yum. These took much longer than 16 minutes t...
1,Sonya,If you follow the recipe as written the tahini...
2,KV,I have made these cookies 5 times. My advice i...
3,MaryN,I liked this- the tahini is slightly more subt...
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...


In [2]:
%%time

# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls
    back to noun.

    Args:
        pos_tag: POS tag string (the second element of each pair produced
            by the tagger used in ``clean_text``).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    # Checked in the same order as the tag prefixes are listed here.
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    """Normalise a raw review into a space-separated string of lemmas.

    Steps: lower-case, split on whitespace, strip leading/trailing
    punctuation from each token, drop tokens containing digits, drop
    English stop words, POS-tag, lemmatise with the POS-appropriate
    WordNet category, and finally drop one-character results.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or the
            NaN pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the filtering.
    """
    # Missing values are not str; there is nothing to clean.
    if not isinstance(text, str):
        return ""
    # lower text, tokenize on any whitespace run (spaces, tabs, newlines)
    # and remove punctuation from both ends of every token
    tokens = [word.strip(string.punctuation) for word in text.lower().split()]
    # remove empty tokens and words that contain numbers
    tokens = [word for word in tokens
              if word and not any(c.isdigit() for c in word)]
    if not tokens:
        return ""
    # remove stop words (a set gives constant-time membership tests)
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize text, reusing one lemmatizer for the whole review
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word, get_wordnet_pos(tag))
              for word, tag in pos_tags]
    # remove words with only one letter
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]
    # join all
    return " ".join(lemmas)

# Run every raw comment through clean_text and keep the result in a new column.
reviews_df["review_clean"] = reviews_df["commentBody"].apply(clean_text)

CPU times: user 4.73 s, sys: 667 ms, total: 5.4 s
Wall time: 10.1 s


In [3]:
reviews_df

Unnamed: 0,userDisplayName,commentBody,review_clean
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...
...,...,...,...
350,Stephanie Bellows,I hate to ask - but the calories for this wond...,hate ask calorie wonderful cooky
351,Amanda,Can you make these with a small cookie scoop a...,make small cookie scoop amount burn small
352,Audrey,Can something be substituted for the Tahini? S...,something substitute tahini sesame allergy
353,C PAULL,RE: TAHINI Sauce or paste???,tahini sauce paste


In [4]:
# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Score the raw comment text (commentBody, not review_clean); each call
# returns a dict, and each dict key becomes a column of sentiment_df.
sentiment_scores = [sid.polarity_scores(comment) for comment in reviews_df["commentBody"]]
sentiment_df = pd.DataFrame(sentiment_scores, index=reviews_df.index)

# Drop any score columns left over from a previous run of this cell before
# attaching the fresh ones, so re-running never produces duplicate columns.
reviews_df = pd.concat(
    [reviews_df.drop(columns=sentiment_df.columns, errors="ignore"), sentiment_df],
    axis=1,
)



In [5]:
reviews_df

Unnamed: 0,userDisplayName,commentBody,review_clean,neg,neu,pos,compound
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...,0.050,0.834,0.116,0.5499
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...,0.000,0.876,0.124,0.7501
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...,0.000,0.805,0.195,0.9493
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...,0.035,0.876,0.089,0.5256
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...,0.000,0.817,0.183,0.8625
...,...,...,...,...,...,...,...
350,Stephanie Bellows,I hate to ask - but the calories for this wond...,hate ask calorie wonderful cooky,0.143,0.487,0.369,0.6920
351,Amanda,Can you make these with a small cookie scoop a...,make small cookie scoop amount burn small,0.000,0.909,0.091,0.1531
352,Audrey,Can something be substituted for the Tahini? S...,something substitute tahini sesame allergy,0.000,1.000,0.000,0.0000
353,C PAULL,RE: TAHINI Sauce or paste???,tahini sauce paste,0.000,1.000,0.000,0.0000


In [6]:
# Length of each raw comment in characters.
reviews_df["nb_chars"] = reviews_df["commentBody"].map(len)

# Number of pieces obtained by splitting each raw comment on single spaces.
reviews_df["nb_words"] = reviews_df["commentBody"].map(
    lambda comment: len(comment.split(" "))
)

In [7]:
reviews_df

Unnamed: 0,userDisplayName,commentBody,review_clean,neg,neu,pos,compound,nb_chars,nb_words
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...,0.050,0.834,0.116,0.5499,192,38
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...,0.000,0.876,0.124,0.7501,282,51
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...,0.000,0.805,0.195,0.9493,420,77
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...,0.035,0.876,0.089,0.5256,344,64
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...,0.000,0.817,0.183,0.8625,295,49
...,...,...,...,...,...,...,...,...,...
350,Stephanie Bellows,I hate to ask - but the calories for this wond...,hate ask calorie wonderful cooky,0.143,0.487,0.369,0.6920,60,12
351,Amanda,Can you make these with a small cookie scoop a...,make small cookie scoop amount burn small,0.000,0.909,0.091,0.1531,90,18
352,Audrey,Can something be substituted for the Tahini? S...,something substitute tahini sesame allergy,0.000,1.000,0.000,0.0000,60,9
353,C PAULL,RE: TAHINI Sauce or paste???,tahini sauce paste,0.000,1.000,0.000,0.0000,28,5


In [8]:
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tokenise every cleaned review once; the same token lists feed both the
# training corpus and the inference step below.
tokenized_reviews = [review.split(" ") for review in reviews_df["review_clean"]]
documents = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(tokenized_reviews)]

# train a Doc2Vec model with our text data
# NOTE(review): with workers=4 the vectors are expected to vary between runs;
# gensim's documentation describes single-worker training as a requirement
# for reproducible results -- confirm before relying on exact values.
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

# transform each document into a vector, one column per vector component
doc2vec_df = pd.DataFrame(
    [model.infer_vector(tokens) for tokens in tokenized_reviews],
    index=reviews_df.index,
)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]

# Drop vector columns left over from a previous run of this cell before
# attaching the fresh ones, so re-running never produces duplicate columns.
reviews_df = pd.concat(
    [reviews_df.drop(columns=doc2vec_df.columns, errors="ignore"), doc2vec_df],
    axis=1,
)

In [9]:
reviews_df

Unnamed: 0,userDisplayName,commentBody,review_clean,neg,neu,pos,compound,nb_chars,nb_words,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...,0.050,0.834,0.116,0.5499,192,38,0.062290,0.178868,0.016344,0.048699,-0.078398
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...,0.000,0.876,0.124,0.7501,282,51,-0.036574,-0.006180,-0.071720,-0.007925,0.091834
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...,0.000,0.805,0.195,0.9493,420,77,0.168327,0.225310,-0.080924,-0.125021,0.090596
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...,0.035,0.876,0.089,0.5256,344,64,-0.027194,0.010966,-0.007126,0.045270,-0.011999
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...,0.000,0.817,0.183,0.8625,295,49,0.087974,0.220631,-0.004066,-0.076745,0.033950
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,Stephanie Bellows,I hate to ask - but the calories for this wond...,hate ask calorie wonderful cooky,0.143,0.487,0.369,0.6920,60,12,-0.009293,-0.034945,0.028141,-0.011038,0.051403
351,Amanda,Can you make these with a small cookie scoop a...,make small cookie scoop amount burn small,0.000,0.909,0.091,0.1531,90,18,-0.084270,0.026410,-0.091639,-0.071335,0.094652
352,Audrey,Can something be substituted for the Tahini? S...,something substitute tahini sesame allergy,0.000,1.000,0.000,0.0000,60,9,-0.017896,0.110744,-0.107110,0.037301,0.090463
353,C PAULL,RE: TAHINI Sauce or paste???,tahini sauce paste,0.000,1.000,0.000,0.0000,28,5,-0.055699,-0.073661,0.029431,0.039894,0.002093
