Based on https://towardsdatascience.com/detecting-bad-customer-reviews-with-nlp-d8b36134dc7e

In [20]:
import pandas as pd
import numpy as np

In [3]:
tahini_df = pd.read_csv("salted-tahini-chocolate-chip-cookies.csv")

tahini_df.head()

Unnamed: 0,userDisplayName,commentBody
0,lmk,Yum. These took much longer than 16 minutes t...
1,Sonya,If you follow the recipe as written the tahini...
2,KV,I have made these cookies 5 times. My advice i...
3,MaryN,I liked this- the tahini is slightly more subt...
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...


In [4]:
%%time

# return the wordnet object value corresponding to the POS tag
from nltk.corpus import wordnet

def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    # lower text
    text = text.lower()
    # tokenize text and remove puncutation
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    text = [word for word in text if not any(c.isdigit() for c in word)]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # pos tag text
    pos_tags = pos_tag(text)
    # lemmatize text
    text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all
    text = " ".join(text)
    return(text)

# clean text data
tahini_df["review_clean"] = tahini_df["commentBody"].apply(lambda x: clean_text(x))

CPU times: user 5.05 s, sys: 655 ms, total: 5.71 s
Wall time: 11.1 s


In [5]:
# will use an established sentiment analyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer




In [6]:
sid = SentimentIntensityAnalyzer()
tahini_df["sentiments"] = tahini_df["commentBody"].apply(lambda x: sid.polarity_scores(x))
tahini_df = pd.concat([tahini_df.drop(['sentiments'], axis=1), tahini_df['sentiments'].apply(pd.Series)], axis=1)

In [7]:
tahini_df.head()

Unnamed: 0,userDisplayName,commentBody,review_clean,neg,neu,pos,compound
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...,0.05,0.834,0.116,0.5499
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...,0.0,0.876,0.124,0.7501
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...,0.0,0.805,0.195,0.9493
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...,0.035,0.876,0.089,0.5256
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...,0.0,0.817,0.183,0.8625


In [8]:
tahini_df['sentiment'] = 0

In [9]:
# assign a sentiment of 1 if the compound score
# is >=0.75
# else assign a sentiment of 0
# set a compound 
tahini_df.loc[tahini_df.compound >=0.75, 'sentiment'] = 1

In [10]:
tahini_df.head()

Unnamed: 0,userDisplayName,commentBody,review_clean,neg,neu,pos,compound,sentiment
0,lmk,Yum. These took much longer than 16 minutes t...,yum take much long minute cook i'm also denver...,0.05,0.834,0.116,0.5499,0
1,Sonya,If you follow the recipe as written the tahini...,follow recipe write tahini sesame flavour cook...,0.0,0.876,0.124,0.7501,1
2,KV,I have made these cookies 5 times. My advice i...,make cooky time advice everything recipe say d...,0.0,0.805,0.195,0.9493,1
3,MaryN,I liked this- the tahini is slightly more subt...,like tahini slightly subtle pb cookie combine ...,0.035,0.876,0.089,0.5256,0
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,use shaila m's tweak bake first tray straight ...,0.0,0.817,0.183,0.8625,1


In [24]:
y_predicted = np.array(tahini_df.sentiment.values.tolist())

In [11]:
len(tahini_df)

355

In [13]:
tahini_df_with_sentiment = pd.read_csv('salted-tahini-chocolate-chip-cookies-with-sentiment.csv')

In [14]:
tahini_df_with_sentiment.head()

Unnamed: 0,user,comment,sentiment
0,lmk,Yum. These took much longer than 16 minutes t...,pos
1,Sonya,If you follow the recipe as written the tahini...,pos
2,KV,I have made these cookies 5 times. My advice i...,pos
3,MaryN,I liked this- the tahini is slightly more subt...,pos
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,pos


In [17]:
tahini_df_with_sentiment['sentiment_class'] = 0
tahini_df_with_sentiment.loc[tahini_df_with_sentiment['sentiment']=='pos', 'sentiment_class'] = 1
tahini_df_with_sentiment.head()

Unnamed: 0,user,comment,sentiment,sentiment_class
0,lmk,Yum. These took much longer than 16 minutes t...,pos,1
1,Sonya,If you follow the recipe as written the tahini...,pos,1
2,KV,I have made these cookies 5 times. My advice i...,pos,1
3,MaryN,I liked this- the tahini is slightly more subt...,pos,1
4,Maggie B,Used Shaila M's tweaks. Baked first tray strai...,pos,1


In [18]:
len(tahini_df_with_sentiment)

355

In [22]:
y_actual = np.array(tahini_df_with_sentiment.sentiment_class.values.tolist())

In [23]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [25]:
accuracy_score(y_actual, y_predicted)

0.647887323943662

In [27]:
roc_auc_score(y_actual, y_predicted)

0.7083060347965349