In [11]:
#disable warnings
import warnings
warnings.filterwarnings("ignore") 

In [12]:
#stuff

# Purpose of this Notebook

In [38]:
# Maximum number of reviews to put into the data frame of good reviews.
# Any value below 0 means "process all of the good reviews".
NUMB_OF_DOCS1 = -1

# Maximum number of reviews to put into the data frame of bad reviews.
# As with NUMB_OF_DOCS1, a value below 0 means "use all of the bad reviews".
NUMB_OF_DOCS2=100

# Set to 1 to re-clean the data frames; any other value loads the result of a
# previous clean from disk instead.
# Note: cleaning takes about 30 minutes on a pretty fast machine.
CLEAN_DATAFRAME = 0

## Problem Statement

Product developers often want to know this about their products:

* For products that people don't like, what is it about the products they don't like? Maybe if we know this we could improve the product by fixing these disliked product features
* For products that people love, what is it about the products that result in people loving them? Maybe if we expanded on these loved features, we could get new customers or better retain current customers

## Proposed Solution

Topic modeling and clustering(?) that identifies the features of products that people love and somehow ranks them so that a product manager can have a priority list of things that could be addressed to get more love
Topic modeling and clustering(?) that identifies the features of products that people don't like and somehow ranks them so that a product manager can have a priority list of things that could be fixed to hopefully turn the disliked product into a liked product.


# Module Import

In [66]:



import datetime
from collections import Counter
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



import re

#to speed up pandas operands
from pandarallel import pandarallel

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import pyLDAvis
import pyLDAvis.sklearn

#gensim
from gensim import corpora, models, similarities, matutils



# NLP Pipeline



## Text Preprocessing
Essentially will do this:

```

(input text) -->[clean(remove text I don't want)]--> [tokenization] --> [lemmatization] --> [Next_Stages]

```

In [15]:
reviews_df = pd.read_csv("../Data/Reviews.csv")

In [16]:
reviews_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [17]:
reviews_df.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


In [18]:
reviews_df.loc[0,"Text"]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [19]:
reviews_df["Score"].value_counts()

5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64

According to the Kaggle description, `HelpfulnessNumerator` should be less than or equal to `HelpfulnessDenominator`, so I will filter out any reviews that do not meet this specification. I am also removing reviews where `HelpfulnessDenominator` is zero, as this means nobody has rated the review, so there is no indication that it is useful (and no helpfulness ratio can be calculated for it):

In [20]:
mask = (reviews_df["HelpfulnessNumerator"] <=  reviews_df["HelpfulnessDenominator"]) & (reviews_df["HelpfulnessDenominator"]!=0)
reviews_df[~mask].shape

(270054, 10)

This removes 270,054 of the 568,454 reviews (roughly 48%), so doing that now:

In [21]:
# Keep only the reviews that pass the helpfulness mask. The explicit .copy()
# gives an independent frame, so the in-place operations and column
# assignments that follow do not act on a slice of the raw data
# (which is what triggers pandas' SettingWithCopyWarning).
reviews_df = reviews_df[mask].copy()

Are there any duplicate entries?

In [22]:
reviews_df.duplicated(subset=["UserId","ProfileName","Time","Text"],keep="first").value_counts()

False    209681
True      88719
dtype: int64

In [23]:
reviews_df.drop_duplicates(subset=["UserId","ProfileName","Time","Text"],keep="first",inplace=True)

In [24]:
reviews_df.dropna(inplace=True)

calculating a helpfulness metric in the hopes that it helps separate reviews.

In [25]:
reviews_df["helpful_rating"] = reviews_df["HelpfulnessNumerator"] / reviews_df["HelpfulnessDenominator"]

In [26]:
# Not helpful
# TODO: Delete
#plt.figure(figsize=(40,40))
#plt.scatter(reviews_df["Score"],reviews_df["helpful_rating"],alpha=0.5)
#plt.xlabel("Score")
#plt.ylabel("helpful rating")
#plt.show();

Going to create a dataframe of bad reviews and one of good reviews to process separately. I remove reviews where "Score" == 3 because they do not indicate a call to action to either improve a bad product or point out a product that likely has nice features.

In [27]:
reviews_df.shape

(209675, 11)

In [28]:
# Split the reviews by sentiment: scores of 4-5 are "good", scores of 1-2 are
# "bad"; reviews with a score of 3 end up in neither frame.
# .copy() makes each subset an independent frame, so adding a column to it
# later is not a chained assignment onto a slice of reviews_df.
good_reviews_df = reviews_df[reviews_df["Score"] >= 4].copy()
bad_reviews_df = reviews_df[reviews_df["Score"] <= 2].copy()

In [29]:
# Clamp the requested document counts to what is actually available:
# a negative request, or one larger than the frame, means "use every row".
if not (0 <= NUMB_OF_DOCS1 <= good_reviews_df.shape[0]):
    NUMB_OF_DOCS1 = good_reviews_df.shape[0]

if not (0 <= NUMB_OF_DOCS2 <= bad_reviews_df.shape[0]):
    NUMB_OF_DOCS2 = bad_reviews_df.shape[0]

In [30]:
#stuff = list(good_reviews_df.Text.head(NUMB_OF_DOCS1))

Thoughts on misspelled words and non-English words:

What I'm trying to do here (TODO describe this) is identify adjectives. So I will remove words that are not in nltk's corpus of words to hopefully remove misspelled words and non-English words.

In [31]:
#Removing HTML from the text
#stuff = [re.sub("<.*?>","",item) for item in stuff]

# Patterns are compiled once instead of being re-parsed for every review.
_HTML_TAG_RE = re.compile(r"<.*?>")
# Same character set the notebook has always stripped: . | ! ? , ; : & ( ) - % _
_PUNCTUATION_RE = re.compile(r"[.|!?,;:&()\-%_]")
# Any token that is, or contains, a digit.
_NUMERIC_TOKEN_RE = re.compile(r"\w*\d\w*")

# Lazily-built cache of the NLTK word list (filled by _get_nltk_vocabulary).
_NLTK_VOCABULARY = None


def _get_nltk_vocabulary():
    """Return the NLTK `words` corpus as a frozenset, building it on first use only."""
    global _NLTK_VOCABULARY
    if _NLTK_VOCABULARY is None:
        _NLTK_VOCABULARY = frozenset(nltk.corpus.words.words())
    return _NLTK_VOCABULARY


def remove_html_punct(row, vocabulary=None):
    """Clean one review: strip HTML, punctuation, numeric tokens and unknown words.

    Steps, in order:
      1. lower-case the text;
      2. replace HTML tags and punctuation with a space, so that the words on
         either side stay separate (e.g. "great.<br />I" -> "great i" instead
         of the fused, unrecognisable "greati");
      3. remove every token that is or contains a digit;
      4. keep only the words found in `vocabulary`, which drops misspelled and
         non-English words.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the raw review string under the key "Text".
    vocabulary : container of str, optional
        Words to keep. Defaults to the NLTK `words` corpus, which is loaded
        once and cached rather than rebuilt on every call.

    Returns
    -------
    str
        The surviving words joined by single spaces ("" if none survive).
    """
    if vocabulary is None:
        vocabulary = _get_nltk_vocabulary()

    text_to_process = row["Text"].lower()
    text_to_process = _HTML_TAG_RE.sub(" ", text_to_process)
    text_to_process = _PUNCTUATION_RE.sub(" ", text_to_process)
    text_to_process = _NUMERIC_TOKEN_RE.sub(" ", text_to_process)

    return " ".join(
        review_word
        for review_word in text_to_process.split()
        if review_word in vocabulary
    )



In [33]:
# Run the (slow) cleaning step only when it was requested in the configuration cell.
if CLEAN_DATAFRAME == 1:

    #TODO If you have time - convert it to a SQL query
    # Initialise pandarallel, which provides the parallel_apply() used below.
    pandarallel.initialize()

    start_time = datetime.datetime.now()

    # Clean every review's "Text" with remove_html_punct and store the result
    # in a new "ProcessedText" column.
    good_reviews_df["ProcessedText"] = good_reviews_df.parallel_apply(remove_html_punct,axis=1)

    end_time = datetime.datetime.now()
    print("time spent doing operation is {}".format(end_time - start_time))
else:
    # Cleaning was not requested, so no "ProcessedText" column is created here.
    print("skipping cleaning the data frame.")

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
time spent doing operation is 0:30:06.271101


In [34]:
good_reviews_df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,helpful_rating,ProcessedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1.0,i have bought several of the vitality canned d...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1.0,this is a confection that been around a few it...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...,1.0,right now mostly just sprouting this so my can...
10,11,B0001PB9FE,A3HDKO7OW0QNK4,Canadian Fan,1,1,5,1107820800,The Best Hot Sauce in the World,I don't know if it's the cactus or the tequila...,1.0,i know if the or the tequila or just the uniqu...
11,12,B0009XLVG0,A2725IB4YY9JEB,"A Poeng ""SparkyGoHome""",4,4,5,1282867200,"My cats LOVE this ""diet"" food better than thei...",One of my boys needed to lose some weight and ...,1.0,one of my to lose some weight and the other i ...


It takes a long time to perform this operation, so I am going to save the result to a pickle file in case I ever need to rerun the notebook without repeating this cleaning step.

In [37]:
# Cache the cleaned good reviews on disk: after a fresh clean the frame is
# written to the pickle file, otherwise the previously saved frame is loaded.
# `with` guarantees the file is closed on both paths, including on error.
if CLEAN_DATAFRAME == 1:
    with open("../Data/good_reviews.pkl","wb") as GOOD_REVIEWS_FILE:
        pickle.dump(good_reviews_df,GOOD_REVIEWS_FILE)
else:
    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only load a pickle file that this notebook wrote itself.
    with open("../Data/good_reviews.pkl","rb") as GOOD_REVIEWS_FILE:
        good_reviews_df = pickle.load(GOOD_REVIEWS_FILE)

In [None]:
good_reviews_df.loc[0,"Text"]

In [None]:
good_reviews_df.loc[0,"ProcessedText"]

**Tokenize**

In [40]:
#stuff_tokenized= []
#stuff_tokenized = [word_tokenize(review) for review in stuff]

def tokenize_text(row):
    """Tokenize one review.

    Reads the cleaned text stored under "ProcessedText" in `row` and returns
    whatever NLTK's word_tokenize() produces for it.
    """
    return word_tokenize(row["ProcessedText"])


In [41]:
# Initialise pandarallel, which provides the parallel_apply() used below.
pandarallel.initialize()
start_time = datetime.datetime.now()
# Replace each row's cleaned text with the value tokenize_text returns for it.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time would hand tokenize_text its own earlier output instead of the
# cleaned string.
good_reviews_df["ProcessedText"] = good_reviews_df.parallel_apply(tokenize_text,axis=1)
end_time = datetime.datetime.now()
# Report the wall-clock time the tokenization took.
print("time spent doing operation is {}".format(end_time - start_time))

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
time spent doing operation is 0:00:05.134558


In [42]:
good_reviews_df.loc[0,"ProcessedText"]

['i',
 'have',
 'bought',
 'several',
 'of',
 'the',
 'vitality',
 'canned',
 'dog',
 'food',
 'and',
 'have',
 'found',
 'them',
 'all',
 'to',
 'be',
 'of',
 'good',
 'quality',
 'the',
 'product',
 'more',
 'like',
 'a',
 'stew',
 'than',
 'a',
 'meat',
 'and',
 'it',
 'better',
 'my',
 'is',
 'and',
 'she',
 'this',
 'product',
 'better',
 'than',
 'most']

In [43]:
#for i,item in enumerate(stuff_tokenized):
#    if i == 10:
#        break
#    print(i,"-->",item,"\n")

In [44]:
wordNetLemmatizer = WordNetLemmatizer()

In [45]:
#lemmatize each word in stuff
def calc_partofspeech(raw_pos):
    """Translate a tag generated by pos_tag() into the one-letter POS code passed to WordNetLemmatizer.lemmatize().

    The tag is matched by substring, in this priority order:
    "JJ" -> "a", "RB" -> "r", "VB" -> "v". Any other tag yields "n".
    """
    # Ordered (tag fragment, lemmatizer code) pairs; the first fragment found wins.
    fragment_to_code = (("JJ", "a"), ("RB", "r"), ("VB", "v"))

    for tag_fragment, lemmatizer_code in fragment_to_code:
        if tag_fragment in raw_pos:
            return lemmatizer_code

    # Fallback for every tag that matched none of the fragments above.
    return "n"




In [46]:
#for i,item in enumerate(stuff):
#   if i == 10:
#        break
#   print(i,"-->",item,"\n")

In [47]:
#stuff_tokenized[0]

In [48]:
def lemmatize_it(row):
    """Lemmatize every token of one review.

    The token list stored under "ProcessedText" in `row` is tagged with
    pos_tag(); each token is then lemmatized by the module-level
    wordNetLemmatizer, with calc_partofspeech() translating the token's tag.
    Returns the lemmas as a list, in the original token order.
    """
    tagged_tokens = pos_tag(row["ProcessedText"])

    return [
        wordNetLemmatizer.lemmatize(token, calc_partofspeech(tag))
        for token, tag in tagged_tokens
    ]
        


In [49]:
# Initialise pandarallel, which provides the parallel_apply() used below.
pandarallel.initialize()
start_time = datetime.datetime.now()
# Replace each row's token list with the list of lemmas returned by lemmatize_it.
# The "ProcessedText" column is overwritten in place.
good_reviews_df["ProcessedText"] = good_reviews_df.parallel_apply(lemmatize_it,axis=1)
end_time = datetime.datetime.now()
# Report the wall-clock time the lemmatization took.
print("time spent doing operation is {}".format(end_time - start_time))

INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
time spent doing operation is 0:00:41.067128


lemmatizing to cut down on the size of the dimensions of the eventual vectorized word matrix without losing meaning of the text

In [50]:
good_reviews_df.loc[0,"ProcessedText"] 

['i',
 'have',
 'buy',
 'several',
 'of',
 'the',
 'vitality',
 'can',
 'dog',
 'food',
 'and',
 'have',
 'find',
 'them',
 'all',
 'to',
 'be',
 'of',
 'good',
 'quality',
 'the',
 'product',
 'more',
 'like',
 'a',
 'stew',
 'than',
 'a',
 'meat',
 'and',
 'it',
 'good',
 'my',
 'be',
 'and',
 'she',
 'this',
 'product',
 'good',
 'than',
 'most']

In [51]:
#Test code doesn't need to be run
#myword = "running"
#myword_lemmed = wordNetLemmatizer.lemmatize(myword,calc_partofspeech(myword))
#myword_lemmed = wordNetLemmatizer.lemmatize(myword,"v")

#print("myword_lemmed = ",myword_lemmed)
#print("POS for myword = ",calc_partofspeech(myword))

In [52]:
#stuff_lemmatized[0]

In [53]:
# I only want adjectives and adverbs sent to the vectorizer
#stuff_pos_filtered = [mylemword for mylemdoc in stuff_lemmatized for mylemword in mylemdoc if ("JJ" in pos_tag(word_tokenize(mylemword))[0][1]) or ("RB" in pos_tag(word_tokenize(mylemword))[0][1])]

In [54]:
#stuff_pos_filtered[6]

In [55]:
# I only want adjectives and adverbs sent to the vectorizer

def filter_pos(row, pos_abbrev=("JJ",), tagger=None):
    """Keep only the words of one review whose part-of-speech tag matches `pos_abbrev`.

    The whole token list is tagged in a single call because the tagger is
    given the full document rather than isolated words.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the review's list of tokens under the key "ProcessedText".
    pos_abbrev : str or iterable of str, optional
        POS codes to keep, abbreviated to the first two characters of the
        pos_tag() code (e.g. "JJ", "RB"). A code matches a tag when it occurs
        anywhere inside it, so "RB" also matches "WRB". A bare string is
        treated as a single code. Defaults to ("JJ",).
    tagger : callable, optional
        Function taking the token list and returning (word, tag) pairs.
        Defaults to NLTK's pos_tag().

    Returns
    -------
    str
        The kept words, in their original order, joined by single spaces.
        Each word is kept at most once per occurrence, even if its tag
        matches several of the requested codes.

    Raises
    ------
    TypeError
        If `pos_abbrev` is neither a string nor an iterable. (Previously this
        printed a message and returned the integer 1, which silently put
        non-text values into the column.)
    """
    if isinstance(pos_abbrev, str):
        wanted_codes = (pos_abbrev,)
    else:
        try:
            wanted_codes = tuple(pos_abbrev)
        except TypeError:
            raise TypeError(
                "post_abbrev must be a list of POS codes (1st 2 characters of the code only)"
            ) from None

    if tagger is None:
        tagger = pos_tag

    pos_result = tagger(row["ProcessedText"])

    kept_words = [
        result_word
        for result_word, result_pos in pos_result
        if any(mypos in result_pos for mypos in wanted_codes)
    ]

    return " ".join(kept_words)


In [56]:
#stuff_pos_filtered = filter_pos(stuff_lemmatized,pos_abbrev=["JJ","RB"])
#stuff_pos_filtered
#stuff_pos_filtered = stuff_pos_filtered + filter_pos(stuff_lemmatized,"RB")
# Initialise pandarallel, which provides the parallel_apply() used below.
pandarallel.initialize()
start_time = datetime.datetime.now()
# Replace each row's list of lemmas with the space-separated string of words
# that filter_pos keeps for the "JJ" and "RB" codes.
# The "ProcessedText" column is overwritten in place.
good_reviews_df["ProcessedText"] = good_reviews_df.parallel_apply(filter_pos,axis=1,pos_abbrev=["JJ","RB"])
end_time = datetime.datetime.now()
# Report the wall-clock time the filtering took.
print("time spent doing operation is {}".format(end_time - start_time))


INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
time spent doing operation is 0:00:18.966132


In [57]:
good_reviews_df.loc[0,"ProcessedText"]

'several good more good good most'

In [58]:
#stuff_lemmatized[0]

Get the nouns

In [59]:
#stuff_nouns = filter_pos(stuff_lemmatized,pos_abbrev=["NN"])

In [60]:
#stuff_nouns[0:3]

In [61]:
#stuff_pos_filtered[0:5]

In [62]:
#len(stuff_pos_filtered)

In [63]:
good_reviews_df.shape

(152586, 12)

## Word Vectorization / Parsing

Will do the following here:

```

(lemmatized text)-->[WordVectorization(dropping stop word)] --> [POS tagging] --> [collect Adjectives]


```


In [64]:
vectorizer = CountVectorizer(stop_words="english")
docTermMatrixCountVec = vectorizer.fit_transform(good_reviews_df["ProcessedText"])

In [65]:
len(vectorizer.get_feature_names())

12149

In [67]:
# Preview only the first few rows. Densifying the whole 152586 x 12149
# document-term matrix just to display it would allocate a dense array of
# roughly 1.85 billion cells.
docTermMatrixCountVec[:5].toarray()

  and should_run_async(code)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [68]:
docTermMatrixCountVec.shape

  and should_run_async(code)


(152586, 12149)

In [69]:

# Build the labelled document-term frame straight from the sparse matrix.
# The index (the processed review text) and the columns (the vocabulary) are
# the same as before, but the values stay in sparse storage instead of being
# expanded into a dense array of every (document, term) cell.
docTermMatrixCountVec_df = pd.DataFrame.sparse.from_spmatrix(docTermMatrixCountVec, index=good_reviews_df["ProcessedText"], columns=vectorizer.get_feature_names())

  and should_run_async(code)


In [70]:
docTermMatrixCountVec_df.head()

  and should_run_async(code)


Unnamed: 0_level_0,aa,abandon,abandoned,abb,abbey,abdominal,abhor,abide,abiding,abject,...,zero,zest,zestful,zesty,zinc,zip,zipper,zippy,zombie,zucchini
ProcessedText,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
several good more good good most,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
few light tiny then liberally powdered tiny not too chewy very flavorful highly familiar,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
right now mostly just i too,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
just unique hot once u totally when simply anywhere ecstatic hot really hot tastelessly tequila just once never other personal incredible,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
other up high where only skinny high really,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
docTermMatrixCountVec_df.columns

  and should_run_async(code)


Index(['aa', 'abandon', 'abandoned', 'abb', 'abbey', 'abdominal', 'abhor',
       'abide', 'abiding', 'abject',
       ...
       'zero', 'zest', 'zestful', 'zesty', 'zinc', 'zip', 'zipper', 'zippy',
       'zombie', 'zucchini'],
      dtype='object', length=12149)

In [77]:
LD_Allocator = LatentDirichletAllocation(n_components=20, random_state=0,n_jobs=8)


  and should_run_async(code)


In [78]:
start_time = datetime.datetime.now()
LD_Allocator.fit(docTermMatrixCountVec)
end_time = datetime.datetime.now()
print("time spent doing operation is {}".format(end_time - start_time))

  and should_run_async(code)


time spent doing operation is 0:01:29.608129


## Topics via LDA

## LDA on "Good" Reviews

In [79]:
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(LD_Allocator, docTermMatrixCountVec, vectorizer)

  and should_run_async(code)
