## Introduction & Cleaning

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("Reviews_Cut.csv")

In [2]:
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [3]:
# Lowercase the text values in 'Text'

df['Text'] = df['Text'].str.lower()
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,product arrived labeled as jumbo salted peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",this is a confection that has been around a fe...


In [4]:
# Remove http syntax and line breaks

import re
def clean(s):
    s = re.sub(r'<br */*>', "\n", s)
    s = re.sub(r'<a href=', "", s)
    s = re.sub(r'\"*https*://[^\)]*\"*', "", s)
    s = re.sub(r'-', "", s)
    s = re.sub(r'&#34;', "'", s)
    return str(s)

In [5]:
# Apply the clean code function to the dataframe: 1) to the Text portion of the review, and 2) to the Profile Name

df["selftext_clean"] = ''
for i, row in df.iterrows():
    df.at[i, "selftext_clean"] = clean(row.Text)
    

df["ProfileClean"] = ''
for i, row in df.iterrows():
    df.at[i, "ProfileClean"] = clean(row.ProfileName)

In [6]:
pd.set_option('display.max_colwidth', None)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,selftext_clean,ProfileClean
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,delmartian
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,"product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".","product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".",dll pa
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" this is the treat that seduces edmund into selling out his brother and sisters to the witch.","Natalia Corres ""Natalia Corres"""


In [7]:
# Drop the old Profile Name

df = df.drop('ProfileName', axis=1)
df.head(3)

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,selftext_clean,ProfileClean
0,1,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,1303862400,Good Quality Dog Food,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,delmartian
1,2,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,1346976000,Not as Advertised,"product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".","product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".",dll pa
2,3,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,1219017600,"""Delight"" says it all","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" this is the treat that seduces edmund into selling out his brother and sisters to the witch.","Natalia Corres ""Natalia Corres"""


In [8]:
# Add the new Profile Name

df = df.rename(columns={'ProfileClean':'ProfileName'})
df.head(3)

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,selftext_clean,ProfileName
0,1,B001E4KFG0,A3SGXH7AUHU8GW,1,1,5,1303862400,Good Quality Dog Food,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,i have bought several of the vitality canned dog food products and have found them all to be of good quality. the product looks more like a stew than a processed meat and it smells better. my labrador is finicky and she appreciates this product better than most.,delmartian
1,2,B00813GRG4,A1D87F6ZCVE5NK,0,0,1,1346976000,Not as Advertised,"product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".","product arrived labeled as jumbo salted peanuts...the peanuts were actually small sized unsalted. not sure if this was an error or if the vendor intended to represent the product as ""jumbo"".",dll pa
2,3,B000LQOCH0,ABXLMWJIXXAIN,1,1,4,1219017600,"""Delight"" says it all","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts - in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" - this is the treat that seduces edmund into selling out his brother and sisters to the witch.","this is a confection that has been around a few centuries. it is a light, pillowy citrus gelatin with nuts in this case filberts. and it is cut into tiny squares and then liberally coated with powdered sugar. and it is a tiny mouthful of heaven. not too chewy, and very flavorful. i highly recommend this yummy treat. if you are familiar with the story of c.s. lewis' ""the lion, the witch, and the wardrobe"" this is the treat that seduces edmund into selling out his brother and sisters to the witch.","Natalia Corres ""Natalia Corres"""


### Introducing a new cleaning function to remove some of the brand-names and product-names.
#### for Instance, kcups and keurig were common and we wanted to see if removing these or generalizing them made the topics more insightful

In [9]:
import re
def product_clean(s):
    s = re.sub(r'kcups', "brand_name", s)
    s = re.sub(r'kcup', "brand_name", s)
    s = re.sub(r'keurig', "brand_name", s)
    s = re.sub(r'wolfgang puck', "product_name", s)
    s = re.sub(r"timmothy's", "brand_name", s)
    s = re.sub(r"emeril's", "brand_name", s)
    s = re.sub(r"coffees", "product_name", s)
    s = re.sub(r"coffee", "product_name", s)
    #s = re.sub(r'-', "", s)
    #s = re.sub(r'-', "", s)
    return str(s)

In [10]:
df["product_clean"] = ''
for i, row in df.iterrows():
    df.at[i, "product_clean"] = product_clean(row.selftext_clean)
    
df[df['ProductId']=='B006N3IG4K'].head(3)

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,selftext_clean,ProfileName,product_clean
8993,8994,B006N3IG4K,A5U24IWH64IFF,1,1,5,1302480000,Absolutely delicious coffee!,"i am a huge coffee drinker, and love the k-cups for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong coffee first thing in the morning. wolfgang puck french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of coffee. i would highly recommend to all bold coffee drinkers!","i am a huge coffee drinker, and love the kcups for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong coffee first thing in the morning. wolfgang puck french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of coffee. i would highly recommend to all bold coffee drinkers!",Kimdoll,"i am a huge product_name drinker, and love the brand_name for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong product_name first thing in the morning. product_name french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of product_name. i would highly recommend to all bold product_name drinkers!"
8994,8995,B006N3IG4K,A1RVCWFP3SC3GU,1,1,5,1300233600,YUMMY,we always drink timmothy's or emeril's. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my coffee on the milder side so i can even get 2 cups from 1 kcup.,we always drink timmothy's or emeril's. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my coffee on the milder side so i can even get 2 cups from 1 kcup.,Cakediva,we always drink brand_name or brand_name. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my product_name on the milder side so i can even get 2 cups from 1 brand_name.
8995,8996,B006N3IG4K,A12SO47JRQGUPR,1,1,4,1299801600,Good Stuff,good subtle flavored coffee for the mid-afternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the keurig cup can make it.,good subtle flavored coffee for the midafternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the keurig cup can make it.,"Thomas Smith ""tjaye""",good subtle flavored product_name for the midafternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the brand_name cup can make it.


## Feature Engineering using BERT

### This is where we begin to tokenize the dataframe, using spaCy

In [11]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [12]:
for i, row in df.iterrows():
    if i % 1000 == 0:  # this will act as a progress tracker, each 1,000 records will be marked as completed when it is printed, e.g. 1,000 rows completed when 1,000 is printed             
        print(i)
    if(row["product_clean"] and len(str(row["product_clean"])) <500000):
        doc = nlp(str(row["product_clean"]))
        adjectives = []
        nouns = []
        propn = []
        verbs = []
        lemmas = []

        for token in doc:
            lemmas.append(token.lemma_)
            if token.pos_ == "ADJ":
                adjectives.append(token.lemma_)
            if token.pos_ == "NOUN":
                nouns.append(token.lemma_)
            if token.pos_ == "PROPN":
                propn.append(token.lemma_)
            if token.pos_ == "VERB":
                verbs.append(token.lemma_)
                
        df.at[i, "selftext_lemma"] = " ".join(lemmas)
        df.at[i, "selftext_nouns"] = " ".join(nouns)
        df.at[i, "selftext_propn"] = " ".join(propn)
        df.at[i, "selftext_adjectives"] = " ".join(adjectives)
        df.at[i, "selftext_verbs"] = " ".join(verbs)
        df.at[i, "selftext_npav"] = " ".join(nouns+propn+adjectives+verbs)
        df.at[i, "no_tokens"] = len(lemmas)

0


  df.at[i, "selftext_lemma"] = " ".join(lemmas)
  df.at[i, "selftext_nouns"] = " ".join(nouns)
  df.at[i, "selftext_propn"] = " ".join(propn)
  df.at[i, "selftext_adjectives"] = " ".join(adjectives)
  df.at[i, "selftext_verbs"] = " ".join(verbs)
  df.at[i, "selftext_npav"] = " ".join(nouns+propn+adjectives+verbs)


1000
2000
3000
4000
5000
6000
7000
8000
9000


In [13]:
df[df['ProductId']=='B006N3IG4K'].head(10)

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,selftext_clean,ProfileName,product_clean,selftext_lemma,selftext_nouns,selftext_propn,selftext_adjectives,selftext_verbs,selftext_npav,no_tokens
8993,8994,B006N3IG4K,A5U24IWH64IFF,1,1,5,1302480000,Absolutely delicious coffee!,"i am a huge coffee drinker, and love the k-cups for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong coffee first thing in the morning. wolfgang puck french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of coffee. i would highly recommend to all bold coffee drinkers!","i am a huge coffee drinker, and love the kcups for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong coffee first thing in the morning. wolfgang puck french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of coffee. i would highly recommend to all bold coffee drinkers!",Kimdoll,"i am a huge product_name drinker, and love the brand_name for making one cup quickly before leaving for work in the morning. while there are many choices, i prefer a bold, strong product_name first thing in the morning. product_name french roast is absolutely delicious, strong and not at all bitter, and makes a great first cup of product_name. i would highly recommend to all bold product_name drinkers!","I be a huge product_name drinker , and love the brand_name for make one cup quickly before leave for work in the morning . while there be many choice , I prefer a bold , strong product_name first thing in the morning . product_name french roast be absolutely delicious , strong and not at all bitter , and make a great first cup of product_name . I would highly recommend to all bold product_name drinker !",product_name drinker brand_name cup work morning choice product_name thing morning roast cup product_name drinker,product_name,huge many bold strong first french delicious strong bitter great first bold,love make leave be prefer make recommend,product_name drinker brand_name cup work morning choice product_name thing morning roast cup product_name drinker product_name huge many bold strong first french delicious strong bitter great first bold love make leave be prefer make recommend,79.0
8994,8995,B006N3IG4K,A1RVCWFP3SC3GU,1,1,5,1300233600,YUMMY,we always drink timmothy's or emeril's. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my coffee on the milder side so i can even get 2 cups from 1 kcup.,we always drink timmothy's or emeril's. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my coffee on the milder side so i can even get 2 cups from 1 kcup.,Cakediva,we always drink brand_name or brand_name. thought we would try something different. not too many decafs to choose from. we were pleasantly surprised that this had a great taste with a full bodied flavor. i like my product_name on the milder side so i can even get 2 cups from 1 brand_name.,we always drink brand_name or brand_name . think we would try something different . not too many decafs to choose from . we be pleasantly surprised that this have a great taste with a full bodied flavor . I like my product_name on the milder side so I can even get 2 cup from 1 brand_name .,brand_name decafs taste flavor product_name milder side cup brand_name,brand_name,different many surprised great full bodied,drink think try choose have like get,brand_name decafs taste flavor product_name milder side cup brand_name brand_name different many surprised great full bodied drink think try choose have like get,57.0
8995,8996,B006N3IG4K,A12SO47JRQGUPR,1,1,4,1299801600,Good Stuff,good subtle flavored coffee for the mid-afternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the keurig cup can make it.,good subtle flavored coffee for the midafternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the keurig cup can make it.,"Thomas Smith ""tjaye""",good subtle flavored product_name for the midafternoon pick me up. a bit of sweetness without being overdone. wish it was available in whole bean as i like it stronger then the brand_name cup can make it.,good subtle flavor product_name for the midafternoon pick I up . a bit of sweetness without be overdone . wish it be available in whole bean as I like it strong then the brand_name cup can make it .,product_name bit sweetness bean brand_name cup,,good subtle midafternoon overdone available whole strong,flavor pick wish like make,product_name bit sweetness bean brand_name cup good subtle midafternoon overdone available whole strong flavor pick wish like make,39.0
8996,8997,B006N3IG4K,A86RUZGD22FDR,1,1,5,1299110400,Delivers what was promises,"wolfgang puck's chef's reserve colombian (dark roast) is basically what they said it will be. dark roast, but not too strong. acidity is on a low side, but with plenty of pleasant taste. at the same time you can drink this coffee black w/o the need for the sweetener and/or milk. this coffee is in european tradition and will please many people. i would recommend it after dinner, but not if you need a jolt to wake you up.","wolfgang puck's chef's reserve colombian (dark roast) is basically what they said it will be. dark roast, but not too strong. acidity is on a low side, but with plenty of pleasant taste. at the same time you can drink this coffee black w/o the need for the sweetener and/or milk. this coffee is in european tradition and will please many people. i would recommend it after dinner, but not if you need a jolt to wake you up.",Another coffee drinker,"product_name's chef's reserve colombian (dark roast) is basically what they said it will be. dark roast, but not too strong. acidity is on a low side, but with plenty of pleasant taste. at the same time you can drink this product_name black w/o the need for the sweetener and/or milk. this product_name is in european tradition and will please many people. i would recommend it after dinner, but not if you need a jolt to wake you up.","product_name 's chef 's reserve colombian ( dark roast ) be basically what they say it will be . dark roast , but not too strong . acidity be on a low side , but with plenty of pleasant taste . at the same time you can drink this product_name black w/o the need for the sweetener and/or milk . this product_name be in european tradition and will please many people . I would recommend it after dinner , but not if you need a jolt to wake you up .",chef roast roast acidity side plenty taste time product_name need sweetener milk product_name tradition people dinner jolt,product_name reserve colombian dark,dark strong low pleasant same black european many,say drink please recommend need wake,chef roast roast acidity side plenty taste time product_name need sweetener milk product_name tradition people dinner jolt product_name reserve colombian dark dark strong low pleasant same black european many say drink please recommend need wake,91.0
8997,8998,B006N3IG4K,A3LQS08K2OBI5Z,1,1,3,1298332800,"My mom said this was not as flavorful as other ""Extra Bolds""","this is the most useful info you will ever need to know about k-cups, and one that took me forever to discover:<br /><br />bold/extra bold coffee in terms of k-cups does not mean that the coffee itself is stronger, it only means that they included more grind in the cups to provide you with a stronger coffee.<br /><br />there! now you know.<br /><br />my mom goes for the extra bolds only because she has been addicted to caffeine for over 40 years now, and she's frugal and thinks this gives her more back for her buck (or rather mine, since i buy it for her).<br /><br />unfortunately, out of all the extra bold k-cups she's tried, she claims that the wolfgang puck is the weakest tasting. she's tried it in both the medium & small serving sizes format, and both have produced less-than-full-flavor drinks.<br /><br />if you are looking for a rich, flavorful, strong-tasting coffee, try the <a href=""http://www.amazon.com/gp/product/b001hkzz7a"">van houtte cafe honduras medium roast, extra bold coffee, 24-count k-cups for keurig brewers (pack of 2)</a>.","this is the most useful info you will ever need to know about kcups, and one that took me forever to discover:\n\nbold/extra bold coffee in terms of kcups does not mean that the coffee itself is stronger, it only means that they included more grind in the cups to provide you with a stronger coffee.\n\nthere! now you know.\n\nmy mom goes for the extra bolds only because she has been addicted to caffeine for over 40 years now, and she's frugal and thinks this gives her more back for her buck (or rather mine, since i buy it for her).\n\nunfortunately, out of all the extra bold kcups she's tried, she claims that the wolfgang puck is the weakest tasting. she's tried it in both the medium & small serving sizes format, and both have produced lessthanfullflavor drinks.\n\nif you are looking for a rich, flavorful, strongtasting coffee, try the )</a>.",J. Lim,"this is the most useful info you will ever need to know about brand_name, and one that took me forever to discover:\n\nbold/extra bold product_name in terms of brand_name does not mean that the product_name itself is stronger, it only means that they included more grind in the cups to provide you with a stronger product_name.\n\nthere! now you know.\n\nmy mom goes for the extra bolds only because she has been addicted to caffeine for over 40 years now, and she's frugal and thinks this gives her more back for her buck (or rather mine, since i buy it for her).\n\nunfortunately, out of all the extra bold brand_name she's tried, she claims that the product_name is the weakest tasting. she's tried it in both the medium & small serving sizes format, and both have produced lessthanfullflavor drinks.\n\nif you are looking for a rich, flavorful, strongtasting product_name, try the )</a>.","this be the most useful info you will ever need to know about brand_name , and one that take I forever to discover : \n\n bold / extra bold product_name in term of brand_name do not mean that the product_name itself be strong , it only mean that they include more grind in the cup to provide you with a strong product_name . \n\n there ! now you know . \n\n my mom go for the extra bold only because she have be addict to caffeine for over 40 year now , and she be frugal and think this give she more back for her buck ( or rather mine , since I buy it for she ) . \n\n unfortunately , out of all the extra bold brand_name she be try , she claim that the product_name be the weak tasting . she be try it in both the medium & small serve size format , and both have produce lessthanfullflavor drink . \n\n if you be look for a rich , flavorful , strongtaste product_name , try the ) < /a > .",info product_name term brand_name product_name grind cup product_name mom bold caffeine year buck brand_name product_name tasting size format lessthanfullflavor drink product_name,brand_name,useful bold bold strong more strong extra frugal extra bold weak medium small rich flavorful,need know take discover mean mean include provide know go addict think give buy try claim try serve produce look strongtaste try,info product_name term brand_name product_name grind cup product_name mom bold caffeine year buck brand_name product_name tasting size format lessthanfullflavor drink product_name brand_name useful bold bold strong more strong extra frugal extra bold weak medium small rich flavorful need know take discover mean mean include provide know go addict think give buy try claim try serve produce look strongtaste try,185.0
8998,8999,B006N3IG4K,A4L2ALOJ53R96,1,1,2,1297728000,"Weak, even on small cup setting",this coffee brews very weak - like a light roast. w.p. chef's reserve pods are just over half full. put pod on scale and it's 15% less coffee wgt than normal pods. stick with normal pods or extra bold if you like coffee that taste like coffee.,this coffee brews very weak like a light roast. w.p. chef's reserve pods are just over half full. put pod on scale and it's 15% less coffee wgt than normal pods. stick with normal pods or extra bold if you like coffee that taste like coffee.,Squib,this product_name brews very weak like a light roast. w.p. chef's reserve pods are just over half full. put pod on scale and it's 15% less product_name wgt than normal pods. stick with normal pods or extra bold if you like product_name that taste like product_name.,this product_name brew very weak like a light roast . w.p . chef 's reserve pod be just over half full . put pod on scale and it be 15 % less product_name wgt than normal pod . stick with normal pod or extra bold if you like product_name that taste like product_name .,product_name brew roast reserve pod half pod scale % wgt pod pod product_name taste,w.p . chef product_name,weak light full less product_name normal normal bold,put stick like,product_name brew roast reserve pod half pod scale % wgt pod pod product_name taste w.p . chef product_name weak light full less product_name normal normal bold put stick like,58.0
8999,9000,B006N3IG4K,AQ6SHOW0VMZGF,1,1,2,1297382400,Really Disappointed,"i wanted to like this coffee, i did. we've tried other wolfgang puck k-cups and have been pleasantly surprised. this just did not impress us. bitter aftertaste, and it has an almost sickeningly vanilla flavor. smells great while brewing, but just does not live up to it in the cup. i'm really disappointed because i now have a box and a half of this product that we really do not want to drink! great price though.","i wanted to like this coffee, i did. we've tried other wolfgang puck kcups and have been pleasantly surprised. this just did not impress us. bitter aftertaste, and it has an almost sickeningly vanilla flavor. smells great while brewing, but just does not live up to it in the cup. i'm really disappointed because i now have a box and a half of this product that we really do not want to drink! great price though.","K. Padgett ""familyof5""","i wanted to like this product_name, i did. we've tried other product_name brand_name and have been pleasantly surprised. this just did not impress us. bitter aftertaste, and it has an almost sickeningly vanilla flavor. smells great while brewing, but just does not live up to it in the cup. i'm really disappointed because i now have a box and a half of this product that we really do not want to drink! great price though.","I want to like this product_name , I do . we 've try other product_name brand_name and have be pleasantly surprised . this just do not impress we . bitter aftertaste , and it have an almost sickeningly vanilla flavor . smell great while brewing , but just do not live up to it in the cup . I be really disappointed because I now have a box and a half of this product that we really do not want to drink ! great price though .",product_name product_name brand_name aftertaste vanilla flavor brewing cup box half product price,,other surprised bitter great disappointed great,want like do try impress have smell live have want drink,product_name product_name brand_name aftertaste vanilla flavor brewing cup box half product price other surprised bitter great disappointed great want like do try impress have smell live have want drink,87.0
9000,9001,B006N3IG4K,A1YKQFG1VU6AZK,1,1,5,1297209600,Drink The Spirit of Aloha!,"i purchased these for my mom and she loves them! she does not like bold coffee flavors, but is more into the sweet and more flavorful varieties. this one combines the taste of the islands with a smooth, nutty flavor. amazon had a great price on these and this is something that i can actually recommend!","i purchased these for my mom and she loves them! she does not like bold coffee flavors, but is more into the sweet and more flavorful varieties. this one combines the taste of the islands with a smooth, nutty flavor. amazon had a great price on these and this is something that i can actually recommend!",D. Prescott,"i purchased these for my mom and she loves them! she does not like bold product_name flavors, but is more into the sweet and more flavorful varieties. this one combines the taste of the islands with a smooth, nutty flavor. amazon had a great price on these and this is something that i can actually recommend!","I purchase these for my mom and she love they ! she do not like bold product_name flavor , but be more into the sweet and more flavorful variety . this one combine the taste of the island with a smooth , nutty flavor . amazon have a great price on these and this be something that I can actually recommend !",mom product_name flavor variety one taste island flavor price,amazon,bold sweet flavorful smooth nutty great,purchase love like combine have recommend,mom product_name flavor variety one taste island flavor price amazon bold sweet flavorful smooth nutty great purchase love like combine have recommend,65.0
9001,9002,B006N3IG4K,A1KE7XXXLYIXL,1,1,1,1296864000,"Great taste, lousy cup","this is a good decaf coffee, but i've never before had a k cup dump the contents into my cup. it's extremely irritating to deal with grounds in the cup and machine when deliberately choosing a system that avoids the mess.","this is a good decaf coffee, but i've never before had a k cup dump the contents into my cup. it's extremely irritating to deal with grounds in the cup and machine when deliberately choosing a system that avoids the mess.",Patricia,"this is a good decaf product_name, but i've never before had a k cup dump the contents into my cup. it's extremely irritating to deal with grounds in the cup and machine when deliberately choosing a system that avoids the mess.","this be a good decaf product_name , but I 've never before have a k cup dump the content into my cup . it be extremely irritate to deal with ground in the cup and machine when deliberately choose a system that avoid the mess .",decaf cup content cup ground cup machine system mess,product_name k,good,have dump irritate deal choose avoid,decaf cup content cup ground cup machine system mess product_name k good have dump irritate deal choose avoid,46.0
9002,9003,B006N3IG4K,A3TQW5KBBPZHNC,1,1,2,1296518400,"If you want a flavored coffee, okay... but otherwise YUK",warning: this is a strong flavor.<br /><br />i thought that this would be blue mountain-y... oh no. it is like... rum flavored or something. yuk.,warning: this is a strong flavor.\n\ni thought that this would be blue mountainy... oh no. it is like... rum flavored or something. yuk.,"K. E Pearce ""kpearce""",warning: this is a strong flavor.\n\ni thought that this would be blue mountainy... oh no. it is like... rum flavored or something. yuk.,warn : this be a strong flavor . \n\n I think that this would be blue mountainy ... oh no . it be like ... rum flavor or something . yuk .,flavor mountainy rum,,strong blue,warn think flavor,flavor mountainy rum strong blue warn think flavor,32.0


## Topic Modeling using LDA

### Installing a visualization package called pyLDAvis == python LDA visualization == LDA is Latent Dirichlet Allocation

In [14]:
!pip install pyLDAvis



### Importing and making sure we have the right packages, libraries, modules

In [15]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [16]:
nlp = spacy.load('en_core_web_sm')
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/markjones/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Adding a couple of stop words to the NLTK stopwords corpus

In [17]:
# NLTK Stop words
from nltk.corpus import stopwords

new_words = ['product_name','brand_name']



stop_words = stopwords.words('english')+ new_words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Section for Controls of the below LDA Model
#### Creating some variables that can be used to fine-tune the model or look at different segments of the reviews.

In [18]:
# introduce a variable so we don't have to hard-code in functions

product = 'B006N3IG4K'

# introduce a rating variable (create 2 for different purposes)

great_rating = 4
average_rating = 3

# introduce a helpfulness variable (need to create a new field / column in the dataframe)

df['helpfulness_pct'] = df['HelpfulnessNumerator']//df['HelpfulnessDenominator']

very_helpful = 0.8
average_helpful = 0.6

# Number of Topics Variable

nbr_topics = 20

#### Some pre-defined filters we can use

In [19]:
# some different options to filter down the dataframe here

# looking at the most helpful reviews for a given product
# df1 = df[(df['ProductId']==product) & (df['helpfulness_pct']>=very_helpful)]

# looking at the most helpful reviews for a given product and for the ones that are 4-stars and above
df1 = df[(df['ProductId']==product) & (df['Score']>=great_rating) & (df['helpfulness_pct']>=very_helpful)]

# looking at the most helpful reviews for ALL products and for reviews with 4-stars and above
# df1 = df[(df['Score']>=great_rating) & (df['helpfulness_pct']>=very_helpful)]

In [20]:
# Convert to list
data = df1.selftext_lemma.values.tolist()

In [21]:
pprint(data[:1])

['I be a huge product_name drinker , and love the brand_name for make one cup '
 'quickly before leave for work in the morning .   while there be many choice '
 ', I prefer a bold , strong product_name first thing in the morning .   '
 'product_name french roast be absolutely delicious , strong and not at all '
 'bitter , and make a great first cup of product_name .   I would highly '
 'recommend to all bold product_name drinker !']


In [22]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['be', 'huge', 'product_name', 'drinker', 'and', 'love', 'the', 'brand_name', 'for', 'make', 'one', 'cup', 'quickly', 'before', 'leave', 'for', 'work', 'in', 'the', 'morning', 'while', 'there', 'be', 'many', 'choice', 'prefer', 'bold', 'strong', 'product_name', 'first', 'thing', 'in', 'the', 'morning', 'product_name', 'french', 'roast', 'be', 'absolutely', 'delicious', 'strong', 'and', 'not', 'at', 'all', 'bitter', 'and', 'make', 'great', 'first', 'cup', 'of', 'product_name', 'would', 'highly', 'recommend', 'to', 'all', 'bold', 'product_name', 'drinker']]


In [23]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['be', 'huge', 'product_name', 'drinker', 'and', 'love', 'the', 'brand_name', 'for', 'make', 'one', 'cup', 'quickly', 'before', 'leave', 'for', 'work', 'in', 'the', 'morning', 'while', 'there', 'be', 'many', 'choice', 'prefer', 'bold', 'strong', 'product_name', 'first', 'thing', 'in', 'the', 'morning', 'product_name', 'french', 'roast', 'be', 'absolutely', 'delicious', 'strong', 'and', 'not', 'at', 'all', 'bitter', 'and', 'make', 'great', 'first', 'cup', 'of', 'product_name', 'would', 'highly', 'recommend', 'to', 'all', 'bold', 'product_name', 'drinker']


In [24]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [25]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['huge', 'drinker', 'love', 'make', 'cup', 'quickly', 'leave', 'work', 'morning', 'many', 'choice', 'prefer', 'bold', 'strong', 'first', 'thing', 'morning', 'french', 'roast', 'absolutely', 'delicious', 'strong', 'bitter', 'make', 'great', 'first', 'cup', 'highly', 'recommend', 'bold', 'drinker']]


### Taking the lemmatize tokens and beginning to aggregate them based on frequency

In [26]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 2), (5, 1), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1)]]


In [27]:
id2word[0]

'absolutely'

In [28]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('absolutely', 1),
  ('bitter', 1),
  ('bold', 2),
  ('choice', 1),
  ('cup', 2),
  ('delicious', 1),
  ('drinker', 2),
  ('first', 2),
  ('french', 1),
  ('great', 1),
  ('highly', 1),
  ('huge', 1),
  ('leave', 1),
  ('love', 1),
  ('make', 2),
  ('many', 1),
  ('morning', 2),
  ('prefer', 1),
  ('quickly', 1),
  ('recommend', 1),
  ('roast', 1),
  ('strong', 2),
  ('thing', 1),
  ('work', 1)]]

## Building the LDA Topic Model

In [29]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=nbr_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [30]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.036*"flavor" + 0.024*"love" + 0.024*"taste" + 0.018*"brand" + '
  '0.018*"green" + 0.018*"try" + 0.018*"variety" + 0.018*"add" + 0.018*"cup" + '
  '0.012*"many"'),
 (1,
  '0.027*"cup" + 0.023*"flavor" + 0.019*"taste" + 0.015*"love" + 0.015*"find" '
  '+ 0.015*"try" + 0.015*"buy" + 0.012*"think" + 0.012*"perfect" + '
  '0.012*"favorite"'),
 (2,
  '0.047*"strength" + 0.032*"many" + 0.032*"fuel" + 0.032*"jet" + '
  '0.032*"bitter" + 0.016*"brand" + 0.016*"flavor" + 0.016*"bold" + '
  '0.016*"mediumcup" + 0.016*"read"'),
 (3,
  '0.040*"flavor" + 0.027*"coconut" + 0.027*"cup" + 0.014*"blend" + '
  '0.014*"think" + 0.014*"light" + 0.014*"go" + 0.014*"great" + 0.014*"smooth" '
  '+ 0.014*"brew"'),
 (4,
  '0.029*"nice" + 0.019*"smooth" + 0.019*"much" + 0.019*"still" + 0.019*"cup" '
  '+ 0.019*"hazelnut" + 0.019*"brew" + 0.019*"flavor" + 0.019*"day" + '
  '0.019*"try"'),
 (5,
  '0.030*"work" + 0.030*"time" + 0.020*"make" + 0.020*"office" + 0.020*"go" + '
  '0.010*"coconut" + 0.010*"co

In [31]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.819393699069979

Coherence Score:  0.5073297212207668


### Topic Model Visualization

In [32]:
# Visualize the topics
# On the sliding scale, the lower the value of lambda, the more exclusive the terms become to that topic,
    # the higher the value, the most terms that are shared between topics

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds')
vis



In [33]:
import pickle
pickle_out = open("model.pkl", "wb")
pickle.dump(lda_model, pickle_out)
pickle_out.close()

In [34]:
# Save the report

pyLDAvis.save_html(vis, 'index.html')

### Heroku Web App is live: https://reviews-analysis-staging-e5e8ebceb152.herokuapp.com/
### Github directory and files are established: https://github.com/oklawyer71/AmazonReviews/tree/main/app
#### - Flask back-end setup
#### - the pyLDAvis was saved as a html file and inserted into the index page
### Next Steps:
#### - Begin to add some more content to the website:
        - Some of the original reviews
        - Add in some of the original wordclouds
        - Some of the other content within this notebook
        - How we determined the number of topics
        - Style the website with some fonts, etc.
        - Other visuals
        - Potentially take a few of the different products and create pages for them
        - Potentially take a stab at the different topics and naming them
#### - Begin to craft the jupyter notebook tutorial that will be used for the final project submission