In [1]:
import re
import string
import itertools
from collections import Counter

import pandas as pd
import numpy as np

# import nltk
# nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import spacy

from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from tqdm.auto import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/Module_5_Lecture_1_Class_amazon_product_reviews.csv', index_col='Id')

In [3]:
# Dataset preview

df.head(3)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [4]:
# Basic dataset info 

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 568454 entries, 1 to 568454
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   ProductId               568454 non-null  object
 1   UserId                  568454 non-null  object
 2   ProfileName             568428 non-null  object
 3   HelpfulnessNumerator    568454 non-null  int64 
 4   HelpfulnessDenominator  568454 non-null  int64 
 5   Score                   568454 non-null  int64 
 6   Time                    568454 non-null  int64 
 7   Summary                 568427 non-null  object
 8   Text                    568454 non-null  object
dtypes: int64(4), object(5)
memory usage: 43.4+ MB


In [5]:
# For this lesson, let's ignore neutral reviews
# We'll consider reviews with a score 3 as neutral

df = df.loc[df['Score']!=3]

df.shape

(525814, 9)

In [6]:
df['sentiment'] = [1 if score in [4, 5] else 0 for score in df['Score']]

In [7]:
# Number of identical records

df.duplicated().sum()

255

In [8]:
# Droping duplicated records

df = df.drop_duplicates().reset_index(drop=True)

# Number of unique records

df.shape

(525559, 10)

In [9]:
# Checking for the identical reviews of different versions of the same product

df.groupby(['UserId', 'Time', 'Text']).count().sort_values('ProductId', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,sentiment
UserId,Time,Text,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A3TVZM3ZIXG8YW,1291420800,"This review will make me sound really stupid, but whatever. I don't really care as long as people find out what's real and can avoid my mistakes.<br /><br />I got my wonderful little sweet Bella Bean when she was a few days shy of three years old. She had been bounced around from house to house and eating whatever was cheap. I have had cats around me my entire life, for about twenty-five years now. My mother always just fed them whatever, the kinds of food you buy in the supermarket - Friskies, Nine Lives, Kit & Kaboodle, stuff like that. And our cats were always fine, at least in terms of their eating habits. They would eat in the morning, stop when they were done, come back, and eat some more when they got hungry.<br /><br />My housemate at the time was working for Hill's and assured me that this was the best food ever made, so great, so on and so forth. I now know that she is an utter buffoon, but I initially trusted her judgment, which is so unfortunate because she doesn't think. She also had plenty of coupons for free or deeply discounted bags, which made it a much more attractive choice.<br /><br />I first tried feeding the little Bean an unmeasured amount of Science Diet in a bowl, but that didn't work, as she would devour it in one sitting. So then I took to measuring it, and she did the same thing. Then I started parsing it out to twice a day. That didn't work either, because she would start going crazy in the middle of the day, running around, intentionally destroying things, deliberately spilling her water, crying, etc., until she got more food. So then I split it into three servings. Same thing. Then it got to be four servings. That was a little better, but it was too much maintenance and unrealistic to be around every day to feed her four times. So then it went back to three. All the while, I was trying to reduce the amount of food I was feeding her to less than 3/4 of a cup because she was a little chubby. Reducing was hell. She became even more hungry, but I figured she would get used to it. Not really.<br /><br />For over a year, she would wake me up every morning looking for food in a serious way, knocking things off my desk, ripping up any paper she could find, scratching at the door and committing general acts of mischief. As soon as she got food, she was back to her sweet self, but only for three or four hours.<br /><br />We thought she was bored, we thought she was a little nutty, and maybe even had a kitty eating disorder. She always wanted food. It was kind of funny but in the end it was just sad.<br /><br />A few weeks back, we took her to a new (good, non-money factory) vet for her second checkup since I've had her. I talked with him about her being always hungry. He asked what she ate, and I told him the adult indoor Science Diet. Without saying as much, he basically told me that this food is garbage and I should look for something else. He said cat food should have a protein followed by a carbohydrate as the first two ingredients. Science Diet does, in a very loose, by-product kind of way - ground up slaughterhouse leftovers and corn dust. Then they put a bunch of vitamins in it to make it ""healthy,"" instead of just using good ingredients from the beginning. Not that I care about spending money on the Bean, but this food is way too expensive for what it is.<br /><br />So we began transitioning her onto Wellness indoor formula about two weeks ago. She is still eating 50 percent Science Diet with 50 percent Wellness (you really shouldn't just give a cat different food one day out of the blue) but SHE NEVER FREAKS OUT ANYMORE. It's amazing. And it's 100 percent because she is eating real food now, along with that sawdust and chicken hearts I still regrettable have to feed her. We have her down to eating twice a day, only 1/3 cup in TOTAL. She was eating three times a day, 5/8 of a cup in total. Now she eats some in the morning, walks away, eats some more a few hours later, and then looks for dinner about twelve hours after her initial feeding. No more knocking things over, no more trashing Dad's papers on the desk, no more howling, no more deliberately spilled water on the floor. It's incredible.<br /><br />I feel so bad that I was doing this to her for so long. We really thought she was just being dramatic or whatever. But no, she was genuinely hungry because she wasn't eating any real food.<br /><br />Do your cat a favor - buy her or him so food made with real ingredients, things you would eat - Wellness, Halo, Innova, Evo, whatever. Figure it out for yourself, but please don't feed your cat this. It's garbage.",182,182,182,182,182,182,182
A36JDIN9RAAIEC,1292976000,"I have two cats, one 6 and one 2 years old. Both are indoor cats in excellent health. I saw the negative review and talked to my vet about it. I've also asked a number of veterinary professionals what to feed my cats and they all answer the same thing: Science Diet. Sure, you'll see stories of how one person's cat had issues, but even if that's 100% true, it's 1 case out of millions. Science and fact aren't based on someone's experience.<br /><br />So my point is, I love my cats and I'm very concerned about their health. I trust people who actually have medical degrees and experience with a wide range of animals. My only caution is do not fall for some hype or scare tactic that recommends some unproven or untested food or some fad diet for your pet. Don't listen to me, don't listen to the negative review. ASK YOUR VET what they recommend, and follow their instructions. My guess is you'll end up buying the Science Diet anyhow.",51,51,51,51,51,51,51
A1TMAVN4CEM8U8,1336348800,"Diamond Almonds<br />Almonds are a good source of magnesium. One ounce contain approximately 20% of the RDA for an adult recommended dietary allowance (RDA)<br />Recommended Dietary Allowance (RDA) the amounts of selected nutrients considered adequate to meet the known nutrient needs of healthy people. The RDA is based on scientific knowledge and has been presented by a committee of the Food and Nutrition Board (FNB) of the National Academy of Sciences (NAS). The Canadian equivalent is the Recommended Nutrient Intakes. RDA is generally accepted throughout the world as a valid source of information. At least 40 different nations have as well as organizations have published standards similar to the RDA.<br /><br />Magnesium is the fourth most abundant mineral in the body and is essential to good health. Approximately 50% of total body magnesium is found in bone. The other half is found predominantly inside cells of body tissues and organs. Only 1% of magnesium is found in blood, but the body works very hard to keep blood levels of magnesium constant.<br />Magnesium is needed for more than 300 biochemical reactions in the body. It helps maintain normal muscle and nerve function, keeps heart rhythm steady, supports a healthy immune system, and keeps bones strong. Magnesium also helps regulate blood sugar levels, promotes normal blood pressure, and is known to be involved in energy metabolism and protein synthesis. There is an increased interest in the role of magnesium in preventing and managing disorders such as hypertension, cardiovascular disease, and diabetes. Dietary magnesium is absorbed in the small intestines. Are you taking enough magnesium in the diet? The recommended daily allowance is 300mgs/day for men, 350mgs for women. Magnesium is extremely safe by mouth - too much simply causes diarrhea. Try increasing the amount of magnesium you take by mouth until it causes diarrhea, then reduce the dose slightly so it does not. This is called taking magnesium to bowel tolerance (just like using vitamin C to bowel tolerance.<br /><br />Almonds are a good source of Fiber.<br />One ounce contains approximately 12% of the RDA for an adult.<br />So, in summary, Almonds are a very healthy snack, so what's left to say?<br /><br />Two things:<br />1st be cautious about ""bowel tolerance"" or ""happy bowels"" if you prefer.<br />2nd Taste. I realize that taste is a personal thing. That's why they make chocolate, strawberry, and vanilla.<br /><br />Here's my current favorites""<br /><br /> 1st Blue Diamond Dark mint dark chocolate (reminds me of the chocolate mint Girl Scout Cookies with less sugar).<br /><br />2nd Blue diamond Dark Chocolate Oven Roasted<br />3rd Blue diamond Blue Butter Toffee<br />4thd Blue Diamond Almonds Bold Habanero BBQ (reminds me of BBQ potato chips again with less sugar).<br />5th Bold Blazin Buffalo Wing<br />6th Blue Diamond Almonds Cinnamon Brown Sugar<br />7th Blue Diamond Almonds Lime `n Chili<br />8th Blue Diamond Almonds Smokehouse<br />9th Bold Wasabi & Soy Sauce<br /><br />But that's just me: you might have a completely different take on these flavors.<br /> 4th and 5th are very close.<br /><br />Gunner<br />May, 2012",43,43,43,43,43,43,43
A1UQBFCERIP7VJ,1321401600,"Stash Chamomile Herbal Tea is tea bags with dried, crushed chamomile flowers.<br /><br />Honestly, as far as I'm concerned, chamomile is chamomile. I suppose that when you use loose flowers you get sediment in the cup and I have seen this happen with some brands of chamomile tea bags, these don't do that.<br /><br />I have also had organic chamomile (this isn't) but I can't taste the difference.<br /><br />Each bag is individually packaged and that is either good or bad depending on how and where you use it and if you are concerned about excessive packaging.<br /><br />If you like chamomile tea, this is a good choice.",38,38,38,38,38,38,38
A29JUMRL1US6YP,1278201600,"The pet food industry can be one of the most infuriating as you start doing research and discovering what trash manufacturers (and the vets who get paid off by the manufacturers) push onto the unsuspecting public. For reference, don't ask your vet - do some Internet research at sites like, for example, [...]. What you'll find is that even the supposed ""high-end"" lines from basically every major supermarket label are horrible for your fuzzy friends. People who trust the marketing or trust their vets overpay and feed their cats foods that are full of carcinogens, cheap ingredients that are inappropriate for a cat's digestive system, and byproducts that provide nothing of value and are basically a way to make cheap profit off of literal garbage. It's a tragedy, as these same cats end up with shorter lifespans, kidney problems, UTIs, cancer, and a host of other issues, all of which could have been avoided by a high quality food.<br /><br />This is an A-Grade food. What you'll find here is very high protein content, many meat meals, potatoes, salmon oil, flax seed, and no grains, which is great as a cat is not an omnivore, it's a carnivore, and it simply does not need grains. It has 50 percent protein, which is among the highest you'll see in cat food. The first 4 ingredients are all meats.<br /><br />What you will NOT find here is the kind of junk you'll see in things like Purina One and Science Diet - byproducts (aka beaks, feathers, bird feet), brewers rice (aka the waste product from making beer), wheat flour (a leading cause of pet allergies), corn gluten (a cheap waste of space), and things like ""liver flavor"" (because they don't want to pay for the real thing). I didn't just pull those ingredients out of the air - I was reading the ingredients list of Purina One. It's pathetic.<br /><br />Don't feed your cats trash. Pay a little more to feed them good food now, and they'll lead longer, healthier, more satisfied lives as a result. It's worth it.",35,35,35,35,35,35,35
A25C5MVVCIYT5D,1304726400,"I understand all the complaints about Science Diet. Believe me I avoided this food for quite awhile based on that information. I'm not thrilled about the filler, would prefer a food with no by-products, and yes in general think it is over priced. But my cat was throwing up everything else. And I do mean EVERYTHING else. I have tried so many top of the line foods for him and every single one he threw up. I took him to the vet, had all the xrays done, all the blood work done, put him on antibotics just in case... nothing. He is a perfectly healthy cat except that he was throwing up food 2 to 3 times a week. I talked to my mom who had 2 Siamese cats that lived into thier late teens and she said she fed them Science Diet. In fact, one of them had urninary tract problems and the SD was the only food that stopped his problems. So yes, I understand everything everyone is saying about why SD is so horrible... yet I can't argue with the results. I fed my cat this and it's been 3 weeks with no vomiting. My mom fed it to her cats and they lived long, healthy lives. I just can't argue with the results. And even if there were risks it's worth it not to have my poor kitty completely emptying his stomach 2-3 times a week from food that is making him sick. If your cat keeps throwing up, you've had the vet check him and there is nothing wrong and no other food is working... try this food. It really did work for my cat.",34,34,34,34,34,34,34
A29JUMRL1US6YP,1278201600,"The pet food industry can be one of the most infuriating as you start doing research and discovering what trash manufacturers (and the vets who get paid off by the manufacturers) push onto the unsuspecting public. For reference, don't ask your vet - do some Internet research at sites like, for example, [...]. What you'll find is that even the supposed ""high-end"" lines from basically every major supermarket label are horrible for your fuzzy friends. People who trust the marketing or trust their vets overpay and feed their cats foods that are full of carcinogens, cheap ingredients that are inappropriate for a cat's digestive system, and byproducts that provide nothing of value and are basically a way to make cheap profit off of literal garbage. It's a tragedy, as these same cats end up with shorter lifespans, kidney problems, UTIs, cancer, and a host of other issues, all of which could have been avoided by a high quality food.<br /><br />This is an A-Grade food. What you'll find here is very high protein content, many meat meals, good carbs and oils, and no grains, which is great as a cat is not an omnivore, it's a carnivore, and it simply does not need grains. It has 50 percent protein, which is among the highest you'll see in cat food. The first 4 ingredients are all meats.<br /><br />What you will NOT find here is the kind of junk you'll see in things like Purina One and Science Diet - byproducts (aka beaks, feathers, bird feet), brewers rice (aka the waste product from making beer), wheat flour (a leading cause of pet allergies), corn gluten (a cheap waste of space), and things like ""liver flavor"" (because they don't want to pay for the real thing). I didn't just pull those ingredients out of the air - I was reading the ingredients list of Purina One. It's pathetic.<br /><br />Don't feed your cats trash. Pay a little more to feed them good food now, and they'll lead longer, healthier, more satisfied lives as a result. It's worth it.",34,34,34,34,34,34,34
A24PZR4W555WQI,1294617600,My dogs and I love this food. They never leave a trace and I am happy that it is healthy for them. I mix it up with the Wellness Super 5 mix.,28,28,28,28,28,28,28
A3PJZ8TU8FDQ1K,1231718400,"I'm addicted to salty and tangy flavors, so when I opened my first bag of Sea Salt & Vinegar Kettle Brand chips I knew I had a perfect complement to my vegetable trays of cucumber, carrot, celery and cherry tomatoes. Skip the dip; balance the tangy chips by alternating bites of raw vegetable.<br /><br />As an Oregonian, I'm proud to share these delectable snacks with friends, especially those living outside our state and who haven't experienced gourmet chips. I tell them Kettle Brand does for potato chips what microbrews did for beer.<br /><br />Kettle Brand potato chips are unmistakable--a light gold color, rich flavor and amazing crunch. Kettle Brand chips are also a healthier snacking option than the major chip brands. Kettle Brand chips don't have trans fats, MSG or artificial flavors and colorings. The company also has a line of organic potato chips and all of their products are certified Kosher.<br /><br />I also recommend <a href=""http://www.amazon.com/gp/product/B000G6MBV4"">Kettle Chips Honey Dijon</a> and <a href=""http://www.amazon.com/gp/product/B000G6Q4GM"">Kettle Chips Spicy Thai</a>.<br /><br />Annette Solomon, a reporter for the Salem Statesman Journal recently noted that a glass of wine goes nicely with these chips. Solomon wrote, ""...you could be missing out on a wonderful pairing. These chips are spicy, so you would want to select a semi-sweet white wine. Also, a moderate amount of acid will subdue the strong flavors of ginger, lime, garlic and cilantro without over-powering them. Classically, a German-style Riesling fits these parameters perfectly.""",28,28,28,28,28,28,28
A3A1OA237FOZFK,1296950400,"I was a little hesitant to try these, especially after reading such mixed reviews although overall they were positive. However, I liked the idea that they are healthier than regular chips and I figured they would be great for school lunches, quick snacks, etc. I decided to give them a try and figured if we did not like them, they could be donated. I ordered a mixed case the first time around to see which flavors we liked more (or less) and thought that would give us a good sampling. After deciding we liked most of the flavors, we decided to try other ones as well.<br /><br />Despite the flavor preferences, these chips do not taste like cardboard. At first taste, they are a little crunchier and thicker than you might expect. I think this is a good thing, because it means they are not heavily-laden with oil. The 0.8 oz bags are 100 calories each as well, which is much better than the other alternatives and gives you the enjoyment of chips without the extra calories, trans fat, etc.<br /><br />I have included a summary of opinions below from myself, friends and family for each flavor:<br /><br />Sea Salt & Vinegar - Great, the absolute favorite flavor for everyone<br />BBQ - Very good, I'm not crazy about BBQ to begin with but would say this flavor is ""sweet"" BBQ. Everyone else loved them.<br />Cheddar - Very Good, probably close in preference to the BBQ<br />Original Potato - Good, but a little salty (seems to fluctuate somewhat between bags)<br />Salt & Pepper - Good, everyone liked them but preferred other flavors more<br />Parmesan Garlic - Good, but overall was 50/50. You definitely taste more parmesan than garlic and most people were expecting the latter.<br />Sour Cream & Onion - Not so good, no one really liked these enough to want more or to even finish the bag<br /><br />I know taste is a very subjective thing, but I hope this review helps someone decide to give these a try. I did get them at a reduced price through subscribe & save, and shortly after my first order, Amazon had them on the Gold Box for an even better price. I think the average price per bag came out to approximately forty six cents a bag and had free shipping under my prime account.",28,28,28,28,28,28,28


In [10]:
search_text = "I have two cats, one 6 and one 2 years old. Both are indoor cats in excellent health. I saw the negative review and talked to my vet about it. I've also asked a number of veterinary professionals what to feed my cats and they all answer the same thing: Science Diet. Sure, you'll see stories of how one person's cat had issues, but even if that's 100% true, it's 1 case out of millions. Science and fact aren't based on someone's experience.<br /><br />So my point is, I love my cats and I'm very concerned about their health. I trust people who actually have medical degrees and experience with a wide range of animals. My only caution is do not fall for some hype or scare tactic that recommends some unproven or untested food or some fad diet for your pet. Don't listen to me, don't listen to the negative review. ASK YOUR VET what they recommend, and follow their instructions. My guess is you'll end up buying the Science Diet anyhow."
duplicates_example = df.loc[
    (df['UserId']=='A36JDIN9RAAIEC') &
    (df['Time']==1292976000) &
    (df['Text']==search_text)
]

duplicates_example

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment
48291,B003ANFMY8,A36JDIN9RAAIEC,Jon,3,3,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
48301,B003ANFMY8,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
48302,B003ANFMY8,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,Don't fall prey to fads and anecdotal reviews,"I have two cats, one 6 and one 2 years old. Bo...",1
88934,B003WK0D8O,A36JDIN9RAAIEC,Jon,3,3,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
88944,B003WK0D8O,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
88945,B003WK0D8O,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,Don't fall prey to fads and anecdotal reviews,"I have two cats, one 6 and one 2 years old. Bo...",1
89993,B0002MLA5K,A36JDIN9RAAIEC,Jon,3,3,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
90003,B0002MLA5K,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1
90004,B0002MLA5K,A36JDIN9RAAIEC,Jon,2,2,5,1292976000,Don't fall prey to fads and anecdotal reviews,"I have two cats, one 6 and one 2 years old. Bo...",1
133472,B003MA8P02,A36JDIN9RAAIEC,Jon,3,3,5,1292976000,"Great product, but trust your vet not the hype","I have two cats, one 6 and one 2 years old. Bo...",1


In [11]:
# Droping the same reviews

df = df.drop_duplicates(subset={"UserId", "Time","Text"})

# Final size 

df.shape

(364133, 10)

In [12]:
# Contractions. Source http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [13]:
# get stop-words from the nltk library
# using set to make words search faster

stop_words = set(stopwords.words('english')).union({'also', 'would', 'much', 'many'})

In [14]:
negations = {
    'aren',
    "aren't",
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'isn',
    "isn't",
    'mightn',
    "mightn't",
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'no',
    'nor',
    'not',
    'shan',
    "shan't",
    'shouldn',
    "shouldn't",
    'wasn',
    "wasn't",
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't"
}

In [15]:
# removing negations from the stop-words list

stop_words = stop_words.difference(negations)

In [16]:
stemmer = PorterStemmer()

In [17]:
nlp = spacy.load("en_core_web_sm", disable = ['parser','ner'])

In [18]:
# function to clean text
def normalize_text(raw_review):
    
    # Remove html tags
    text = re.sub("<[^>]*>", " ", raw_review) # match <> and everything in between. [^>] - match everything except >
    
    # Remove emails
    text = re.sub("\S*@\S*[\s]+", " ", text) # match non-whitespace characters, @ and a whitespaces in the end
    
    # remove links
    text = re.sub("https?:\/\/.*?[\s]+", " ", text) # match http, s - zero or once, //, 
                                                    # any char 0-unlimited, whitespaces in the end
        
     # Convert to lower case, split into individual words
    text = text.lower().split()
    
    # Replace contractions with their full versions
    text = [contractions.get(word) if word in contractions else word 
            for word in text]
   
    # Re-splitting for the correct stop-words extraction
    text = " ".join(text).split()    
    
    # Remove stop words
    text = [word for word in text if not word in stop_words]

    text = " ".join(text)
    
    # Remove non-letters        
    text = re.sub("[^a-zA-Z' ]", "", text) # match everything except letters and '


    # Stem words. Need to define porter stemmer above
    # text = [stemmer.stem(word) for word in text.split()]

    # Lemmatize words. Need to define lemmatizer above
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1 ])
    
    # Remove excesive whitespaces
    text = re.sub("[\s]+", " ", text)    
    
    # Join the words back into one string separated by space, and return the result.
    return text

In [19]:
text = 'On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment the Whirley Pop.  Don\'t get older, I\'m beginning to appreciate the more "natural" popcorn varieties, and I suppose that\'s what attracted me to the Arrowhead Mills Organic Yellow Popcorn.<br /> <br />I\'m no "organic" food expert.  I just wanted some good tasting popcorn.  And, I feel like that\'s what I got.  Using the Whirley Pop, with a very small amount of oil, I\'ve had great results.'

print('Original text', text, '#'*30, sep='\n\n')
print('\nNormalized text', normalize_text(text), sep='\n\n')

Original text

On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment the Whirley Pop.  Don't get older, I'm beginning to appreciate the more "natural" popcorn varieties, and I suppose that's what attracted me to the Arrowhead Mills Organic Yellow Popcorn.<br /> <br />I'm no "organic" food expert.  I just wanted some good tasting popcorn.  And, I feel like that's what I got.  Using the Whirley Pop, with a very small amount of oil, I've had great results.

##############################

Normalized text

quest perfedct popcorn compliment whirley pop not get old begin appreciate natural popcorn variety suppose attract arrowhead mill organic yellow popcorn no organic food expert want good tasting popcorn and feel like get use whirley pop small amount oil great result


In [20]:
# Slicing dataset for demonstrative purposes

df = df.groupby('sentiment').sample(2500, random_state=42)

# Note: without resetting an index we slice over the original Id`s
df.shape

(5000, 10)

In [21]:
df['text_normalized'] = df['Text'].progress_apply(normalize_text)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [22]:
train_idxs = df.sample(frac=0.8, random_state=42).index
test_idxs = [idx for idx in df.index if idx not in train_idxs]

In [23]:
X_train = df.loc[train_idxs, 'text_normalized']
X_test = df.loc[test_idxs, 'text_normalized']

y_train = df.loc[train_idxs, 'sentiment']
y_test = df.loc[test_idxs, 'sentiment']

In [24]:
# Creating and training a CountVectorizer object 

vect = CountVectorizer().fit(X_train)

len(vect.vocabulary_)

12650

In [25]:
# features examples

vect.get_feature_names_out()[:5]

array(['aa', 'ab', 'aback', 'abandon', 'abba'], dtype=object)

In [26]:
# transform the documents in the training data to a document-term matrix

X_train_vectorized = vect.transform(X_train)
X_train_vectorized.shape

(4000, 12650)

In [27]:
# Resulted features representation is a sparse matrix

X_train_vectorized

<4000x12650 sparse matrix of type '<class 'numpy.int64'>'
	with 138253 stored elements in Compressed Sparse Row format>

In [28]:
model = LogisticRegression(random_state=42)
model.fit(X_train_vectorized, y_train)

In [29]:
predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.8371428571428571


In [30]:
def get_preds(text_column, algorithm, ngrams=(1,1)):
    
    X_train = df.loc[train_idxs, text_column]
    X_test = df.loc[test_idxs, text_column]

    y_train = df.loc[train_idxs, 'sentiment']
    y_test = df.loc[test_idxs, 'sentiment']
    
    if algorithm == 'cv':
        vect = CountVectorizer(ngram_range=ngrams).fit(X_train)
    elif algorithm == 'tfidf':
        vect = TfidfVectorizer(ngram_range=ngrams).fit(X_train)
    else:
        raise ValueError('Select correct algorithm: `cv` or `tfidf`')
            
    print('Vocabulary length: ', len(vect.vocabulary_))
    
    # transform the documents in the training data to a document-term matrix

    X_train_vectorized = vect.transform(X_train)
    print('Document-term matrix shape:', X_train_vectorized.shape)
    
    model = LogisticRegression(random_state=42)
    model.fit(X_train_vectorized, y_train)
    
    predictions = model.predict(vect.transform(X_test))

    print('AUC: ', roc_auc_score(y_test, predictions))

In [31]:
get_preds('Text', 'cv')

Vocabulary length:  14214
Document-term matrix shape: (4000, 14214)
AUC:  0.8522305764411028


In [32]:
get_preds('text_normalized', 'tfidf')

Vocabulary length:  12650
Document-term matrix shape: (4000, 12650)
AUC:  0.8466165413533835


In [33]:
get_preds('Text', 'tfidf')

Vocabulary length:  14214
Document-term matrix shape: (4000, 14214)
AUC:  0.8407017543859648


In [34]:
get_preds('text_normalized', 'cv', (1,2))

Vocabulary length:  127773
Document-term matrix shape: (4000, 127773)
AUC:  0.8695739348370928


In [35]:
get_preds('text_normalized', 'tfidf', (1,2))

Vocabulary length:  127773
Document-term matrix shape: (4000, 127773)
AUC:  0.8461654135338346


In [36]:
get_preds('text_normalized', 'cv', (2,2))

Vocabulary length:  115123
Document-term matrix shape: (4000, 115123)
AUC:  0.7773934837092732


In [37]:
get_preds('Text', 'cv', (2,2))

Vocabulary length:  132784
Document-term matrix shape: (4000, 132784)
AUC:  0.8374436090225565


In [38]:
get_preds('Text', 'tfidf', (2,2))

Vocabulary length:  132784
Document-term matrix shape: (4000, 132784)
AUC:  0.8363408521303258
