In [3]:
import pandas as pd
import numpy as np
# from tqdm import tqdm
import os
import nltk
from nltk.corpus import stopwords

In [4]:
# mounting GDrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
os.chdir('gdrive/My Drive/Capstone Dataset')
data = pd.read_csv('Reviews.csv')

In [None]:
print('The shape of data is {}'.format(data.shape))
data.head()

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [7]:
# dropping duplicates
df = data.drop_duplicates(subset=['UserId','ProfileName','Time','Text'],keep='first',inplace=False)
df.shape

(393933, 10)

In [16]:
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               11
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                    3
Text                       0
dtype: int64

In [None]:
df.head()

In [None]:
# inspecting data with null values
# creating bool series True for NaN values 
bool_series = pd.isnull(df["ProfileName"]) 
    
# filtering data 
# displaying data only with Gender = NaN 
df[bool_series] 

In [8]:
# filling null values for profileName
df["ProfileName"].fillna("Anonymous", inplace=True)
#filling null values for summary
df["Summary"].fillna("Unspecified", inplace=True)

print(df.isnull().sum())

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
score = df['Score'].apply(lambda x: -1 if x<3 else (1 if x>3 else 0))
newdf = df.copy()
newdf['Sentiment'] = score
newdf.head(3)


# plt.figure(figsize=(8,6))
# sns.countplot(newdf['Sentiment'])
# plt.title('Target')
# plt.show()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,-1
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1


In [10]:
# Preprocessing text data
# some of the reviews have html tags, stopwords

from bs4 import BeautifulSoup
import re
import string
from tqdm import tqdm

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
import nltk
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))
print(stops)

{'between', 'all', 'm', 'myself', 'her', 'both', 'there', 'why', 'will', "shouldn't", 'o', 'it', 'the', 'now', 'didn', "doesn't", 'our', 'when', 'll', 'such', 'itself', 'who', 'yourself', 'very', 'here', "it's", 'where', 'how', "won't", 'yourselves', 'which', 'then', 'if', 'their', 'for', 'and', 'you', 'but', 'y', 'through', 'is', 'theirs', 'am', "weren't", 'i', 'them', 'from', 'she', "mustn't", 'weren', 'don', "you're", 'same', 'during', 't', 'mustn', 'some', 'these', 'as', 'shan', 'aren', 'mightn', 'its', 'herself', 'over', 'hadn', 'nor', "you'd", 's', 'after', 're', 'haven', "hadn't", "don't", 'few', "aren't", 'this', "needn't", 'until', 'in', "that'll", 'any', 'below', 'isn', 'against', "you'll", 'wasn', "shan't", 'yours', 'a', "didn't", 'too', 'd', 'by', 'doing', 'your', "hasn't", 'out', 'not', 'so', 'once', 'were', 'down', 'doesn', 'on', 'themselves', 'won', 'shouldn', 'that', "you've", 'other', "should've", 'should', 'most', 'had', 'to', 'be', 'more', 'up', 'those', 'while', 'ai

In [14]:
# decontracting words in english so it have better meaning
def decontract(phrase):
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [15]:
def clean_text(column, stopwords):
  preprocessed_reviews = []
# tqdm is for printing the status bar
  for sentence in tqdm(column.values):
    sentence = re.sub(r"http\S+", "", sentence)
    # removing html tags
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontract(sentence)
    # removing extra spaces and numbers
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    # removing non alphabels
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    preprocessed_reviews.append(sentence.strip())
  return preprocessed_reviews

In [16]:
newdf['raw_text'] = newdf['Text']

In [17]:
preprocessed_reviews = clean_text(newdf['raw_text'], stops)

100%|██████████| 393933/393933 [02:50<00:00, 2304.42it/s]


In [18]:
newdf['clean_text'] = preprocessed_reviews
newdf.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment,raw_text,clean_text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,-1,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,-1,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


In [19]:
# take only relevant features

cleaned_df = newdf[["UserId", "clean_text", "Score", "Sentiment"]]
cleaned_df.head(20)

Unnamed: 0,UserId,clean_text,Score,Sentiment
0,A3SGXH7AUHU8GW,bought several vitality canned dog food produc...,5,1
1,A1D87F6ZCVE5NK,product arrived labeled jumbo salted peanuts p...,1,-1
2,ABXLMWJIXXAIN,confection around centuries light pillowy citr...,4,1
3,A395BORC6FGVXV,looking secret ingredient robitussin believe f...,2,-1
4,A1UQRSCLF8GW1T,great taffy great price wide assortment yummy ...,5,1
5,ADT0SRK1MGOEU,got wild hair taffy ordered five pound bag taf...,4,1
6,A1SP2KVKFXXRU1,saltwater taffy great flavors soft chewy candy...,5,1
7,A3JRGQVEQN31IQ,taffy good soft chewy flavors amazing would de...,5,1
8,A1MZYO9TZK0BBI,right mostly sprouting cats eat grass love rot...,5,1
9,A21BT40VZCCYT4,healthy dog food good digestion also good smal...,5,1


In [20]:
#reseting index
cleaned_df = cleaned_df.reset_index(drop=True)
#saving processed file
cleaned_df.to_csv('processed_review_3label.csv',index=False)

In [21]:
# if we don't consider neutral reviews
dfno3 = df[df['Score'] != 3]
score2 = dfno3['Score'].apply(lambda x: 1 if x>3 else 0)
newdf2 = dfno3.copy()
newdf2['Sentiment'] = score2
newdf2.tail(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
568444,568445,B001EO7N10,A2SD7TY3IOX69B,"BayBay ""BayBay Knows Best""",3,3,5,1245369600,Best Value for Chinese 5 Spice,"As a foodie, I use a lot of Chinese 5 Spice po...",1
568445,568446,B001EO7N10,A2E5C8TTAED4CQ,S. Linkletter,2,2,5,1268006400,Five Spice Powder,"You can make this mix yourself, but the Star A...",1
568446,568447,B001EO7N10,A2P9W8T7NTLG2Z,Andy,0,0,2,1328918400,Mixed wrong,I had ordered some of these a few months back ...,0
568447,568448,B001EO7N10,APWCOAVILK94B,"Real Named Person ""wowzee""",0,0,5,1322524800,"If its all natural, this is like panacea of Sp...","Hoping there is no MSG in this, this tastes ex...",1
568448,568449,B001EO7N10,A1F6BHEYB7R6R7,James Braley,0,0,5,1308096000,Very large ground spice jars.,My only complaint is that there's so much of i...,1
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...,1
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...,0
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o...",1
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...,1
568453,568454,B001LR2CU2,A3LGQPJCZVL9UC,srfell17,0,0,5,1338422400,Great Honey,"I am very satisfied ,product is as advertised,...",1


In [22]:
newdf2['raw_text'] = newdf2['Text']
cleaned2 = clean_text(newdf2['raw_text'], stops)

100%|██████████| 364164/364164 [02:35<00:00, 2339.52it/s]


In [23]:
newdf2['clean_text'] = cleaned2
newdf2.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment,raw_text,clean_text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1,This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


In [24]:
cleaned_df2 = newdf2[["UserId", "clean_text", "Score", "Sentiment"]]
#reseting index
cleaned_df2 = cleaned_df2.reset_index(drop=True)
#saving processed file
cleaned_df2.to_csv('processed_review_binary.csv',index=False)