In [1]:
import pandas as pd

In [31]:
# Input CSV that is loaded into `dataset` with pd.read_csv
file_dataset = "df_processed.csv"
# Output CSV that receives the reviews plus the Polarity and Customer_Satisfied columns
file_out = "df_processed_polarity_and_satisfaction.csv"

In [3]:
# Load the reviews. The file carries an "Unnamed: 0" column (presumably a saved
# index -- TODO confirm) that is dropped from the derived dataframe later on.
dataset = pd.read_csv(file_dataset)

# POLARITY OF THE REVIEWS

In [4]:
#VADER for positive/negative sentiment analysis
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#Fetch the VADER lexicon used by SentimentIntensityAnalyzer (no re-download when it is already up to date)
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mjubuntu18/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [5]:
#Reviews: keep only the review Id and the review Text, the two columns the sentiment step uses
reviews = dataset[['Id','Text']]

In [30]:
#Preview the first rows of the Id/Text subset
reviews.head()

Unnamed: 0,Id,Text
0,1,I have bought several of the Vitality canned d...
1,3,This is a confection that has been around a fe...
2,4,If you are looking for the secret ingredient i...
3,2,Product arrived labeled as Jumbo Salted Peanut...
4,5,Great taffy at a great price. There was a wid...


In [6]:
#Vader Sentiment Analyzer: created once and reused to score every review
analyzer = SentimentIntensityAnalyzer()

In [7]:
#Store the polarity of the reviews: one compound score per review, in row order
reviews_polarity = []

In [8]:
%%time
#Store the compound polarity for each review
#Note: this takes around 12 minutes (Google Colab)
for index in range(len(reviews)):
  review = reviews.loc[index, 'Text']
  scores = analyzer.polarity_scores(review)
  compound_score = scores['compound']
  reviews_polarity.append(compound_score)

CPU times: user 5min 55s, sys: 156 ms, total: 5min 55s
Wall time: 5min 55s


In [9]:
#Check that all reviews have a polarity
print("Calculated polarity of", len(reviews_polarity), "reviews")
#Enforce the check: the scores are later attached to the reviews by position,
#and pd.concat would pad a length mismatch with NaN instead of raising.
assert len(reviews_polarity) == len(reviews), "polarity count does not match number of reviews"

Calculated polarity of 568377 reviews


In [10]:
#Two-column frame: the review Id next to its compound polarity score.
#The score list is wrapped in a one-column dataframe and placed beside the
#Id column (column-wise concat, rows matched on the index).
polarity_dataframe = pd.concat(
    [
        reviews["Id"],
        pd.DataFrame(reviews_polarity, columns = ['Polarity']),
    ],
    axis=1,
)

#Full dataset with the Polarity column appended on the right
dataframe_with_polarity = pd.concat(
    [dataset, polarity_dataframe["Polarity"]],
    axis=1,
)

In [11]:
#Display the Id/Polarity frame (the rendered output elides the middle rows)
polarity_dataframe

Unnamed: 0,Id,Polarity
0,1,0.9441
1,3,0.8265
2,4,0.0000
3,2,-0.5664
4,5,0.9468
...,...,...
568372,568452,0.4352
568373,568451,-0.4848
568374,568453,0.9717
568375,568454,0.4754


In [12]:
#Preview the first rows of the Id/Polarity frame
polarity_dataframe.head()

Unnamed: 0,Id,Polarity
0,1,0.9441
1,3,0.8265
2,4,0.0
3,2,-0.5664
4,5,0.9468


In [13]:
#Drop the "Unnamed: 0" column carried over from the input CSV.
#Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
#safe to re-run: a second run is a no-op rather than a KeyError.
dataframe_with_polarity = dataframe_with_polarity.drop(columns="Unnamed: 0", errors="ignore")

In [14]:
#Preview the dataset with the new Polarity column as its last column
dataframe_with_polarity.head()

Unnamed: 0,_id,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,Year,Month,Day,Polarity
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0,2011,4,27,0.9441
1,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0,2008,8,18,0.8265
2,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ,2011,6,13,0.0
3,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4,2012,9,7,-0.5664
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K,2012,10,21,0.9468


In [15]:
#Save the polarity onto a csv file
# polarity_dataframe.to_csv('polarity.csv', index=False)  # not needed for now

#Save the whole dataframe with polarity to a csv file
# dataframe_with_polarity.to_csv(file_out, index=False)

## Small summary dataframe (Id, Polarity, Score)


In [16]:
#Compact frame holding Id, Polarity and the star Score side by side
polarity_and_score_df = pd.concat(
    [polarity_dataframe, dataset[["Score"]]],
    axis=1,
)

In [17]:
#Display the Id/Polarity/Score frame (the rendered output elides the middle rows)
polarity_and_score_df

Unnamed: 0,Id,Polarity,Score
0,1,0.9441,5
1,3,0.8265,4
2,4,0.0000,2
3,2,-0.5664,1
4,5,0.9468,5
...,...,...,...
568372,568452,0.4352,5
568373,568451,-0.4848,2
568374,568453,0.9717,5
568375,568454,0.4754,5


In [18]:
%%time
customer_satisfied = []

for ind in range(len(polarity_and_score_df)):
  polarity = polarity_and_score_df.loc[ind, 'Polarity']
  score = polarity_and_score_df.loc[ind, 'Score']

  percentage_polarity = ((polarity - (-1)) * 100) / (1 - (-1))
  percentage_score = ((score - (1)) * 100) / (5 - (1))

  average_percentage = (percentage_polarity + percentage_score)/200

  if (average_percentage >= 0.61):
    customer_satisfied.append(1)
  else:
    customer_satisfied.append(0)

CPU times: user 9.19 s, sys: 43.9 ms, total: 9.24 s
Wall time: 9.27 s


In [19]:
# #Create a dataframe with the satisfaction scores
# customer_satisfied = pd.DataFrame(customer_satisfied, columns = ['Customer Satisfied'])

# #Add the review Id
# customer_satisfied = pd.concat([polarity_and_score_df["Id"], satisfied_dataframe], axis=1)

# #Save onto a csv file
# # satisfied_dataframe.to_csv('satisfaction.csv', index=False)

In [20]:
#Two-column frame: the review Id next to its 0/1 satisfaction flag.
#The flag list is wrapped in a one-column dataframe and placed beside the
#Id column (column-wise concat, rows matched on the index).
satisfied_dataframe = pd.concat(
    [
        polarity_and_score_df["Id"],
        pd.DataFrame(customer_satisfied, columns = ['Customer_Satisfied']),
    ],
    axis=1,
)

#Save onto a csv file
# satisfied_dataframe.to_csv('satisfaction.csv', index=False)

In [22]:
# dataframe_with_polarity.drop("Unnamed: 0",axis=1,inplace=True)

In [23]:
#Preview the first rows of the Id/Customer_Satisfied frame
satisfied_dataframe.head()

Unnamed: 0,Id,Customer_Satisfied
0,1,1
1,3,1
2,4,0
3,2,0
4,5,1


In [24]:
#Final frame: the dataset with Polarity, plus Customer_Satisfied appended as the last column
dataframe_with_polarity_and_satisfaction = pd.concat(
    [dataframe_with_polarity, satisfied_dataframe[["Customer_Satisfied"]]],
    axis=1,
)

#Save onto a csv file
# dataframe_with_polarity_and_satisfaction.to_csv('dataframe_with_polarity_and_satisfaction.csv', index=False)

In [25]:
#Preview the final frame, which now ends with the Polarity and Customer_Satisfied columns
dataframe_with_polarity_and_satisfaction.head()

Unnamed: 0,_id,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,Year,Month,Day,Polarity,Customer_Satisfied
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0,2011,4,27,0.9441,1
1,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0,2008,8,18,0.8265,1
2,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ,2011,6,13,0.0,0
3,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4,2012,9,7,-0.5664,0
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K,2012,10,21,0.9468,1


In [33]:
#Save the final dataframe to the csv file named by file_out (index not written)
dataframe_with_polarity_and_satisfaction.to_csv(file_out, index=False)

In [34]:
#Preview the final frame once more after the save
dataframe_with_polarity_and_satisfaction.head()

Unnamed: 0,_id,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,ProductURL,Year,Month,Day,Polarity,Customer_Satisfied
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,2011-04-27,Good Quality Dog Food,I have bought several of the Vitality canned d...,https://www.amazon.com/dp/B001E4KFG0,2011,4,27,0.9441,1
1,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,2008-08-18,"""Delight"" says it all",This is a confection that has been around a fe...,https://www.amazon.com/dp/B000LQOCH0,2008,8,18,0.8265,1
2,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,2011-06-13,Cough Medicine,If you are looking for the secret ingredient i...,https://www.amazon.com/dp/B000UA0QIQ,2011,6,13,0.0,0
3,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,2012-09-07,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,https://www.amazon.com/dp/B00813GRG4,2012,9,7,-0.5664,0
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,2012-10-21,Great taffy,Great taffy at a great price. There was a wid...,https://www.amazon.com/dp/B006K2ZZ7K,2012,10,21,0.9468,1
