# NLP Analysis of reviews
- Calculate sentiment and compare against the actual rating they user gave

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

import os
import sys
sys.path.append(os.path.abspath("/Users/lucialarraona/Desktop/finalproject_socialgraphs22/Notebooks"))

from helper_functions import  *

import nltk 
from nltk.stem import PorterStemmer # para ver la raiz de las palabras(worker = work)
from nltk.tokenize import sent_tokenize, word_tokenize # para tokenizar las frases y las palabras 
import re # regex
import string

nltk.download('omw-1.4') #needed for lemmatizing

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer # an nltk package for sentiment analysis :D

In [7]:
# With the reviews dataset, map the community for each user. 

df_reviews = decompress_pickle('../Data/reviews_zipped.pbz2') 
print(df_reviews.shape)
df_reviews.head()

(1132367, 5)


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [6]:
df_com_map = pd.read_csv('user_community_map.csv')
print(df_com_map.shape)
df_com_map.head()

(15257, 2)


Unnamed: 0,user_id,community
0,1535,7
1,3288,2
2,4439,7
3,4470,7
4,4769,35


In [10]:
df_reviews2 = pd.merge(df_com_map, df_reviews, how="left", on=["user_id"]) # merging left is using only the keys on the left df, in this case df_com_map
print(df_reviews2.shape)
df_reviews2.head()

(144295, 6)


Unnamed: 0,user_id,community,recipe_id,date,rating,review
0,1535,7,349022,2010-02-11,4,"After being w/o power all day, it came back on..."
1,1535,7,50022,2004-09-09,5,This is an excellent recipe. Even my not-a-big...
2,1535,7,78834,2005-02-02,5,Made these for my kids one day as a surprise a...
3,1535,7,47474,2004-08-04,5,Love them!! A quick & easy recipe with ingredi...
4,1535,7,230720,2007-07-02,5,I was a little afraid this might not work very...


## Clean text from reviews

In [21]:
#///////// TODO Clean text, remove punctuationn, stopwords, and lowercase all. 
#///////// TODO Tokenize and lemmatize 

def text_cleaning(text):
    
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

stopwords = nltk.corpus.stopwords.words('english')

ps = PorterStemmer()
wn = nltk.WordNetLemmatizer()

def clean_stem (review): 

    temp1 ="".join(x for x in review if x not in string.punctuation)
    temp2 = re.split('\W+',temp1.lower())
    temp3 = [ps.stem(x) for x in temp2 if x not in stopwords]
    return temp3


def clean_lemma (review):
    
    temp1 ="".join(x for x in review if x not in string.punctuation)
    temp2 = re.split('\W+',temp1.lower())
    temp3 = [wn.lemmatize(x) for x in temp2 if x not in stopwords]
    return temp3


df_reviews2['review'] =df_reviews2['review'] .astype('str')  # object to string
df_reviews2['review'] = df_reviews2['review'] .apply(text_cleaning) # cleaning 
df_reviews2['review_clean']= df_reviews2['review'].map(clean_lemma) # apply lemmatization for now



In [23]:
df_reviews2.head()

Unnamed: 0,user_id,community,recipe_id,date,rating,review,review_clean
0,1535,7,349022,2010-02-11,4,after being w o power all day it came back on...,"[w, power, day, came, back, right, bedtime, qu..."
1,1535,7,50022,2004-09-09,5,this is an excellent recipe even my not a big...,"[excellent, recipe, even, big, fan, banana, br..."
2,1535,7,78834,2005-02-02,5,made these for my kids one day as a surprise a...,"[made, kid, one, day, surprise, thought, reall..."
3,1535,7,47474,2004-08-04,5,love them a quick easy recipe with ingredi...,"[love, quick, easy, recipe, ingredient, always..."
4,1535,7,230720,2007-07-02,5,i was a little afraid this might not work very...,"[little, afraid, might, work, well, turned, lo..."


In [20]:
#//// Sanity check of differneces between lemmatizing and stemming
print(df_reviews2['review'][2])
print(clean_lemma(df_reviews2['review'][2]))
print()
print(df_reviews2['review'][2])
print(clean_stem(df_reviews2['review'][2]))

made these for my kids one day as a surprise and they thought they were really neat  being a perfectionist  i did have to make  legs but that s just me  an easy way to add some fun to lunch 
['made', 'kid', 'one', 'day', 'surprise', 'thought', 'really', 'neat', 'perfectionist', 'make', 'leg', 'easy', 'way', 'add', 'fun', 'lunch', '']

made these for my kids one day as a surprise and they thought they were really neat  being a perfectionist  i did have to make  legs but that s just me  an easy way to add some fun to lunch 
['made', 'kid', 'one', 'day', 'surpris', 'thought', 'realli', 'neat', 'perfectionist', 'make', 'leg', 'easi', 'way', 'add', 'fun', 'lunch', '']


In [28]:
#/////// TODO Create sentiment calculator function 
sid = SentimentIntensityAnalyzer()


# Sentiment compound value of reviews
def sentiment(x):
    score = sid.polarity_scores(x)
    return score['compound']


#  Add Positive y Negative tag 
def cls(x):
    list1=[]
    for i in x["score"]:
        if i>0:
            list1.append("Positive")
        elif i==0:
            list1.append("Neutral")
        else:
            list1.append("Negative")
    x["sentiment"]=list1



df_reviews2['score']= df_reviews2['review'].apply(lambda x : sentiment(x)) #score
cls(df_reviews2) # tag (for plotting)
df_reviews2.head() 


Unnamed: 0,user_id,community,recipe_id,date,rating,review,review_clean,score,sentiment
0,1535,7,349022,2010-02-11,4,after being w o power all day it came back on...,"[w, power, day, came, back, right, bedtime, qu...",0.8481,Positive
1,1535,7,50022,2004-09-09,5,this is an excellent recipe even my not a big...,"[excellent, recipe, even, big, fan, banana, br...",0.9707,Positive
2,1535,7,78834,2005-02-02,5,made these for my kids one day as a surprise a...,"[made, kid, one, day, surprise, thought, reall...",0.9166,Positive
3,1535,7,47474,2004-08-04,5,love them a quick easy recipe with ingredi...,"[love, quick, easy, recipe, ingredient, always...",0.6486,Positive
4,1535,7,230720,2007-07-02,5,i was a little afraid this might not work very...,"[little, afraid, might, work, well, turned, lo...",0.9722,Positive


In [25]:
# Sentiment compound value of reviews
def sentiment(x):
    score = sid.polarity_scores(x)
    return score['compound']

df_reviews2['score']= df_reviews2['review'].apply(lambda x : sentiment(x))
df_reviews2.head()

Unnamed: 0,user_id,community,recipe_id,date,rating,review,review_clean,score
0,1535,7,349022,2010-02-11,4,after being w o power all day it came back on...,"[w, power, day, came, back, right, bedtime, qu...",0.8481
1,1535,7,50022,2004-09-09,5,this is an excellent recipe even my not a big...,"[excellent, recipe, even, big, fan, banana, br...",0.9707
2,1535,7,78834,2005-02-02,5,made these for my kids one day as a surprise a...,"[made, kid, one, day, surprise, thought, reall...",0.9166
3,1535,7,47474,2004-08-04,5,love them a quick easy recipe with ingredi...,"[love, quick, easy, recipe, ingredient, always...",0.6486
4,1535,7,230720,2007-07-02,5,i was a little afraid this might not work very...,"[little, afraid, might, work, well, turned, lo...",0.9722


In [26]:
#  Add Positive y Negative tag 
def cls(x):
    list1=[]
    for i in x["score"]:
        if i>0:
            list1.append("Positive")
        elif i==0:
            list1.append("Neutral")
        else:
            list1.append("Negative")
    x["sentiment"]=list1

cls(df_reviews2)
df_reviews2.head()

Unnamed: 0,user_id,community,recipe_id,date,rating,review,review_clean,score,sentiment
0,1535,7,349022,2010-02-11,4,after being w o power all day it came back on...,"[w, power, day, came, back, right, bedtime, qu...",0.8481,Positive
1,1535,7,50022,2004-09-09,5,this is an excellent recipe even my not a big...,"[excellent, recipe, even, big, fan, banana, br...",0.9707,Positive
2,1535,7,78834,2005-02-02,5,made these for my kids one day as a surprise a...,"[made, kid, one, day, surprise, thought, reall...",0.9166,Positive
3,1535,7,47474,2004-08-04,5,love them a quick easy recipe with ingredi...,"[love, quick, easy, recipe, ingredient, always...",0.6486,Positive
4,1535,7,230720,2007-07-02,5,i was a little afraid this might not work very...,"[little, afraid, might, work, well, turned, lo...",0.9722,Positive


In [27]:
import plotly as ply
import plotly.graph_objs as go

Mno=df_reviews2[df_reviews2.sentiment=="Positive"]["score"].count()
Fno=df_reviews2[df_reviews2.sentiment=="Negative"]["score"].count()
Nno=df_reviews2[df_reviews2.sentiment=="Neutral"]["score"].count()
labels = ["Positive Comments","Negative Comments","Neutral Comments"]
values = [Mno,Fno,Nno]
fig = go.Figure(data=[go.Pie(labels=labels, values=values,hole=0.4)])
fig.show()

# Group by community 


In [43]:
#////// TODO create graph with average recipe rating and review text sentiment per community

df_mean_rating= pd.DataFrame(df_reviews2.groupby(by=["community"])['rating','score'].mean())
df_mean_rating


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,rating,score
community,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.472727,0.629917
1,4.432631,0.788757
2,4.312182,0.713220
3,4.044586,0.680864
4,4.575132,0.530127
...,...,...
65,4.400000,0.612933
66,2.142857,0.297614
67,5.000000,0.713533
68,3.880952,0.746650


In [44]:
# to access multiindex dataframes
#com0 = df_mean_rating.loc[0, :]
#com0.plot(kind='bar')