In [2]:
import re
import string
import time
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import STOPWORDS, WordCloud

InteractiveShell.ast_node_interactivity =  'all'
nltk.download('stopwords')
nltk.download('punkt')

from deep_translator import GoogleTranslator
from textblob import TextBlob

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Transforming the reviews data by removing stopwords, using regular expressions module to accept only letters, 
# making all the words lower case for consistency and joining them into a 'comments' list.
# We'll call this function from tokenize_comments

def preprocess_comments(text):
    text = str(text)
    comments = []
    stop_words = set(stopwords.words('portuguese'))
    
    only_letters = re.sub(r'[^a-zA-ZÀ-ÿ]', " ", text)
    lower_case = only_letters.lower()
    filtered_result = ' '.join([l for l in lower_case.split() if l not in stop_words]) 
    comments.append(filtered_result)

    return comments     

In [4]:
# Tokenizing comments to make easier their preprocessing

def tokenize_comments(comment):
    comment = str(comment)    

    tokens = word_tokenize(comment, language= 'portuguese')
    clean_tokens = preprocess_comments(tokens)
    while("" in clean_tokens):
        clean_tokens.remove("")
    clean_tokens = ' '.join([str(elem) for elem in clean_tokens])
  
    return clean_tokens
  

In [5]:
# English translation for TextBlob

def translate(text):
    text = str(text)
    translated = GoogleTranslator(source='auto', target='en').translate(text)
    return translated

In [6]:
# Polarity is the measure of the overall combination of the positive and negative emotions in a sentence.
# For TextBlob, Polarity is the output that lies between [-1,1], where -1 refers to negative sentiment and +1 refers to positive sentiment

def getPolarity(text):
    text = str(text)
    analysis = TextBlob(text)
    if not pd.isna(text): 
        if text == 'boa' or text =='bom' or text =='recomendo':
            result = 0.4
            return result
        elif text != '':    
            result = analysis.sentiment.polarity
            return result           
    else: return 100

In [7]:
# We measure and assign the sentiment class using the polarity value

def getSentimentClass(polarity):   
    try: 
        if polarity >= 0.3 and polarity <=1:
            return 'positive'
        elif polarity >= 0 and polarity < 0.3:
            return 'neutral'
        elif polarity < 0:
            return 'negative'        
        else: return 0
    except: return 'neutral'

In [8]:
# If both of the two options available for NLP in this dataset (review title and review message) are NaN values,
# we will use the reviewScore to get the sentiment class

def reviewScore (score):
    if score >= 4:
        result = 'positive'
        return result
    elif score >= 3:
        result = 'neutral'
        return result
    elif score <= 2:
        result = 'negative'
        return result

In [130]:
# Function for dataframes

def sentimentAnalysis (row):
    if row['review_comment_message'] != np.NaN:
        if row['review_comment_message'] != '':
            token = tokenize_comments(row['review_comment_message'])
            trans = translate(token)
            polar = getPolarity(trans)
            clas = getSentimentClass(polar)
            #print("Class:", clas)
        return clas
    elif row['review_comment_title'] != np.NaN:
        if row['review_comment_title'] != '':
            token = tokenize_comments(row['review_comment_title'])
            trans = translate(token)
            polar = getPolarity(trans)
            clas = getSentimentClass(polar)
            #print("Class:", clas)
        return clas
    elif row['review_score'] != np.NaN:
        clas = reviewScore(row['review_score'])
        #print("Class:", clas)
        return clas
    else: return 'neutral'

#review_df['sentiment_class'] = review_df.apply(sentimentAnalysis, axis=1)
    


In [9]:
review_df = pd.read_csv('./data/olist_order_reviews_dataset.csv')
review_df.head(5)

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [131]:
review_df['sentiment_class'] = review_df.apply(sentimentAnalysis, axis=1)

Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: negative
Class: neutral
Class: neutral
Class: negative
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: positive
Class: neutral
Class: neutral
Class: negative
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: positive
Class: neutral
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: neutral
Class: positive
Class: neutral
Class: negative
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: neutral
Class: neutral
Class: neutral
Class: positive
Class: neutral
Class: positive
Class: positive
Class: neutral
Class: positive
Class: 

In [11]:
# Function for input

def getSentimentAnalysis (input):
    if input != np.NaN:
        if input != '':
            token = tokenize_comments(input)
            trans = translate(token)
            polar = getPolarity(trans)
            clas = getSentimentClass(polar)
            return clas

input = 'péssimo'
print(getSentimentAnalysis(input))

negative
