# B2W-Reviews01

More information: https://github.com/americanas-tech/b2w-reviews01/blob/main/README.md

## Configuration

In [None]:
# When the data lives in a Google Drive folder (Colab), uncomment the two
# lines below to mount the drive before setting `folder`.
# from google.colab import drive
# drive.mount('/content/drive')

# Location of the dataset; the empty string means the file is looked up
# relative to the current working directory.
folder = ''

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import nltk
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Download the NLTK stop-word lists and the Punkt tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Portuguese stop-word list.
# NOTE(review): `stopwords` is not referenced anywhere else in the visible
# part of this file - confirm it is still needed.
stopwords = nltk.corpus.stopwords.words('portuguese')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from collections import Counter
import string
import json
import math
from wordcloud import WordCloud

In [None]:
from gensim.models import LdaModel
from gensim.models import Phrases
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
from gensim.test.utils import datapath

In [None]:
# Notebook shell command: download the spaCy model `pt_core_news_lg`
# (`-q` keeps the output quiet) so it can be loaded by name.
!python -m spacy download pt_core_news_lg -q

In [None]:
import spacy

# Load the spaCy pipeline `pt_core_news_lg`.
# NOTE(review): `nlp` is not used anywhere else in the visible part of this
# file - confirm it is still needed before paying the load cost.
nlp = spacy.load("pt_core_news_lg")

In [None]:
import os

# Build the path with os.path.join so `folder` works with or without a
# trailing separator; plain string concatenation silently produced a wrong
# path for values such as '/content/drive/MyDrive/data'.
df = pd.read_csv(os.path.join(folder, 'B2W-Reviews01TextSentiments.csv'))

## Processing user reviews

### Topic Modelling


#### Functions

In [None]:
def process_corpus(docs, min_count=20, no_below=20, no_above=0.5):
  """Tokenize raw documents and build the gensim dictionary and corpus.

  Every document is lower-cased and split into runs of word characters.
  Purely numeric tokens and single-character tokens are dropped. Tokens
  containing '_' in the phrase-transformed version of a document (the
  detected bigrams) are appended to that document; the unigrams are kept.

  The caller's ``docs`` list is not modified: new token lists are built
  and returned instead of overwriting the input entries.

  Args:
    docs: Sequence of document strings.
    min_count: ``min_count`` forwarded to ``Phrases``.
    no_below: ``no_below`` forwarded to ``Dictionary.filter_extremes``.
    no_above: ``no_above`` forwarded to ``Dictionary.filter_extremes``.

  Returns:
    Tuple ``(dictionary, corpus, docs)``: the filtered ``Dictionary``, the
    bag-of-words representation of every document, and the token lists.
  """
  tokenizer = RegexpTokenizer(r'\w+')

  # Build fresh token lists so the input list keeps its original strings.
  tokenized = []
  for doc in docs:
    tokens = tokenizer.tokenize(doc.lower())
    tokenized.append(
      [token for token in tokens if not token.isnumeric() and len(token) > 1]
    )

  bigram = Phrases(tokenized, min_count=min_count)
  for tokens in tokenized:
    # Materialize the detected phrases first, then extend: the list being
    # transformed is never mutated while it is being read.
    phrases = [token for token in bigram[tokens] if '_' in token]
    tokens.extend(phrases)

  dictionary = Dictionary(tokenized)
  dictionary.filter_extremes(no_below=no_below, no_above=no_above)

  corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

  return dictionary, corpus, tokenized

In [None]:
def create_model(corpus, dictionary, num_topics, random_state=None):
  """Train an LDA model with ``num_topics`` topics on ``corpus``.

  Args:
    corpus: Bag-of-words corpus (as produced by ``Dictionary.doc2bow``).
    dictionary: gensim ``Dictionary`` the corpus was built with.
    num_topics: Number of topics to fit.
    random_state: Optional seed forwarded to ``LdaModel``. The default
      (``None``) keeps the previous unseeded behaviour; pass a value to
      make the fitted topics reproducible between runs.

  Returns:
    The trained ``LdaModel``.
  """
  # `id2token` is filled lazily by gensim; indexing the dictionary once
  # forces it to be populated before it is handed to the model.
  dictionary[0]
  return LdaModel(
    corpus=corpus,
    id2word=dictionary.id2token,
    num_topics=num_topics,
    random_state=random_state,
  )

In [None]:
def topics_cloud(topics):
  """Draw and show a single word cloud.

  Args:
    topics: Iterable of ``(weight, word)`` pairs; each word is sized by
      its weight.
  """
  frequencies = {}
  for weight, word in topics:
    frequencies[word] = weight

  cloud = WordCloud(collocations = False, background_color = 'white')
  cloud.generate_from_frequencies(frequencies=frequencies)

  plt.figure()
  plt.imshow(cloud, interpolation="bilinear")
  plt.axis("off")
  plt.show()

In [None]:
def plot_word_clouds(topics, rows, cols, i = 0, figsize=None, title='', file_name=None):
  """Draw one word cloud per topic on a ``rows`` x ``cols`` grid and show it.

  Args:
    topics: Sequence whose entries hold ``(weight, word)`` pairs at
      position 0.
    rows: Number of grid rows.
    cols: Number of grid columns.
    i: Index of the first topic to draw; also the number shown in each
      panel title. Panels past the last topic are left empty.
    figsize: Figure size forwarded to ``plt.subplots``.
    title: Text appended to the figure title.
    file_name: When given, the figure is also saved to this path and the
      clouds are rendered on a larger 2000x1000 canvas.
  """
  # squeeze=False always returns a 2-D array of axes, so every grid shape
  # (including a single row, column or cell) is walked the same way.
  figure, grid = plt.subplots(rows, cols, figsize=figsize, squeeze=False)

  total = len(topics)

  cloud_options = dict(collocations=False, background_color='white')
  if file_name:
    cloud_options.update(width=2000, height=1000)

  # `.flat` visits the panels row by row, left to right.
  for panel in grid.flat:
    panel.axis("off")

    if i >= total:
      continue

    frequencies = {word: weight for weight, word in topics[i][0]}
    cloud = WordCloud(**cloud_options)
    cloud.generate_from_frequencies(frequencies=frequencies)

    panel.imshow(cloud, interpolation="bilinear")
    panel.set_title("Tópico %d" % i)
    panel.axis("off")

    i += 1

  figure.suptitle(('Tópicos %s' % title).strip())

  if file_name:
    plt.savefig(file_name, dpi=600, bbox_inches='tight', pad_inches = 0)

  plt.show()

#### Fitting Model

In [None]:
# Keep only the reviews that have lemmas (as an independent copy) and turn
# them into the dictionary / bag-of-words corpus used by the topic models.
df = df.loc[df.review_lemmas.notna()].copy()
docs = df.review_lemmas.tolist()

dictionary, corpus, docs = process_corpus(docs)

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

In [None]:
# Fit one LDA model per candidate topic count (2, 4, ..., 20) and keep
# (topic count, model, coherence, perplexity) for each of them.
# NOTE(review): gensim documents `log_perplexity` as a per-word likelihood
# bound rather than the perplexity itself - confirm the "Perplexity" label.
models = []
for n in range(2, 21, 2):
  model = create_model(corpus, dictionary, n)
  coherence_model = CoherenceModel(
    model=model,
    corpus=corpus,
    coherence='c_v',
    texts=docs,
    dictionary=dictionary,
  )
  coherence = coherence_model.get_coherence()
  perplexity = model.log_perplexity(corpus)
  print('Average topic coherence: %.4f. Perplexity: %.4f. Number of topics: %d' % (coherence, perplexity, n))
  models.append((n, model, coherence, perplexity))

In [None]:
# Tabulate the topic count against both metrics (the model object is left out).
pd.DataFrame(
  [(entry[0], entry[2], entry[3]) for entry in models],
  columns=['Number of topics', 'Coherence', 'Perplexity'],
)

In [None]:
# Split the collected tuples into parallel series for plotting.
n = [entry[0] for entry in models]
c_y = [entry[2] for entry in models]
p_y = [entry[3] for entry in models]

In [None]:
# Plot both model-selection metrics side by side against the topic count.
fig, ax = plt.subplots(1,2, figsize=(10,3))

panels = (
  (ax[0], c_y, 'Coherence'),
  (ax[1], p_y, 'Perplexity'),
)
for panel, scores, heading in panels:
  panel.plot(n, scores)
  panel.set_title(heading)
  panel.set_xlabel('Number of Topics')
  panel.set_ylabel('Score')

plt.show()

In [None]:
# Pick the fifth candidate (index 4 of `models`) and unpack its entry, which
# is laid out as (number of topics, model, coherence, perplexity).
selected_model = models[4]
model, coherence, perplexity = selected_model[1:]
print('Average topic coherence: %.4f. Perplexity: %.4f. Number of topics: %d' % (coherence, perplexity, selected_model[0]))

In [None]:
# Topics of the selected model. The rest of this file reads each entry's
# position 0 as a list of (weight, word) pairs.
top_topics = model.top_topics(corpus)

In [None]:
# Draw every topic on a 2 x 5 grid and also save the figure to disk.
plot_word_clouds(
  top_topics,
  rows=2,
  cols=5,
  i=0,
  figsize=(15, 4),
  title='Geral',
  file_name='topicscloud.png',
)

### Sentiment Analysis

#### Functions

In [None]:
def word_sentiment(word, df, index=None):
  """Average sentiment scores of the reviews that contain ``word``.

  For each label ('Positive', 'Negative', 'Neutral') the scores of the
  matching reviews carrying that label are summed and divided by the total
  number of matching reviews (all labels together).

  Args:
    word: Text to look for in ``df.review_lemmas``.
    df: DataFrame with the columns ``review_lemmas``,
      ``review_sent_label`` and ``review_sent_score``.
    index: Optional list-like of row labels restricting the search.
      ``None`` means every row; an empty selection matches nothing.

  Returns:
    Tuple ``(positive, negative, neutral)``; ``(0, 0, 0)`` when no review
    matches.
  """
  # Compare against None instead of relying on truthiness: an empty list
  # used to fall back to the whole frame, and numpy arrays / pandas indexes
  # raised "truth value is ambiguous".
  frame = df.loc[index] if index is not None else df

  # NOTE(review): `review_lemmas` holds strings (process_corpus lower-cases
  # them), so this is a substring test - a short word also matches inside
  # longer words, and '_'-joined bigrams never match. Confirm this is the
  # intended matching rule.
  mask = [word in review for review in frame.review_lemmas]

  matches = sum(mask)
  if matches == 0:
    return 0, 0, 0

  matched = frame[mask]

  def average(label):
    scores = matched[matched.review_sent_label == label].review_sent_score
    return scores.sum() / matches

  return average('Positive'), average('Negative'), average('Neutral')

In [None]:
def topic_sentiment(words_sentiment, top_topics, sentiment='positive'):
  """Average the per-word sentiment over the words of each topic.

  Args:
    words_sentiment: Mapping ``word -> {'positive', 'negative', 'neutral'}``
      scores.
    top_topics: Sequence of topics; position 0 of each topic is a sequence
      of pairs whose second element is the word.
    sentiment: Key of the score to average, or ``'overall'`` for
      ``positive - negative``.

  Returns:
    One value per topic. Words missing from ``words_sentiment`` add nothing
    to the sum but still count in the divisor. A topic without words yields
    0 instead of raising ``ZeroDivisionError``.
  """
  topics_sentiments = []
  for topic in top_topics:
    words = topic[0]
    if not words:
      # Nothing to average for an empty topic.
      topics_sentiments.append(0)
      continue

    total = 0
    for pair in words:
      word = pair[1]
      if word not in words_sentiment:
        continue
      scores = words_sentiment[word]
      if sentiment == 'overall':
        total += scores['positive'] - scores['negative']
      else:
        total += scores[sentiment]
    topics_sentiments.append(total / len(words))

  return topics_sentiments

In [None]:
def word_qtd_sentiment(word, df, index=None):
  """Count the reviews containing ``word`` per sentiment label.

  Args:
    word: Text to look for in ``df.review_lemmas``.
    df: DataFrame with the columns ``review_lemmas`` and
      ``review_sent_label``.
    index: Optional list-like of row labels restricting the search.
      ``None`` means every row; an empty selection matches nothing.

  Returns:
    List ``[positive, negative, neutral]`` with the number of matching
    reviews carrying each label.
  """
  # Compare against None instead of relying on truthiness: an empty list
  # used to fall back to the whole frame, and numpy arrays / pandas indexes
  # raised "truth value is ambiguous".
  frame = df.loc[index] if index is not None else df

  # NOTE(review): `review_lemmas` holds strings (process_corpus lower-cases
  # them), so this is a substring test - confirm that is the intended
  # matching rule.
  mask = [word in review for review in frame.review_lemmas]
  if not any(mask):
    # Also covers an empty frame, where indexing with an empty list would
    # be treated as a column selection.
    return [0, 0, 0]

  counts = frame[mask].review_sent_label.value_counts()
  return [counts.get(label, 0) for label in ('Positive', 'Negative', 'Neutral')]

In [None]:
def topic_qtd_sentiment(words_sentiment, top_topics, sentiment='positive'):
  """Sum the per-word review counts over the words of each topic.

  Args:
    words_sentiment: Mapping ``word -> {'positive', 'negative', 'neutral'}``
      counts.
    top_topics: Sequence of topics; position 0 of each topic is a sequence
      of pairs whose second element is the word.
    sentiment: Key of the count to sum, or ``'overall'`` for the sum of all
      three counts.

  Returns:
    One total per topic; words missing from ``words_sentiment`` are skipped.
  """
  def word_total(counts):
    if sentiment == 'overall':
      return counts['positive'] + counts['negative'] + counts['neutral']
    return counts[sentiment]

  totals = []
  for topic in top_topics:
    known = (
      words_sentiment[pair[1]]
      for pair in topic[0]
      if pair[1] in words_sentiment
    )
    totals.append(sum(word_total(counts) for counts in known))

  return totals

#### Applying Sentiment Analysis

In [None]:
# Average sentiment of every word in the dictionary, over all reviews.
words_sentiment = {}
# `id2token` is filled lazily by gensim; indexing the dictionary once forces
# it to be populated before it is iterated.
dictionary[0]
# Only the tokens are needed, so iterate the values directly instead of
# unpacking items into `_` (which IPython uses for the last cell output).
for w in dictionary.id2token.values():
  pos, neg, neu = word_sentiment(w, df)
  words_sentiment[w] = { 'positive': pos, 'negative': neg, 'neutral': neu }

In [None]:
# Size the x axis from the data instead of assuming exactly 10 topics, so
# the chart keeps working when a model with another topic count is selected.
positive_by_topic = topic_sentiment(words_sentiment, top_topics, sentiment='positive')
plt.bar(range(len(positive_by_topic)), positive_by_topic)
plt.show()

In [None]:
# Size the x axis from the data instead of assuming exactly 10 topics, so
# the chart keeps working when a model with another topic count is selected.
negative_by_topic = topic_sentiment(words_sentiment, top_topics, sentiment='negative')
plt.bar(range(len(negative_by_topic)), negative_by_topic)
plt.show()

In [None]:
# Size the x axis from the data instead of assuming exactly 10 topics, so
# the chart keeps working when a model with another topic count is selected.
overall_by_topic = topic_sentiment(words_sentiment, top_topics, sentiment='overall')
plt.bar(range(len(overall_by_topic)), overall_by_topic)
plt.show()

In [None]:
# One row per topic with its overall (positive minus negative) sentiment.
pd.DataFrame({
  'Overall Sentiment': topic_sentiment(words_sentiment, top_topics, sentiment='overall'),
})

In [None]:
# Number of reviews per sentiment label for every word in the dictionary.
words_qtd_sentiment = {}
# `id2token` is filled lazily by gensim; indexing the dictionary once forces
# it to be populated before it is iterated.
dictionary[0]
# Only the tokens are needed, so iterate the values directly instead of
# unpacking items into `_` (which IPython uses for the last cell output).
for w in dictionary.id2token.values():
  pos, neg, neu = word_qtd_sentiment(w, df)
  words_qtd_sentiment[w] = { 'positive': pos, 'negative': neg, 'neutral': neu }

In [None]:
# Grouped bar chart: one group per topic, one bar per sentiment label.
width = 0.25
series = (
  ('positive', 'Positive', 'tab:green'),
  ('negative', 'Negative', 'tab:red'),
  ('neutral', 'Neutral', 'tab:blue'),
)
for multiplier, (sentiment, label, color) in enumerate(series):
  counts = topic_qtd_sentiment(words_qtd_sentiment, top_topics, sentiment=sentiment)
  # Each series is shifted by one bar width inside the group.
  offset = width * multiplier
  # Positions come from the data instead of a hard-coded 10 topics.
  plt.bar(np.arange(len(counts)) + offset, counts, width, label=label, color=[color])

plt.legend(loc='upper left', ncols=3)

plt.show()

In [None]:
# Review counts per topic, one column per sentiment label.
topic_qtd_sentiment_dict = {
  label: topic_qtd_sentiment(words_qtd_sentiment, top_topics, sentiment=key)
  for label, key in (
    ('Positive', 'positive'),
    ('Negative', 'negative'),
    ('Neutral', 'neutral'),
  )
}
pd.DataFrame(topic_qtd_sentiment_dict)

## End