<a href="https://colab.research.google.com/github/mariabandeira/Projeto_Final_IIA/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Web scrapping Amazon reviews

In [None]:
# fontes:
# https://www.kaggle.com/code/bhardwajshivam121/web-scrapping-amazon
# https://www.kaggle.com/code/nikhilraj7700/web-scrapping-code-python-beautifulsoup
# perplexity

import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL da página da Amazon
url = "https://www.amazon.com.br/dp/B09SWTG9GF?ref=ods_erd_dpcc_ttl_k11_rc_nd_ucc"

# Realizar uma solicitação HTTP para a página
response = requests.get(url)

# Verificar se a solicitação foi bem-sucedida
if response.status_code == 200:
    # Parsear o conteúdo HTML da página com BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    names = (soup.find_all('span',class_='a-profile-name'))

    cust_name = []
    for i in range(0,len(names)):
      #get_text() removes all the tags and extract text
      cust_name.append(names[i].get_text())

    title = list(soup.find_all(class_='review-title-content'))

    review_title = []
    for i in range(0,len(title)):
      review_title.append(title[i].get_text())

    #to remove '\n'from list of comment heading
    review_title[:] = [review_titles.rstrip('\n')
                for review_titles in review_title] #remove \n

    rating = list(soup.find_all(class_ = 'review-rating'))

    rate = []
    for i in range(0,len(rating)):
      rate.append(rating[i].get_text())

    rate = []
    for i in range(0,len(rating)):
      rate.append(rating[i].get_text()[0:1])

    new_rate = rate[2:12]

    df=pd.DataFrame()
    df['Customer Name'] = cust_name
    df['Review Title'] = review_title
    df['Rating'] = new_rate
    df

    # Criar um dicionário para armazenar as informações das reviews
    reviews_data = []

    # Encontrar a seção com as reviews
    reviews_section = soup.find('div', {'id': 'reviews'})

    # Encontrar as reviews individuais
    reviews = soup.find_all('div', {'data-hook': 'review'})
    for item in reviews:
        review={'body':item.find('span', {'data-hook': 'review-body'}).text.strip()}
        reviews_data.append(review)

    '''
    # Iterar sobre as reviews e extrair as informações
    for review in reviews:
        # Encontrar o título da review
        title = review.find('span', {'data-hook': 'review-title'}).text

        # Encontrar a nota da review
        rating = review.find('i', {'data-hook': 'review-rating'}).text

        # Encontrar o texto da review
        text = review.find('span', {'data-hook': 'review-body'}).text

        # Armazenar as informações da review no dicionário
        reviews_data.append({
            'title': title,
            'rating': rating,
            'text': text
        })
    '''
    # Criar um DataFrame com as informações das reviews
    reviews_df = pd.DataFrame(reviews_data)

    # Salvar o DataFrame em um arquivo CSV
    reviews_df.to_csv('amazon_reviews.csv', index=False)

else:
    print(f"Erro ao acessar a página: {response.status_code}")

ValueError: Length of values (3) does not match length of index (8)

## Sentiment Analysis

In [None]:
!pip install nltk



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk

In [None]:
df = pd.read_csv()

In [None]:
df.head()

In [None]:
ax = df['Score'].value_counts().sort_index().plot(kind='bar', title='', figsize=(10,5))
ax.set_xlabel('')
plt.show()

### Basic NLTK

In [None]:
example = df['Text'][50]
print(example)

In [None]:
tokens = nltk.word_tokenize(example)

In [None]:
#token pos
tagged = nltk.pos_tag(tokens)

In [None]:
#
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint() #pretty print

### Vader Sentiment Scoring

In [None]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()

In [None]:
sia.polarity_scores('I am so happy!')

{'neg': 0.0, 'neu': 0.318, 'pos': 0.682, 'compound': 0.6468}

In [None]:
sia.polarity_scores('This is the worst thing ever')

{'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}

In [None]:
sia.polarity_scores(example)

In [None]:
# Run on the entire dataset
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
  text = row['Text']
  myid = row['Id']
  res[myid] = sia.polarity_scores(text)

In [None]:
from operator import index
vaders = pd.DataFrame(res).T
vaders.reset_index().rename(columns={'index': 'id'})
vaders = vaders.merge(df, how='left')

In [None]:
# Sentiment Score and metadata
vaders

In [None]:
ax = sns.barplot(data=vaders, x='Score', y='compound')
ax.set_title('Compound score by Amazon Star Review')
plt.show()

In [None]:
fig, axs = plt.subplotes(1, 3, figsize=(15, 5))
sns.barplot(data=vaders, x='Score', y='pos', ax=axs[0])
sns.barplot(data=vaders, x='Score', y='neu', ax=axs[1])
sns.barplot(data=vaders, x='Score', y='neg', ax=axs[2])
axs[0].set_title('Positive')
axs[1].set_title('Neutral')
axs[2].set_title('Negative')
plt.tight_layout()
plt.show()

### Pretrained Model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Run for Roberta Model
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

In [None]:
def polarity_scores_roberta(example):
  encoded_text = tokenizer(example, return_tensors='pt')
  output = model(**encoded_text)
  scores = output[0][0].detach().numpy()
  scores = softmax(scores)
  scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
  }

  return scores_dict

In [None]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
  try:
    text = row['Text']
    myid = row['Id']
    vader_result = sia.polarity_scores()
    vader_result_rename = {}
    for key, value in vader_result.items():
      vader_result_rename[f"vader_{key}"] = value
    roberta_result = polarity_scores_roberta(text)
    both = {**vader_result, **roberta_result}
    res[myid] = both
  except RuntimeError:
    print(f'Broke for id {myid}')

In [None]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'id'})
results_df = results_df.merge(df, how='left')

### Combine and Compare

In [None]:
sns.pairplot(data=results_df,
             vars=['vader_neg', 'vader_neu', 'vader_pos',
                  'roberta_neg', 'roberta_neu', 'roberta_pos'],
            hue='Score',
            palette='tab10')
plt.show()

In [None]:
results_df.query('Score == 1') \
    .sort_values('roberta_pos', ascending=False)['Text'].values[0]

In [None]:
results_df.query('Score == 5') \
    .sort_values('roberta_neg', ascending=False)['Text'].values[0]

## Extra: the transformers pipeline

In [1]:
from transformers import pipeline

sent_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [2]:
sent_pipeline('I love sentiment analysis!')

[{'label': 'POSITIVE', 'score': 0.9997853636741638}]

In [3]:
sent_pipeline('booo')

[{'label': 'NEGATIVE', 'score': 0.9936267137527466}]

In [5]:
multilingual_classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
multilingual_classifier("Custo benefício bom, mas por ter escolhido o de cor preto não gostei que a fonte de energia tenha vindo na cor branca, ficou um tanto destoante.")

[{'label': '3 stars', 'score': 0.5764768123626709}]

In [8]:
multilingual_classifier('Gostei da Echo Pop ...qualidade do material muito boa, recursos da Alexa ok...mas não espere um som potente como o da Echo dot por exemplo....mas pelo custo vale!')

[{'label': '4 stars', 'score': 0.5807279348373413}]

In [10]:
multilingual_classifier('Produto chegou corretamente, funcionando normal. Mas não gostei da qualidade do som, para quem gosta de ouvir bem os graves/baixo a echo pop é muito ruim nesse quesito. Tentei ajusta as configurações de áudio no app da alexa e tb usar um app de equalizador, mesmo assim eu particularmente não gostei. Vou devolver o produto.')

[{'label': '2 stars', 'score': 0.5304345488548279}]

## Using Textblob

In [None]:
!pip install -U textblob

In [None]:
!pip install deep-translator

In [28]:
from textblob import TextBlob
from deep_translator import GoogleTranslator

In [30]:
tradutor = GoogleTranslator(source= "pt", target= "en")

In [34]:
text = "Eu ODEIO essa música!"
traducao = tradutor.translate(text)
blob = TextBlob(traducao)

In [None]:
# criar uma coluna com as reviews traduzidas para o inglês

In [35]:
sentiment = blob.sentiment
print(sentiment)

Sentiment(polarity=-1.0, subjectivity=0.9)


In [36]:
polarity = blob.sentiment.polarity
subjectivity = blob.sentiment.subjectivity

print(polarity)
print(subjectivity)

-1.0
0.9


In [None]:
def sentences_tokenization(text):
  sentences = []
  blob = TextBlob(text)
  for sentence in blob.sentences:
    sentences.append(str(sentence))

  return sentences

In [None]:
df['sentences_pt'] = df['reviews'].apply(sentences_tokenization)

In [None]:
def sentiment_analysis(sentences):
  sentiments = []
  for sentence in sentences:
    blob = TextBlob(sentence)
    sentiment = blob.sentiment.polarity
    sentiments.append(sentiment)

  return sentiments

In [None]:
df['sentences_sentiments'] = df['sentences_en'].apply(sentiment_analysis)

In [None]:
df['positive_sentences'] = df.apply(lambda x: [frase for frase, sentimento in zip(x['sentences_pt'], x['sentences_sentiments']) if sentimento > 0], axis=1)

In [None]:
df['negative_sentences'] = df.apply(lambda x: [frase for frase, sentimento in zip(x['sentences_pt'], x['sentences_sentiments']) if sentimento < 0], axis=1)

## Fontes/Ideias

Análise de sentimentos em português utilizando Pytorch e Python

* [Parte 1](https://medium.com/data-hackers/an%C3%A1lise-de-sentimentos-em-portugu%C3%AAs-utilizando-pytorch-e-python-91a232165ec0)

* [Parte 2](https://medium.com/data-hackers/deploy-de-um-modelo-de-an%C3%A1lise-de-sentimentos-como-uma-rest-api-878c175ad24f)

---

[Construindo um analisador de sentimentos em python](https://acadianschool.com.br/construindo-um-analisador-de-sentimentos-em-python/)

---

[AIpp](https://github.com/TailUFPB/AIpp/tree/main)

---

[tradutor de texto em python](https://www.hashtagtreinamentos.com/tradutor-de-texto-em-python#:~:text=Utilizaremos%20a%20biblioteca%20Deep%2Dtranslator,podemos%20fazer%20no%20pr%C3%B3prio%20site.)