In [44]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [45]:
# Initialize a tokenizer using a pre-trained BERT model for multilingual sentiment analysis
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [46]:
# Initialize a model for sequence classification using a pre-trained BERT model for multilingual sentiment analysis
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [47]:
# Define a text statement for sentiment analysis
statement = 'It is ok product, nothing special'

In [48]:
# Encode the input text statement into tokens using the tokenizer
# The return_tensors parameter specifies that the tokens should be returned as PyTorch tensors
tokens = tokenizer.encode(statement, return_tensors='pt')

In [49]:
tokens

tensor([[  101, 10197, 10127, 13563, 20058,   117, 20587, 11999,   102]])

In [50]:
# Decode the encoded tokens back into text using the tokenizer
decoded_text = tokenizer.decode(tokens[0])

In [51]:
# Use the model to perform sentiment analysis on the encoded tokens
# The model's output tensor is processed to obtain the predicted sentiment label
# The sentiment label is then converted to a numeric value and 1 is added to convert the position to score
sentiment = int(model(tokens)[0].detach().argmax(dim=1).item()) + 1

In [52]:
sentiment

3

In [53]:
# Read the CSV file 'Reviews.csv' and store it as a DataFrame
df = pd.read_csv('Reviews.csv')

In [54]:
# Sample 56845 random rows(10% of the data) from the DataFrame and reset the index
# The resulting DataFrame is copied to ensure independence from the original DataFrame
df = df.sample(56845).reset_index().copy()

In [55]:
# Display the dimensions of the DataFrame
print(df.shape)

(56845, 11)


In [56]:
# Extract the 'Text' column values from the DataFrame as an array
reviews = df.Text.values

In [57]:
# Display the dimensions of the 'reviews' array
print(reviews.shape)

(56845,)


In [58]:
def sentiment_analysis(review):
    # Encode the input review into tokens using the tokenizer
    tokens = tokenizer.encode(review, return_tensors='pt')
    
    # Use the model to perform sentiment analysis on the encoded tokens
    # The model's output tensor is processed to obtain the predicted sentiment label
    # The sentiment label is then converted to a numeric value by adding 1
    sentiment = int(model(tokens)[0].detach().argmax(dim=1).item()) + 1
    
    # Return the calculated sentiment value
    return sentiment

In [21]:
# Create an empty list to store predicted sentiments
prediction_lst = []

# Iterate through each review in the 'reviews' array
for review in tqdm(reviews):
    # Truncate the review if its length is more than 1000 characters
    if len(review) > 1000:
        review = review[:1000]
    
    # Remove any HTML-like formatting if present
    if '<br /><br />' in review:
        review = review.split('<br /><br />')[-1]
    
    # Perform sentiment analysis on the processed review and append the result to the list
    prediction_lst.append(sentiment_analysis(review))

  0%|          | 0/56845 [00:00<?, ?it/s]

In [59]:
# Convert the 'prediction_lst' list to a NumPy array
prediction_lst = np.array(prediction_lst)

In [60]:
# Create a DataFrame with the 'pred' column and calculate value counts
prediction_counts = pd.DataFrame(prediction_lst, columns=['pred']).value_counts()

In [61]:
prediction_counts

pred
5       26810
4       12921
3        6933
2        5533
1        4648
dtype: int64