In [1]:
# Install the necessary libraries if they weren't already installed
!pip install pandas transformers scipy

# Import necessary libraries:
import pandas as pd
import numpy as np
from transformers import pipeline, DistilBertTokenizerFast

!python --version

# For Sentiment Analysis:
# prefix the command with ! to run it as a shell command in Jupyter Notebook
#!pip install transformers
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install tensorflow

# Install PyTorch:
#!pip install torch torchvision torchaudio transformers pandas --no-cache-dir
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
!pip install torch torchvision torchaudio --no-cache-dir


Python 3.10.12


# What is the sentiment of the reviews, sorted by language?

We selected the Amazon Reviews Multilingual Dataset: https://www.kaggle.com/datasets/mexwell/amazon-reviews-multi/data. It is a comprehensive collection of multilingual product reviews in CSV format. The dataset contains about 1.3 million samples in 6 languages (DE = German, EN = English, ES = Spanish, FR = French, JA = Japanese, ZH = Chinese) with the following features:

**review_id**:          A string identifier of the review.

**product_id**:         A string identifier of the product being reviewed.

**reviewer_id**:        A string identifier of the reviewer.

**stars**:              An int between 1-5 indicating the number of stars.

**review_body**:        The text body of the review.

**review_title**:       The text title of the review.

**language**:           The string identifier of the review language.

**product_category**:   String representation of the product's category.

The data will be directly downloaded and processed from the CSV files provided by the dataset. During preprocessing we will focus on removing stop words as well as choosing the correct embedding for the reviews.

#### Splitting:

The data was split into sentiment labels based on a star rating, where:

**1 (Positive): stars > 3**

**0 (Negative): stars <= 3**

## 4) Prepare Files

Split the big csv-file into smaller files according to the language:

#### es (Spanish)

In [2]:
# # filter df for Spanish reviews
# spanish_reviews = df[df['language'] == 'es']

# # save them to a new csv file
# output_file = 'reviews_es.csv'
# spanish_reviews.to_csv(output_file, index=False)

# print(f"Saved Spanish reviews to {output_file}")

## 5) Sentiment Analysis

##### BERT can only process inputs of up to 512 tokens, so longer reviews need to be truncated:

#### es (Spanish)

In [3]:
from transformers import AutoTokenizer, pipeline
import pandas as pd
from sklearn.metrics import accuracy_score  # For accuracy calculation

# Load the per-language CSV (expected to be the file written by the
# "Prepare Files" step).
file_path = 'reviews_es.csv'
df = pd.read_csv(file_path)

# Keep only the Spanish rows. The file is expected to contain nothing else, so
# this filter is just a cheap safety net. `.copy()` turns the filtered result
# into an independent DataFrame: the column assignments made later in this cell
# then write to `spanish_reviews` itself instead of to a slice of `df`, which
# is what triggers pandas' SettingWithCopyWarning.
spanish_reviews = df[df['language'] == 'es'].copy()

# Map a star rating onto the binary sentiment classes used in this notebook.
def sentiment_label(stars):
    """Return the binary sentiment label for a star rating.

    Args:
        stars: Numeric star rating of a review.

    Returns:
        1 (positive) when ``stars`` is greater than 3, otherwise 0 (negative).
    """
    if stars > 3:
        return 1
    return 0

# Derive the ground-truth binary label from the star rating. `assign` returns a
# new DataFrame, so the column is never written into a filtered slice of `df`
# (which would raise pandas' SettingWithCopyWarning).
spanish_reviews = spanish_reviews.assign(
    true_label=spanish_reviews['stars'].apply(sentiment_label)
)

# Hugging Face model id, spelled once so the tokenizer and the pipeline cannot
# drift apart.
MODEL_NAME = "nlptown/bert-base-multilingual-uncased-sentiment"

# init tokenizer (also used by truncate_review below)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# init sentiment analysis pipeline; the tokenizer loaded above is passed in so
# it is not loaded a second time and truncation uses exactly the tokenizer the
# model is run with
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    framework="pt",
)

# Clip a review so that it fits into the model's input window (BERT limitation).
def truncate_review(review, max_length=512):
    """Truncate ``review`` to at most ``max_length`` tokens.

    The text is encoded with the module-level ``tokenizer`` (truncation
    enabled) and the resulting token ids are decoded back to a string with
    special tokens skipped. Because the result is decoded from token ids, it
    is not guaranteed to be character-identical to the input text.

    Args:
        review: Review text. Any non-string value (e.g. a missing value) is
            treated as an empty review.
        max_length: Maximum number of tokens handed to the tokenizer.

    Returns:
        The truncated review text, or "" when ``review`` is not a string.
    """
    if isinstance(review, str):
        encoded = tokenizer(review, truncation=True, max_length=max_length, return_tensors='pt')
        # return_tensors='pt' yields a batch of one; take its only row
        token_ids = encoded['input_ids'][0]
        return tokenizer.decode(token_ids, skip_special_tokens=True)
    return ""

# Truncate every review for the model, but keep the result in a separate list
# instead of overwriting `review_body`: truncate_review returns text decoded
# from token ids, so writing it back would replace the raw review text in the
# CSV saved below.
truncated_reviews = spanish_reviews['review_body'].apply(truncate_review).tolist()

# sentiment analysis of the reviews (batched for speed). truncation/max_length
# are forwarded to the tokenizer as a safety net, in case re-tokenizing the
# decoded text yields more than 512 tokens.
sentiments = sentiment_pipeline(
    truncated_reviews,
    batch_size=32,
    truncation=True,
    max_length=512,
)

# The model's labels are star ratings ("1 star" ... "5 stars"). Parse the
# number and reuse sentiment_label so predictions and ground truth share one
# threshold (0 = negative, 1 = positive).
predicted_stars = [int(s['label'].split()[0]) for s in sentiments]
spanish_reviews = spanish_reviews.assign(
    predicted_label=[sentiment_label(stars) for stars in predicted_stars]
)

# accuracy of the binary predictions against the star-derived labels
accuracy = accuracy_score(spanish_reviews['true_label'], spanish_reviews['predicted_label'])
print(f"Accuracy: {accuracy * 100:.2f}%")

# save the results (original review text plus true and predicted labels)
output_file_path = 'spanish_reviews_with_sentiments_and_accuracy.csv'
spanish_reviews.to_csv(output_file_path, index=False)

print(f"Sentiment analysis complete and saved to '{output_file_path}'.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

Device set to use cuda:0


Accuracy: 88.17%
Sentiment analysis complete and saved to 'spanish_reviews_with_sentiments_and_accuracy.csv'.


## 6) Accuracy

In [4]:
# Read the saved predictions back from disk, so the check below is based on
# the persisted file rather than on the in-memory frame.
result_file_path = 'spanish_reviews_with_sentiments_and_accuracy.csv'
result_df = pd.read_csv(filepath_or_buffer=result_file_path)

# Recompute the accuracy over all rows of the saved file; it should match the
# value printed when the predictions were made.
overall_accuracy = accuracy_score(
    y_true=result_df['true_label'],
    y_pred=result_df['predicted_label'],
)

# Report the result as a percentage
print(f"Overall Average Accuracy: {overall_accuracy * 100:.2f}%")

Overall Average Accuracy: 88.17%
