In [1]:
# Install the necessary libraries if they weren't already installed
!pip install pandas transformers scipy

# Install necessary libraries:
import pandas as pd
import numpy as np
from transformers import pipeline, DistilBertTokenizerFast

!python --version

# For Sentiment Analysis:
# prefix the command with ! to run it as a shell command in Jupyter Notebook
#!pip install transformers
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install tensorflow

# Install PyTorch:
#!pip install torch torchvision torchaudio transformers pandas --no-cache-dir
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install torch torchvision torchaudio --no-cache-dir
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
!pip install torch torchvision torchaudio --no-cache-dir


Python 3.10.12


# What is the sentiment of the reviews, sorted by language?

We selected the Amazon Reviews Multilingual Dataset as our dataset: https://www.kaggle.com/datasets/mexwell/amazon-reviews-multi/data. It is a comprehensive collection of multilingual product reviews in CSV format. The dataset contains about 1.3 million samples in 6 languages (DE = German, EN = English, ES = Spanish, FR = French, JA = Japanese, ZH = Chinese) with the following features:

**review_id**:          A string identifier of the review.

**product_id**:         A string identifier of the product being reviewed.

**reviewer_id**:        A string identifier of the reviewer.

**stars**:              An int between 1-5 indicating the number of stars.

**review_body**:        The text body of the review.

**review_title**:       The text title of the review.

**language**:           The string identifier of the review language.

**product_category**:   String representation of the product's category.

The data will be directly downloaded and processed from the CSV files provided by the dataset. During preprocessing we will focus on removing stop words as well as choosing the correct embedding for the reviews.

#### Splitting:

Each review is assigned a binary sentiment label based on its star rating, where:

**1 (Positive): stars > 3**

**0 (Negative): stars <= 3**

## 4) Prepare Files

Split the large CSV file into smaller files, one per language:

#### de (German)

In [2]:
# # filter df for German reviews
# german_reviews = df[df['language'] == 'de']

# # Save the German reviews to a new CSV file
# output_file = 'reviews_de.csv'
# german_reviews.to_csv(output_file, index=False)

# print(f"Saved German reviews to {output_file}")

## 5) Sentiment Analysis

##### BERT can only process inputs of up to 512 tokens, so longer reviews need to be truncated:

#### de (German)

In [4]:
from transformers import AutoTokenizer, pipeline
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the per-language CSV file
file_path = 'reviews_de.csv'
df = pd.read_csv(file_path)

# Keep only the German reviews. The explicit .copy() makes german_reviews an
# independent DataFrame, so the columns that are assigned to it further down
# (true_label, review_body, predicted_label) are not written into a slice of
# df, which is what triggers pandas' SettingWithCopyWarning.
german_reviews = df[df['language'] == 'de'].copy()

# Map a star rating to a binary sentiment label
def sentiment_label(stars):
    """Translate a star rating into a binary sentiment label.

    Args:
        stars: The star rating of a review.

    Returns:
        1 (positive) when the rating is greater than 3, otherwise 0 (negative).
    """
    if stars > 3:
        return 1
    return 0

# Derive the ground-truth label (1 = positive, 0 = negative) for every review
# from its star rating.
german_reviews['true_label'] = german_reviews['stars'].apply(sentiment_label)

# Checkpoint used for German sentiment classification. Alternatives that were
# tried earlier and then commented out: the English
# "distilbert-base-uncased-finetuned-sst-2-english" pipeline (with the
# "distilbert-base-uncased" tokenizer) and the "bert-base-german-cased" tokenizer.
MODEL_NAME = "oliverguhr/german-sentiment-bert"

# Load the tokenizer once and hand the same instance to the pipeline, so the
# review-truncation helper and the classifier tokenize identically and the
# tokenizer is not loaded a second time.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    framework="pt",
)

# Truncate a review to at most `max_length` tokens using the tokenizer (BERT limitation)
def truncate_review(review, max_length=512):
    """Return `review` shortened to what the tokenizer keeps within `max_length`.

    Args:
        review: The review text. Any non-string value (e.g. a float NaN) is
            treated as an empty review.
        max_length: Maximum length passed to the tokenizer when truncating.
            Defaults to 512.

    Returns:
        The text decoded from the truncated token ids with special tokens
        skipped, or "" when `review` is not a string.
    """
    if not isinstance(review, str):
        return ""
    # For a single string the tokenizer returns a flat list of ids, which
    # decode() accepts directly; no tensor conversion is needed just to turn
    # the ids back into text.
    token_ids = tokenizer(review, truncation=True, max_length=max_length)['input_ids']
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# Shorten every review so that it fits into the model input
german_reviews['review_body'] = german_reviews['review_body'].apply(truncate_review)

# Classify all reviews. truncation/max_length are forwarded to the pipeline's
# tokenizer as a safety net: decoding truncated token ids back to text and
# tokenizing that text again is not guaranteed to reproduce exactly the same
# tokens, so a review could otherwise still exceed the 512-token limit and
# make the model call fail.
sentiments = sentiment_pipeline(
    german_reviews['review_body'].tolist(),
    truncation=True,
    max_length=512,
)

# Convert predicted labels to binary format (1 = 'positive', 0 = any other label)
german_reviews['predicted_label'] = [1 if s['label'].lower() == 'positive' else 0 for s in sentiments]

# Share of reviews whose predicted label matches the star-derived label
accuracy = accuracy_score(german_reviews['true_label'], german_reviews['predicted_label'])
print(f"Accuracy: {accuracy * 100:.2f}%")

# Save the reviews together with true and predicted labels to a CSV file
german_reviews.to_csv('german_reviews_with_sentiments_and_accuracy.csv', index=False)

print("Sentiment analysis complete and saved to 'german_reviews_with_sentiments_and_accuracy.csv'.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/161 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Device set to use cuda:0


Accuracy: 81.56%
Sentiment analysis complete and saved to 'german_reviews_with_sentiments_and_accuracy.csv'.


## 6) Accuracy

In [5]:
# Sanity check: reload the predictions that were written to disk and recompute
# the accuracy from the stored label columns.
result_file_path = 'german_reviews_with_sentiments_and_accuracy.csv'
result_df = pd.read_csv(result_file_path)

# Fraction of reviews whose predicted label equals the true label
overall_accuracy = accuracy_score(
    y_true=result_df['true_label'],
    y_pred=result_df['predicted_label'],
)
print(f"Overall Average Accuracy: {overall_accuracy * 100:.2f}%")

Overall Average Accuracy: 81.56%
