In [1]:
# Install the necessary libraries if they weren't already installed
!pip install pandas transformers scipy

# Import the necessary libraries:
import pandas as pd
import numpy as np
from transformers import pipeline, DistilBertTokenizerFast

!python --version

# For Sentiment Analysis:
# prefix the command with ! to run it as a shell command in Jupyter Notebook
#!pip install transformers
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install tensorflow

# Install PyTorch:
#!pip install torch torchvision torchaudio transformers pandas --no-cache-dir
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
#!pip install torch torchvision torchaudio --no-cache-dir
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
!pip install torch torchvision torchaudio --no-cache-dir

# for Japanese:
!pip install fugashi # handles Japanese tokenization
!pip install ipadic # provides a Japanese dictionary
!pip install unidic_lite # dictionary required by tokenizer for Japanese text handling

Python 3.10.12


# What is the sentiment of the reviews, sorted by language?

We selected the Amazon Reviews Multilingual Dataset as our dataset: https://www.kaggle.com/datasets/mexwell/amazon-reviews-multi/data. It is a comprehensive collection of multilingual product reviews in CSV format. The dataset contains about 1.3 million samples in 6 languages (DE = German, EN = English, ES = Spanish, FR = French, JA = Japanese, ZH = Chinese) with the following features:

**review_id**:          A string identifier of the review.

**product_id**:         A string identifier of the product being reviewed.

**reviewer_id**:        A string identifier of the reviewer.

**stars**:              An int between 1-5 indicating the number of stars.

**review_body**:        The text body of the review.

**review_title**:       The text title of the review.

**language**:           The string identifier of the review language.

**product_category**:   String representation of the product's category.

The data will be directly downloaded and processed from the CSV files provided by the dataset. During preprocessing we will focus on removing stop words as well as choosing the correct embedding for the reviews.

#### Splitting:

We want to use the star rating as the ground truth for sentiment, where:

Stars > 3 → Positive sentiment (1)

Stars ≤ 3 → Negative sentiment (0)

## 0) Load DataFrame:

In [2]:
# Read csv-file:
# file_path = 'train.csv'
# df = pd.read_csv(file_path) # dataframe

# columns: id, review_id, product_id, reviewer_id, stars, review_body, review_title, language, product_category

## 4) Prepare Files

Split the big csv-file into smaller files according to the language:

#### ja (Japanese)

In [3]:
# # filter df for Japanese reviews
# japanese_reviews = df[df['language'] == 'ja']

# # save them to a new csv file
# output_file = 'reviews_ja.csv'
# japanese_reviews.to_csv(output_file, index=False)

# print(f"Saved Japanese reviews to {output_file}")

## 5) Sentiment Analysis

##### BERT can only process inputs of up to 512 tokens, so longer reviews are truncated to that length:

#### ja (Japanese)

In [5]:
from transformers import AutoTokenizer, pipeline
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the per-language CSV file and keep only the Japanese rows.
file_path = 'reviews_ja.csv'
df = pd.read_csv(file_path)
# .copy() makes the filtered frame independent of `df`, so the columns added to it
# later in this cell do not trigger pandas' SettingWithCopyWarning.
japanese_reviews = df[df['language'] == 'ja'].copy()

def sentiment_label(stars):
    """Map a star rating to a binary sentiment label.

    Returns 1 (positive) when ``stars`` is greater than 3, otherwise 0 (negative).
    """
    if stars > 3:
        return 1
    return 0

import torch  # imported here because it is only needed to probe for a GPU below

# Hugging Face model id, used for both the tokenizer and the classifier
MODEL_NAME = "minutillamolinara/bert-japanese_finetuned-sentiment-analysis"

# ground truth: apply sentiment_label to the star rating of every review
japanese_reviews['true_label'] = japanese_reviews['stars'].apply(sentiment_label)

# init tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# init sentiment analysis pipeline for Japanese
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    framework="pt",
    truncation=True,  # cut inputs that exceed max_length instead of failing on them
    max_length=512,
    # first GPU when CUDA is available, otherwise CPU (-1); a hard-coded 0 fails without a GPU
    device=0 if torch.cuda.is_available() else -1,
)

# sentiment analysis on the review column
sentiments = sentiment_pipeline(japanese_reviews['review_body'].tolist(), batch_size=32)

# convert predicted labels to binary format (0 = negative, 1 = positive)
# NOTE(review): any label other than 'positive' (case-insensitive) is counted as negative;
# confirm the label names this model actually emits.
japanese_reviews['predicted_label'] = [1 if s['label'].lower() == 'positive' else 0 for s in sentiments]

# accuracy: share of reviews whose predicted label matches the star-derived label
accuracy = accuracy_score(japanese_reviews['true_label'], japanese_reviews['predicted_label'])
print(f"Accuracy: {accuracy * 100:.2f}%")

# save results to a csv file
japanese_reviews.to_csv('japanese_reviews_with_sentiments_and_accuracy.csv', index=False)

print("Sentiment analysis complete and saved to 'japanese_reviews_with_sentiments_and_accuracy.csv'.")


tokenizer_config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/236k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Device set to use cuda:0


Accuracy: 82.19%
Sentiment analysis complete and saved to 'japanese_reviews_with_sentiments_and_accuracy.csv'.


## 6) Accuracy

In [6]:
# Write the scored reviews to disk, then reload the file and recompute the accuracy
# from what was actually written.
result_file_path = 'japanese_reviews_with_sentiments_and_accuracy.csv'
japanese_reviews.to_csv(result_file_path, index=False)

print("Sentiment analysis complete and saved to 'japanese_reviews_with_sentiments_and_accuracy.csv'.")

result_df = pd.read_csv(result_file_path)

# accuracy over all rows of the reloaded file
overall_accuracy = accuracy_score(
    y_true=result_df['true_label'],
    y_pred=result_df['predicted_label'],
)
print(f"Overall Average Accuracy: {overall_accuracy * 100:.2f}%")


Sentiment analysis complete and saved to 'japanese_reviews_with_sentiments_and_accuracy.csv'.
Overall Average Accuracy: 82.19%
