In [1]:
#!pip install transformers
#!pip install datasets
#!pip install evaluate

import pickle
from pathlib import Path
from typing import Sequence, Dict

import evaluate
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification

from consts import ProductType, Sentiment
from amazon_scrapping import get_scrapped_reviews, filter_and_format_reviews



In [2]:
# Integer class ids used as training targets, keyed by sentiment name.
# NOTE(review): presumably this id order should agree with the label mapping of the
# pretrained checkpoint that is fine-tuned later — TODO confirm against model.config.id2label.
label_to_numeric: Dict[str, int] = {
    "POSITIVE": 0,
    "NEUTRAL": 1,
    "NEGATIVE": 2,
}

In [6]:
# Collect (review text, sentiment) pairs for every product type from the scraped data folder.
reviews_with_labels: list[tuple[str, Sentiment]] = []
for product in ProductType:
    raw_reviews = get_scrapped_reviews(product_type=product, inout_folder="scrapped_data")
    reviews_with_labels += filter_and_format_reviews(raw_reviews=raw_reviews, suppress_errors=True)

# Split the pairs into parallel lists: review texts with double quotes stripped,
# and sentiments mapped to their numeric class ids.
reviews: Sequence[str] = [pair[0].replace("\"", "") for pair in reviews_with_labels]
labels: Sequence[int] = [label_to_numeric[pair[1]] for pair in reviews_with_labels]

In [7]:
# Wrap texts and labels in single-column frames so the split keeps rows aligned by index.
df_text: pd.DataFrame = pd.DataFrame({"text": reviews})
df_labels: pd.DataFrame = pd.DataFrame({"label": labels})

# 75/25 split with a fixed seed so the partition is reproducible.
# NOTE(review): the classes are imbalanced (see the value counts printed below); consider
# passing stratify=df_labels — left out here because it would change the existing split.
X_train, X_test, Y_train, Y_test = train_test_split(df_text, df_labels, test_size=0.25, random_state=100)

# Re-join text and label column-wise; both halves of each split share the same index.
df_train: pd.DataFrame = pd.concat([X_train, Y_train], axis=1)
df_test: pd.DataFrame = pd.concat([X_test, Y_test], axis=1)

# Class distribution per split.
print(df_train["label"].value_counts())
print(df_test["label"].value_counts())

# Persist both splits. The output directory is created first because to_csv does not
# create missing parent directories and would otherwise fail on a fresh checkout.
csv_dir = Path("./csv_data")
csv_dir.mkdir(parents=True, exist_ok=True)
df_train.to_csv(csv_dir / "train.csv", index=False)
df_test.to_csv(csv_dir / "test.csv", index=False)

# Preview the first ten training rows.
df_train.head(10)

0    9354
1    1895
2    1777
Name: label, dtype: int64
0    3124
1     635
2     584
Name: label, dtype: int64


Unnamed: 0,text,label
8855,Auch bei mir klemmt nach knapp 2 Monaten Benut...,2
1043,Nach langer Suche und ausprobieren endlich gef...,0
16857,"Mir sagt dieses Waschmittel, Lenor Colorwaschm...",1
10077,Der Bobicon Thermobecher Edelstahl 380ml - Kaf...,1
4579,Super Kaffeebecher für unterernährt und leicht...,2
3726,"sieht schick aus, ist auch praktisch zum Mitne...",0
13761,"habe seit 3 Wochen die Pads, leider waren bish...",1
9315,Optisch sehr schön und bis jetzt auch noch dic...,0
7670,Hält nicht dich und leider auch nicht warm.,2
16495,Der Preis war sehr gut. In hoher Dosierung wer...,0


In [32]:
# Reload the persisted splits from disk, then wrap each frame in a `Dataset`.
df_train, df_test = (pd.read_csv(path) for path in ("./csv_data/train.csv", "./csv_data/test.csv"))
dataset_train, dataset_test = (Dataset.from_pandas(frame) for frame in (df_train, df_test))

In [39]:
# Tokenizer belonging to the pretrained German sentiment checkpoint.
tokenizer: transformers.models.bert.tokenization_bert_fast.BertTokenizerFast = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")


def tokenize_function(data: Dict[str, list]) -> transformers.BatchEncoding:
    """Tokenize the "text" column of one batch of examples.

    This is applied through ``Dataset.map(..., batched=True)`` below, so ``data`` is a
    batch keyed by column name (not a whole ``Dataset``) and the return value is the
    tokenizer's encoding, whose fields ``map`` adds to the dataset as new columns.

    Args:
        data: Batch of examples; only the "text" entry is read.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and truncated.
    """
    return tokenizer(data["text"], padding="max_length", truncation=True)


# Add the tokenizer's output columns to both splits.
train_dataset_tokenized: Dataset = dataset_train.map(tokenize_function, batched=True)
test_dataset_tokenized: Dataset = dataset_test.map(tokenize_function, batched=True)

Downloading:   0%|          | 0.00/161 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [41]:
# Display the training split's summary (its feature names and number of rows).
dataset_train

Dataset({
    features: ['text', 'label'],
    num_rows: 13026
})

In [40]:
# Load the pretrained German sentiment checkpoint as a sequence-classification model.
checkpoint_name = "oliverguhr/german-sentiment-bert"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [46]:
# Load every metric once instead of on each evaluation pass. `evaluate.load` replaces the
# deprecated `datasets.load_metric`; `evaluate` is already imported at the top of the notebook.
_METRICS = {name: evaluate.load(name) for name in ("precision", "recall", "accuracy", "f1")}


def compute_metrics(eval_pred, average: str = "macro"):
    """Compute precision, recall, accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the ``Trainer``.
        average: Averaging mode for precision, recall and F1. Defaults to "macro":
            with exactly one label per example, micro-averaged precision, recall and
            F1 are all equal to accuracy, so they would add no information on top of
            the accuracy score. Pass "micro" to get the previous numbers.

    Returns:
        Dict with the keys "precision", "recall", "accuracy" and "f1".
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    scores = {}
    for name, metric in _METRICS.items():
        # Accuracy takes no averaging argument; the other three do.
        extra = {} if name == "accuracy" else {"average": average}
        scores[name] = metric.compute(predictions=predictions, references=labels, **extra)[name]
    return scores

In [43]:
# Training configuration: checkpoints go to "checkpoints", evaluation runs once per epoch,
# a single epoch is trained with one example per device per step.
training_args: TrainingArguments = TrainingArguments(
    output_dir="checkpoints",
    evaluation_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=1,
)

# Wire model, data splits and metric callback into the Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate the model on the eval_dataset the Trainer was constructed with and display the result.
trainer.evaluate()

In [47]:
# Run inference over the tokenized test split and report the shapes of the
# returned prediction array and label array.
predictions = trainer.predict(test_dataset_tokenized)
logits_shape = predictions.predictions.shape
label_ids_shape = predictions.label_ids.shape
print(logits_shape, label_ids_shape)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 4343
  Batch size = 8


KeyboardInterrupt: 

In [15]:
# Accuracy of the argmax class against the reference labels of the prediction run.
# Uses `evaluate.load` (imported at the top of the notebook) instead of the deprecated
# `datasets.load_metric`.
metric = evaluate.load("accuracy")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.7271435338702037}

In [40]:
# Spot-check a single test example: print its review text, then its numeric label.
idx = 16
sample = dataset_test[idx]
print(sample['text'])
print(sample['label'])

Ich mag die Tabs. Einfach im Gebrauch und machen das Geschirr in Zusammenarbeit mit einer Geschirrspülmaschine schön sauber.
2
