In [11]:
from commons import ProductType, Sentiment
from typing import Sequence, Dict
import pandas as pd
from sklearn.model_selection import train_test_split
from amazon_scrapping import get_scrapped_reviews, filter_and_format_reviews



In [12]:
# Maps each sentiment name to the integer class id that is stored in the
# "label" column of the data frames built further down.
label_to_numeric: Dict[str, int] = {
    "POSITIVE": 0,
    "NEUTRAL": 1,
    "NEGATIVE": 2,
}

In [13]:
# Collect the formatted (review text, sentiment) pairs returned for every
# product type into one flat list.
reviews_with_labels: list[tuple[str, Sentiment]] = []
for product in ProductType:
    raw_reviews = get_scrapped_reviews(product_type=product, inout_folder="scrapped_data")
    formatted_reviews = filter_and_format_reviews(raw_reviews=raw_reviews, suppress_errors=True)
    reviews_with_labels += formatted_reviews

# Split the pairs into two parallel sequences: the review text, and the
# sentiment translated to its numeric class id via `label_to_numeric`.
reviews: Sequence[str] = [pair[0] for pair in reviews_with_labels]
labels: Sequence[int] = [label_to_numeric[pair[1]] for pair in reviews_with_labels]

In [14]:
# One single-column frame for the review texts and one for the numeric labels.
df_text: pd.DataFrame = pd.DataFrame(data={"text": reviews})
df_labels: pd.DataFrame = pd.DataFrame(data={"label": labels})

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_text,
    df_labels,
    test_size=0.25,
    random_state=100,
)

# Put text and label side by side again for each split.
df_train: pd.DataFrame = pd.concat([X_train, Y_train], axis="columns")
df_test: pd.DataFrame = pd.concat([X_test, Y_test], axis="columns")

# Class distribution: whole data set first, then training split, then test split.
print(df_labels.value_counts())
for split_frame in (df_train, df_test):
    print(split_frame["label"].value_counts())

# Persist both splits (without the index column) and preview the first ten training rows.
df_train.to_csv(path_or_buf="./csv_data/train_more_neutral.csv", index=False)
df_test.to_csv(path_or_buf="./csv_data/test_more_neutral.csv", index=False)
df_train.head(10)

label
0        19867
2         2206
1         1181
dtype: int64
0    14894
2     1655
1      891
Name: label, dtype: int64
0    4973
2     551
1     290
Name: label, dtype: int64


Unnamed: 0,text,label
22144,"Dieses Waschpulver ist sehr ergiebig, riecht g...",0
12287,Super dicht und hält das Getränk lange heiß! M...,0
1760,Habe 2 von diesen Bechern einen für Tee und de...,0
10780,Hält sehr lange.,0
7189,"Voll zufrieden. Er ist dicht, hält warm und ha...",0
21542,"Diese Pods, oder wie sie bei Spee heißen Caps ...",0
13334,Sehr gute Qualität. Hält die Wärme sehr gut u...,0
12673,Bisher immer dicht gewesen. Getränk bleibt ult...,0
17019,Macht das Geschirr sauber. Unterschiede zur Di...,0
2412,Tee/Kaffee bleibt stundenlang warm,0


In [None]:
# Load the train/test splits from their CSV files so the following cells
# can run from the files on disk.
df_train = pd.read_csv(filepath_or_buffer="./csv_data/train_more_neutral.csv")
df_test = pd.read_csv(filepath_or_buffer="./csv_data/test_more_neutral.csv")

In [None]:
def balance_by_label(
    df: pd.DataFrame,
    label_column: str = "label",
    random_state: int = 100,
) -> pd.DataFrame:
    """Downsample every class of ``df`` to the size of its rarest class.

    Args:
        df: Frame holding one row per sample and a column with the class label.
        label_column: Name of the column that holds the class label.
        random_state: Seed for the per-class row sampling, so that repeated
            runs pick the same rows. Defaults to the seed used for the
            train/test split in this notebook.

    Returns:
        A new frame with the same columns as ``df`` in which every class
        present in ``label_column`` has the same number of rows. Rows are
        grouped by class in ascending label order and keep their original
        index values. An empty frame is returned when ``df`` has no labelled
        rows.
    """
    class_sizes = df[label_column].value_counts()
    if class_sizes.empty:
        # Nothing to balance (no rows, or no non-null labels).
        return df.iloc[0:0].copy()

    smallest_class_size = int(class_sizes.min())
    # Drawing n rows without replacement is the seeded equivalent of
    # "shuffle the class, then keep the first n rows".
    return pd.concat(
        [
            class_rows.sample(n=smallest_class_size, random_state=random_state)
            for _, class_rows in df.groupby(label_column, sort=True)
        ]
    )


# Balance each split independently: every class is cut down to the size of
# the rarest class within that split.
df_test_balanced = balance_by_label(df_test)
df_train_balanced = balance_by_label(df_train)

# Sanity check: all classes should now report the same count.
print(df_test_balanced["label"].value_counts())
print(df_train_balanced["label"].value_counts())

df_train_balanced.to_csv("./csv_data/train_more_neutral_balanced.csv", index=False)
df_test_balanced.to_csv("./csv_data/test_more_neutral_balanced.csv", index=False)

In [None]:
# NOTE(review): this cell reads train.csv / test.csv, not the
# *_more_neutral*.csv files written by the earlier cells — confirm that
# this is the intended data set.
df_train = pd.read_csv(filepath_or_buffer="./csv_data/train.csv")
df_test = pd.read_csv(filepath_or_buffer="./csv_data/test.csv")

# Plain Python lists of texts (X*) and labels (Y*) for each split.
X = df_train["text"].tolist()
Y = df_train["label"].tolist()
X_test = df_test["text"].tolist()
Y_test = df_test["label"].tolist()