In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Processing the scraped data

In [None]:
# Load the complete scraped review dump (comma-separated, UTF-8).
full_df = pd.read_csv("./data/full_dataset.csv", sep=",", encoding="utf8")

# Show which columns the dump provides.
print(full_df.columns)

# Restrict the data to rows whose 'Language' value is exactly 'de'.
is_german = full_df['Language'].eq('de')
df_de = full_df.loc[is_german]

Index(['URL', 'Category', 'Title', 'Review', 'Rating', 'Language',
       'Review Number', 'Page Number', 'Review_Length'],
      dtype='object')


In [3]:
# Number of German reviews per star rating, before any cleaning is applied.
print(df_de['Rating'].value_counts())

Rating
5    207177
1     80995
4     27809
3      8352
2      7918
Name: count, dtype: int64


In [4]:
# Drop rows without any review text.
df_de = df_de.dropna(subset=['Review'])

# Keep a review only if it is non-blank after stripping whitespace and
# consists of at least three whitespace-separated tokens.
review_text = df_de['Review']
is_non_blank = review_text.str.strip().ne('')
has_three_tokens = review_text.str.split().str.len().ge(3)
df_de = df_de[is_non_blank & has_three_tokens]

Remove reviews that carry little information

In [5]:
# Generic feedback phrases that say nothing specific about the product or service.
low_info_phrases = [
    'gut', 'sehr gut', 'ok', 'super', 'toll', 'klasse', 'nicht schlecht', 
    'passt', 'in ordnung', 'zufrieden', 'alles gut', 'empfehlenswert', 
    'schnell', 'top', 'prima', 'geht so', 'naja', 'schlecht', 'geht', 
    'super service', 'alles bestens', 'hat gepasst'
]

# Lower-cased, stripped review text, held in a standalone Series.
# The frame itself is never written to, so no temporary helper column has to
# be added to (and later dropped from) a frame that came out of boolean
# filtering, which is the pattern that triggers SettingWithCopyWarning.
review_clean = df_de['Review'].str.lower().str.strip()

# Reviews that are exactly one of the low-info phrases.
is_exact_low_info = review_clean.isin(low_info_phrases)

# Reviews of at most three tokens that contain one of the phrases.
# NOTE(review): `p in x` is a substring test, so e.g. 'ok' or 'top' also match
# inside longer words; switch to token matching if that is not intended.
is_short_low_info = review_clean.apply(
    lambda x: any(p in x for p in low_info_phrases) and len(x.split()) <= 3
).astype(bool)

# Apply both filters at once; this keeps the same rows as applying them one
# after the other, and the columns of df_de are left untouched.
df_de = df_de[~(is_exact_low_info | is_short_low_info)]

In [6]:
# Number of reviews per star rating after the low-information filter.
print(df_de['Rating'].value_counts())

Rating
5    191381
1     80435
4     26248
3      8251
2      7875
Name: count, dtype: int64


Downsample the dataset

In [7]:
# Separate the classes
df_rating_5 = df_de[df_de['Rating'] == 5]
df_rating_4 = df_de[df_de['Rating'] == 4]
df_rating_3 = df_de[df_de['Rating'] == 3]
df_rating_2 = df_de[df_de['Rating'] == 2]
df_rating_1 = df_de[df_de['Rating'] == 1]

# Undersample labels 5 and 1 to 50,000
df_rating_5 = df_rating_5.sample(n=50000, random_state=42)
df_rating_1 = df_rating_1.sample(n=50000, random_state=42)

# Combine all into a new balanced DataFrame
df_downsampled_inbalanced = pd.concat([df_rating_5, df_rating_4, df_rating_3, df_rating_2, df_rating_1])

# Shuffle the dataset
df_downsampled_inbalanced = df_downsampled_inbalanced.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_downsampled_inbalanced['Rating'].value_counts())

Rating
1    50000
5    50000
4    26248
3     8251
2     7875
Name: count, dtype: int64


Add augmented data to main dataset

In [8]:
augmented_df = pd.read_csv('./data/augmented_reviews.csv',delimiter=",", encoding="utf8")
print(augmented_df.head())


def _unwrap_list_literal(text):
    """Return the inner string when `text` is the repr of a one-element list of str.

    Some augmented reviews are stored as a stringified Python list (they show
    up as "['...']" in the preview above). Left as is, the bracket and quote
    characters would become part of the review text. Any value that is not
    such a literal is returned unchanged.
    """
    import ast  # local import: only this clean-up step needs it

    if not isinstance(text, str):
        return text
    candidate = text.strip()
    if not (candidate.startswith("[") and candidate.endswith("]")):
        return text
    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal: treat it as ordinary review text.
        return text
    if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], str):
        return parsed[0]
    return text


# Strip the list wrapper from augmented reviews before they are merged.
augmented_df['Review'] = augmented_df['Review'].map(_unwrap_list_literal)

# Conform augmented_df to the columns (and column order) of
# df_downsampled_inbalanced; columns the augmented file lacks are filled with NaN.
augmented_df = augmented_df.reindex(columns=df_downsampled_inbalanced.columns)

# NOTE(review): the preview suggests the file also contains the unmodified
# source reviews next to their paraphrases. If so, those rows duplicate rows
# already present in df_downsampled_inbalanced and can end up on both sides of
# a later random train/validation/test split - confirm and deduplicate if needed.

# Append the augmented rows to the downsampled data.
df_downsampled_inbalanced_augmented = pd.concat([df_downsampled_inbalanced, augmented_df], ignore_index=True)


                                              Review  Rating
0  Alles wird auch die Post teurer... aber das Mo...       3
1  ['Alles wird teurer auch der Beitrag...aber di...       3
2  Hier weiß die linke Hand nicht, was die rechte...       2
3  ['Hier weiß die linke Hand nicht, was die rech...       2
4  Sehr umständlich, wenn man als Lehrling eine S...       3


In [9]:
# Number of reviews per star rating after the augmented rows were appended.
print(df_downsampled_inbalanced_augmented['Rating'].value_counts())

Rating
1    50000
5    50000
4    26248
3    24753
2    23625
Name: count, dtype: int64


Create the train/validation/test datasets

In [10]:

# Stage 1: hold out 20% of the rows, stratified by rating (fixed seed).
df_train, df_temp = train_test_split(
    df_downsampled_inbalanced_augmented,
    stratify=df_downsampled_inbalanced_augmented['Rating'],
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out part in half -> validation (10%) and test (10%).
df_val, df_test = train_test_split(
    df_temp,
    stratify=df_temp['Rating'],
    test_size=0.5,
    random_state=42,
)

# Report the size of each split.
print(
    f"Train shape: {df_train.shape}",
    f"Validation shape: {df_val.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

# Relative class frequencies per split, to confirm the stratification.
print("Train rating distribution:", df_train['Rating'].value_counts(normalize=True), sep="\n")
print("\nValidation rating distribution:", df_val['Rating'].value_counts(normalize=True), sep="\n")
print("\nTest rating distribution:", df_test['Rating'].value_counts(normalize=True), sep="\n")

Train shape: (139700, 9)
Validation shape: (17463, 9)
Test shape: (17463, 9)
Train rating distribution:
Rating
1    0.286328
5    0.286328
4    0.150308
3    0.141747
2    0.135290
Name: proportion, dtype: float64

Validation rating distribution:
Rating
1    0.286320
5    0.286320
4    0.150318
3    0.141785
2    0.135257
Name: proportion, dtype: float64

Test rating distribution:
Rating
1    0.286320
5    0.286320
4    0.150318
3    0.141728
2    0.135315
Name: proportion, dtype: float64


In [11]:
# Write each split of the imbalanced dataset to its own CSV file in the
# working directory, without the index column.
for _split_frame, _csv_name in (
    (df_train, "train_inbalanced.csv"),
    (df_val, "val_inbalanced.csv"),
    (df_test, "test_inbalanced.csv"),
):
    _split_frame.to_csv(_csv_name, index=False)

In [12]:
# Checkpoint identifier of the German BERT model used for tokenization.
model_name = "dbmdz/bert-base-german-cased"
# Tokenizer that belongs to the checkpoint; it is used by the encoding cells below.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Model loaded from the same checkpoint.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook - confirm whether loading it here is still needed.
model = AutoModel.from_pretrained(model_name)

In [13]:
# Tokenizer settings shared by all three splits:
#   padding=True        -> pad every sequence to the longest one in the same call
#   truncation=True     -> cut sequences that exceed the model's maximum length
#   return_tensors="pt" -> return PyTorch tensors
encode_options = dict(padding=True, truncation=True, return_tensors="pt")

# Each split is handed over as a single list of strings, so the padding length
# is set by the longest (possibly truncated) review of that whole split.
df_train_encoded = tokenizer(df_train['Review'].tolist(), **encode_options)
df_val_encoded = tokenizer(df_val['Review'].tolist(), **encode_options)
df_test_encoded = tokenizer(df_test['Review'].tolist(), **encode_options)

In [14]:
# Serialize the tokenized imbalanced splits, one file per split.
for _encoded_split, _pt_name in (
    (df_train_encoded, "train_inbalanced_tokenized.pt"),
    (df_val_encoded, "val_inbalanced_tokenized.pt"),
    (df_test_encoded, "test_inbalanced_tokenized.pt"),
):
    torch.save(_encoded_split, _pt_name)

In [15]:
# Balance the five rating classes by reducing every class to the size of the
# smallest one. The target is derived from the data instead of being
# hard-coded, so the cell keeps working when upstream filtering or
# augmentation changes the class sizes.
rating_column = df_downsampled_inbalanced_augmented['Rating']
target_per_class = int(rating_column.value_counts().min())


def _take_rating(rating):
    """Return the rows with the given rating, capped at `target_per_class`.

    A class larger than the target is randomly reduced with a fixed seed;
    a class that already has the target size (or fewer rows) is returned
    unchanged, in its existing row order.
    """
    class_rows = df_downsampled_inbalanced_augmented[rating_column == rating]
    if len(class_rows) > target_per_class:
        class_rows = class_rows.sample(n=target_per_class, random_state=42)
    return class_rows


# Separate the classes (same order as used for the concatenation below).
df_rating_5, df_rating_4, df_rating_3, df_rating_2, df_rating_1 = (
    _take_rating(rating) for rating in (5, 4, 3, 2, 1)
)

# Combine all into a new balanced DataFrame
df_downsampled_balanced_augmented = pd.concat([df_rating_5, df_rating_4, df_rating_3, df_rating_2, df_rating_1])

# Shuffle the dataset
df_downsampled_balanced_augmented = df_downsampled_balanced_augmented.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_downsampled_balanced_augmented['Rating'].value_counts())

Rating
1    23625
2    23625
5    23625
4    23625
3    23625
Name: count, dtype: int64


In [16]:
# Stage 1: hold out 20% of the balanced data, stratified by rating (fixed seed).
df_train, df_temp = train_test_split(
    df_downsampled_balanced_augmented,
    stratify=df_downsampled_balanced_augmented['Rating'],
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out part in half -> validation (10%) and test (10%).
df_val, df_test = train_test_split(
    df_temp,
    stratify=df_temp['Rating'],
    test_size=0.5,
    random_state=42,
)

# Report the size of each split.
print(
    f"Train shape: {df_train.shape}",
    f"Validation shape: {df_val.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

# Relative class frequencies per split, to confirm the stratification.
print("Train rating distribution:", df_train['Rating'].value_counts(normalize=True), sep="\n")
print("\nValidation rating distribution:", df_val['Rating'].value_counts(normalize=True), sep="\n")
print("\nTest rating distribution:", df_test['Rating'].value_counts(normalize=True), sep="\n")

Train shape: (94500, 9)
Validation shape: (11812, 9)
Test shape: (11813, 9)
Train rating distribution:
Rating
2    0.2
4    0.2
3    0.2
1    0.2
5    0.2
Name: proportion, dtype: float64

Validation rating distribution:
Rating
5    0.200051
2    0.200051
1    0.199966
4    0.199966
3    0.199966
Name: proportion, dtype: float64

Test rating distribution:
Rating
3    0.200034
4    0.200034
1    0.200034
2    0.199949
5    0.199949
Name: proportion, dtype: float64


In [17]:
# Write each split of the balanced five-class dataset to its own CSV file in
# the working directory, without the index column.
for _split_frame, _csv_name in (
    (df_train, "train_balanced_5_classes.csv"),
    (df_val, "val_balanced_5_classes.csv"),
    (df_test, "test_balanced_5_classes.csv"),
):
    _split_frame.to_csv(_csv_name, index=False)

In [18]:
# Tokenizer settings shared by all three splits:
#   padding=True        -> pad every sequence to the longest one in the same call
#   truncation=True     -> cut sequences that exceed the model's maximum length
#   return_tensors="pt" -> return PyTorch tensors
encode_options = dict(padding=True, truncation=True, return_tensors="pt")

# Each split is handed over as a single list of strings, so the padding length
# is set by the longest (possibly truncated) review of that whole split.
df_train_encoded = tokenizer(df_train['Review'].tolist(), **encode_options)
df_val_encoded = tokenizer(df_val['Review'].tolist(), **encode_options)
df_test_encoded = tokenizer(df_test['Review'].tolist(), **encode_options)

In [19]:
# Serialize the tokenized balanced splits, one file per split.
for _encoded_split, _pt_name in (
    (df_train_encoded, "train_balanced_tokenized_5_classes.pt"),
    (df_val_encoded, "val_balanced_tokenized_5_classes.pt"),
    (df_test_encoded, "test_balanced_tokenized_5_classes.pt"),
):
    torch.save(_encoded_split, _pt_name)