# Data exploration

In [16]:
import os
# Silence TensorFlow's native (C++) logging. Levels are cumulative:
# 0 = show everything, 1 = hide INFO, 2 = also hide WARNING, 3 = also hide ERROR.
# NOTE(review): this only takes effect if set before TensorFlow is first imported,
# so keep it above the remaining imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # hide INFO, WARNING and ERROR messages
from dotenv import load_dotenv
load_dotenv()  # load variables from a local .env file into the process environment
# Hugging Face access token; None when HF_TOKEN is not defined.
hf_token = os.getenv("HF_TOKEN")
from datasets import load_dataset

### Load datasets

In [17]:
# Directory holding train.csv, test.csv and validation.csv (configured via .env).
dataset_path = os.getenv('DATASET_PATH')
if dataset_path is None:
    # Fail early with an actionable message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'" further down.
    raise ValueError(
        "DATASET_PATH is not set; define it in the environment or in the .env file"
    )

# Map each split name to its CSV file inside the dataset directory.
data_files = {
    split: f'{dataset_path}/{split}.csv'
    for split in ('train', 'test', 'validation')
}
amazon_db = load_dataset('csv', data_files=data_files)
amazon_db

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 1200000
    })
    test: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'review_id', 'product_id', 'reviewer_id', 'stars', 'review_body', 'review_title', 'language', 'product_category'],
        num_rows: 30000
    })
})

### Inspect a sample record

In [18]:
amazon_db['train'][0]

{'Unnamed: 0': 0,
 'review_id': 'de_0203609',
 'product_id': 'product_de_0865382',
 'reviewer_id': 'reviewer_de_0267719',
 'stars': 1,
 'review_body': 'Armband ist leider nach 1 Jahr kaputt gegangen',
 'review_title': 'Leider nach 1 Jahr kaputt',
 'language': 'de',
 'product_category': 'sports'}

### Data cleaning

In [19]:
# Keep only the star rating and the review text, renamed to "label" and "text";
# every other column is dropped.
amazon_db = (
    amazon_db
    .rename_column("stars", "label")
    .rename_column("review_body", "text")
    .remove_columns([
        "Unnamed: 0",  # index column carried over from the CSV export
        'review_id',
        'product_id',
        'reviewer_id',
        'review_title',
        'language',
        'product_category',
    ])
)

# Star ratings run from 1 to 5; shift them down by one to get labels 0 to 4.
def adjust_label(example):
    """Subtract 1 from ``example['label']`` and return the example.

    The example is modified in place; the returned object is the same one
    that was passed in.
    """
    example['label'] -= 1
    return example
# Run adjust_label over every example (one row at a time) to shift labels from 1-5 to 0-4.
amazon_db = amazon_db.map(adjust_label)

### Data filtering

In [None]:
# Randomly subsample each split for faster experimentation: shuffle with a fixed
# seed, then keep at most k training rows and at most k // 6 test/validation rows.
# Lower k below the full training-set size for a quick run.
k = 1200000
split_sizes = {'train': k, 'test': k // 6, 'validation': k // 6}
for split_name, wanted in split_sizes.items():
    split = amazon_db[split_name]
    # Cap at the split's actual length so select() never indexes past the end,
    # whatever value k is set to.
    n_rows = min(wanted, len(split))
    amazon_db[split_name] = split.shuffle(seed=42).select(range(n_rows))

# Count how many training reviews carry each star rating.
from collections import Counter
# Read the label column directly instead of iterating over whole rows: a row-wise
# loop also materialises every review text just to pick out one integer.
train_labels = list(amazon_db['train']['label'])
label_counts = Counter(train_labels)
print("Label distribution in training set:")
for label, count in sorted(label_counts.items()):
    # Labels were shifted to 0-4 during cleaning; add 1 back to report stars.
    print(f"Stars {label + 1}: {count} samples")

Label distribution in training set:
Stars 1: 199861 samples
Stars 2: 199772 samples
Stars 3: 200219 samples
Stars 4: 199874 samples
Stars 5: 200274 samples
