## Amazon Sentiment Analysis - Preprocessing
Read the bz2-compressed FastText-format train/test files and convert them to CSV.

In [1]:
# Import necessary libraries
import os
import pandas as pd
import bz2
from tqdm import tqdm

In [2]:
# Input and output folders, given relative to the notebook's working directory:
# RAW_DIR holds the compressed source archives, PROCESSED_DIR receives the CSV output.
RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"

In [3]:
# Create the output folder up front; exist_ok=True keeps this cell safe to re-run
os.makedirs(PROCESSED_DIR, exist_ok=True)

## Function to load FastText-format bz2 file
Format: `__label__1 This product is terrible`

In [4]:
def load_bz2_fasttext_file(filepath, max_lines=None):
    """Read a bz2-compressed FastText-format file into a DataFrame.

    Each input line is expected to look like ``__label__<int> <text>``.
    Lines that do not split into both a label token and some text
    (for example blank lines) are skipped.

    Args:
        filepath: Path to the ``.bz2`` file; it is opened in UTF-8 text mode.
        max_lines: Maximum number of raw input lines to read, counting
            skipped lines too. ``None`` (the default) reads the whole file;
            ``0`` reads nothing and yields an empty frame.

    Returns:
        A ``pandas.DataFrame`` with columns ``label`` (parsed with ``int``)
        and ``text``, one row per successfully split line.

    Raises:
        ValueError: If a label token is not an integer once the
            ``__label__`` marker has been removed.
    """
    texts, labels = [], []
    with bz2.open(filepath, mode='rt', encoding='utf-8') as f:
        for i, line in enumerate(tqdm(f, desc=f"Reading {os.path.basename(filepath)}")):
            # Compare against None explicitly: a plain truthiness test would
            # treat max_lines=0 as "no limit" and read the entire file.
            if max_lines is not None and i >= max_lines:
                break
            # Split only on the first space so the review text stays intact.
            parts = line.strip().split(' ', 1)
            if len(parts) == 2:
                label, text = parts
                labels.append(int(label.replace('__label__', '')))
                texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})

In [5]:
# Locate the compressed train/test archives inside the raw-data folder
train_path, test_path = (
    os.path.join(RAW_DIR, archive_name)
    for archive_name in ("train.ft.txt.bz2", "test.ft.txt.bz2")
)

In [6]:
# Full load of both splits; pass e.g. max_lines=100000 instead of None for a quicker trial run
df_train = load_bz2_fasttext_file(filepath=train_path, max_lines=None)
df_test = load_bz2_fasttext_file(filepath=test_path, max_lines=None)

Reading train.ft.txt.bz2: 3600000it [00:38, 92851.40it/s]
Reading test.ft.txt.bz2: 400000it [00:04, 86515.23it/s]


In [7]:
# Show the first five rows of each split as a quick check on the parsed labels and text
print("Train sample:\n", df_train.head(n=5))
print("Test sample:\n", df_test.head(n=5))

Train sample:
    label                                               text
0      2  Stuning even for the non-gamer: This sound tra...
1      2  The best soundtrack ever to anything.: I'm rea...
2      2  Amazing!: This soundtrack is my favorite music...
3      2  Excellent Soundtrack: I truly like this soundt...
4      2  Remember, Pull Your Jaw Off The Floor After He...
Test sample:
    label                                               text
0      2  Great CD: My lovely Pat has one of the GREAT v...
1      2  One of the best game music soundtracks - for a...
2      1  Batteries died within a year ...: I bought thi...
3      2  works fine, but Maha Energy is better: Check o...
4      2  Great for the non-audiophile: Reviewed quite a...


In [8]:
# Destination paths for the two CSV outputs under the processed-data folder
train_csv_path, test_csv_path = (
    os.path.join(PROCESSED_DIR, csv_name)
    for csv_name in ("train.csv", "test.csv")
)

In [9]:

# Write each split to its CSV file, leaving out the DataFrame index column
df_train.to_csv(path_or_buf=train_csv_path, index=False)
df_test.to_csv(path_or_buf=test_csv_path, index=False)

In [10]:

# Report the locations of the two CSV files written above
print(f"Saved processed files:\n - {train_csv_path}\n - {test_csv_path}")

Saved processed files:
 - ../data/processed\train.csv
 - ../data/processed\test.csv
