In [6]:
import pandas as pd
import numpy as np
import os
import json
import gc
import re
from tqdm import tqdm
from google.colab import drive
from sklearn.model_selection import train_test_split

# Make Google Drive available and prepare the working directories.
# Any failure is reported and then re-raised so the cell stops here.
try:
    # Skip the mount call when the mount point is already present
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')

    DRIVE_PATH = "/content/drive/MyDrive/amazon_reviews_backup"
    SAMPLE_PATH = f"{DRIVE_PATH}/sampled_data_3percent"

    # Ensure both the backup directory and the sample sub-directory exist
    os.makedirs(DRIVE_PATH, exist_ok=True)
    os.makedirs(SAMPLE_PATH, exist_ok=True)

    print(f"\n{'=' * 50}")
    print("✓ Google Drive mounted successfully")
    print(f"✓ Backup directory: {DRIVE_PATH}")
    print(f"✓ Sample directory: {SAMPLE_PATH}")

    # Listing the backup directory doubles as an accessibility check
    parquet_files = [
        entry for entry in os.listdir(DRIVE_PATH)
        if entry.endswith('.parquet')
    ]
    print(f"Found {len(parquet_files)} parquet files to process")
    print(f"{'=' * 50}\n")

except Exception as exc:
    print(f"✗ Error mounting Google Drive: {exc}")
    raise

# Progress tracking functions
def save_sampling_progress(processed_files):
    """Persist the names of the source files sampled so far.

    The snapshot is written as JSON to ``sampling_progress.json`` inside
    SAMPLE_PATH, together with the number of entries.

    Args:
        processed_files: List of source file names already sampled.
    """
    snapshot = {
        'processed_files': processed_files,
        'total_processed': len(processed_files),
    }
    with open(f"{SAMPLE_PATH}/sampling_progress.json", 'w') as handle:
        json.dump(snapshot, handle)

def load_sampling_progress():
    """Return the stored sampling progress, or an empty record if none exists.

    Returns:
        dict: The parsed content of ``sampling_progress.json`` in SAMPLE_PATH
        when that file exists, otherwise a record with an empty
        ``processed_files`` list and ``total_processed`` of 0.
    """
    progress_file = f"{SAMPLE_PATH}/sampling_progress.json"

    # Nothing stored yet: hand back a fresh, empty progress record
    if not os.path.exists(progress_file):
        return {'processed_files': [], 'total_processed': 0}

    with open(progress_file, 'r') as handle:
        return json.load(handle)

def save_combination_progress(processed_files, total_rows):
    """Persist which sampled files have been merged and the merged row count.

    The snapshot is written as JSON to ``combination_progress.json`` inside
    SAMPLE_PATH.

    Args:
        processed_files: List of sampled file names already combined.
        total_rows: Row count of the combined dataset so far.
    """
    snapshot = {
        'processed_files': processed_files,
        'total_rows': total_rows,
    }
    with open(f"{SAMPLE_PATH}/combination_progress.json", 'w') as handle:
        json.dump(snapshot, handle)

def load_combination_progress():
    """Return the stored combination progress, or an empty record if none exists.

    Returns:
        dict: The parsed content of ``combination_progress.json`` in
        SAMPLE_PATH when that file exists, otherwise a record with an empty
        ``processed_files`` list and ``total_rows`` of 0.
    """
    progress_file = f"{SAMPLE_PATH}/combination_progress.json"

    # Nothing stored yet: hand back a fresh, empty progress record
    if not os.path.exists(progress_file):
        return {'processed_files': [], 'total_rows': 0}

    with open(progress_file, 'r') as handle:
        return json.load(handle)

def check_existing_splits():
    """Report existing raw splits and tell whether the full set is present.

    Prints the row count of every raw split file (train/val/test) found in
    SAMPLE_PATH. When only some of the three files exist, the missing ones
    are listed as well.

    Returns:
        bool: True only when all three raw split files exist, so that a
        caller may safely load every one of them. False when none or only
        some of them are present.
    """
    split_files = ['train_raw.parquet', 'val_raw.parquet', 'test_raw.parquet']
    existing_splits = [f for f in split_files if os.path.exists(f"{SAMPLE_PATH}/{f}")]
    if not existing_splits:
        return False

    print("\nFound existing splits:")
    for split in existing_splits:
        df = pd.read_parquet(f"{SAMPLE_PATH}/{split}")
        print(f"{split}: {len(df):,} reviews")
        # Release the frame before the next (possibly large) split is loaded
        del df

    # A partial set cannot be reused: callers load all three files together
    missing_splits = [f for f in split_files if f not in existing_splits]
    if missing_splits:
        print(f"Incomplete split set - missing: {', '.join(missing_splits)}")
        return False

    return True

def create_sample_dataset(sample_fraction=0.03, random_state=None):
    """Sample a fraction of every source parquet file, with resumable progress.

    Each ``.parquet`` file in DRIVE_PATH whose name does not contain
    ``sampled`` is read, a random sample of ``sample_fraction`` of its rows
    is drawn, and the sample is written under the same file name into
    SAMPLE_PATH. After every file the list of processed files is persisted
    through ``save_sampling_progress`` so an interrupted run can be resumed.

    The function is interactive: it asks whether to keep an existing
    combined file and, when earlier progress is found, whether to resume,
    start over or cancel.

    Args:
        sample_fraction: Fraction of rows to keep per file, between 0 and 1.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the sampling reproducible. ``None`` (default) keeps it random.

    Returns:
        - The combined DataFrame when an existing combined file is kept.
        - Otherwise the number of rows sampled during this call (an int;
          files skipped because they were already processed are not counted).
        - ``None`` when the user cancels or the processing loop fails.

    Raises:
        ValueError: If ``sample_fraction`` is outside the range [0, 1].
    """
    # Fail fast: an out-of-range fraction would otherwise surface as one
    # swallowed error per file inside the loop below.
    if not 0 <= sample_fraction <= 1:
        raise ValueError(
            f"sample_fraction must be between 0 and 1, got {sample_fraction!r}"
        )

    # Check if combined file already exists
    combined_file = f"{SAMPLE_PATH}/amazon_reviews_sampled_combined.parquet"
    if os.path.exists(combined_file):
        print(f"\nFound existing combined file: {combined_file}")
        response = input("Would you like to (K)eep it or create (N)ew sample? [K/N]: ").strip().lower()
        if response == 'k':
            return pd.read_parquet(combined_file)

    # Create sample directory if it doesn't exist
    os.makedirs(SAMPLE_PATH, exist_ok=True)

    # Get all source files
    files = sorted([f for f in os.listdir(DRIVE_PATH)
                   if f.endswith('.parquet') and 'sampled' not in f])

    # Load previous progress
    progress = load_sampling_progress()
    processed_files = progress['processed_files']

    if processed_files:
        print(f"\nFound previous progress: {len(processed_files)}/{len(files)} files processed")
        response = input("Would you like to (R)esume, (S)tart over, or (C)ancel? [R/S/C]: ").strip().lower()
        if response == 'c':
            return None
        elif response == 's':
            processed_files = []
            # Clear the sampled directory except progress file. Only regular
            # files are removed; os.remove would raise on a sub-directory.
            for f in os.listdir(SAMPLE_PATH):
                path = f"{SAMPLE_PATH}/{f}"
                if f != 'sampling_progress.json' and os.path.isfile(path):
                    os.remove(path)
            # Reset the stored progress right away: otherwise an interruption
            # before the first file completes would leave a progress file that
            # lists samples which were just deleted.
            save_sampling_progress(processed_files)

    remaining_count = len([f for f in files if f not in processed_files])
    print(f"\nFound {len(files)} category files")
    print(f"Will process {remaining_count} remaining files")
    total_sampled = 0

    try:
        for file in tqdm(files, desc="Sampling categories"):
            if file in processed_files:
                print(f"\nSkipping {file} - already processed")
                continue

            try:
                print(f"\nProcessing: {file}")

                # Read and sample the file
                df = pd.read_parquet(f'{DRIVE_PATH}/{file}')
                original_size = len(df)
                sample_size = int(original_size * sample_fraction)

                sampled_df = df.sample(n=sample_size, random_state=random_state)

                # Save sampled data
                output_file = f'{SAMPLE_PATH}/{file}'
                sampled_df.to_parquet(output_file)

                # Print stats
                file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
                print(f"Original reviews: {original_size:,}")
                print(f"Sampled reviews: {sample_size:,}")
                print(f"File size: {file_size_mb:.2f}MB")

                total_sampled += sample_size

                # Update progress only after the sample has been written
                processed_files.append(file)
                save_sampling_progress(processed_files)

                del df, sampled_df
                gc.collect()

            except Exception as e:
                # Best effort: a failing file is reported and left unprocessed
                # so that a later run can retry it.
                print(f"Error with {file}: {str(e)}")
                continue

        return total_sampled

    except Exception as e:
        print(f"\nError during processing: {str(e)}")
        print("Progress has been saved - you can resume later")
        return None

def combine_sampled_files():
    """Concatenate all per-category sample files into one combined parquet file.

    Every ``.parquet`` file in SAMPLE_PATH is merged, except the combined
    file itself and the train/val/test ``*_raw`` / ``*_processed`` files that
    later pipeline steps write into the same directory. After each file the
    combined dataset is written to disk and the progress is persisted
    through ``save_combination_progress`` so the run can be resumed.

    The function is interactive: it asks whether to keep an existing
    combined file and, when earlier progress is found, whether to resume,
    start over or cancel.

    Returns:
        The combined DataFrame, or ``None`` when the user cancels or an
        error occurs. ``None`` is also returned when there is nothing to
        combine.
    """
    # Check if combined file already exists
    combined_file = f"{SAMPLE_PATH}/amazon_reviews_sampled_combined.parquet"
    if os.path.exists(combined_file):
        print(f"\nFound existing combined file: {combined_file}")
        df = pd.read_parquet(combined_file)
        print(f"Contains {len(df):,} reviews")
        response = input("Would you like to (K)eep it or create (N)ew combined file? [K/N]: ").strip().lower()
        if response == 'k':
            return df
        # Not kept: drop the reference so the memory can be reclaimed
        del df

    # Files written by later pipeline steps live in the same directory and
    # must not be folded back into the combined dataset.
    derived_outputs = {
        f"{split}_{stage}.parquet"
        for split in ('train', 'val', 'test')
        for stage in ('raw', 'processed')
    }

    # Get all sampled files
    files = sorted([f for f in os.listdir(SAMPLE_PATH)
                   if f.endswith('.parquet')
                   and 'combined' not in f
                   and f not in derived_outputs])

    print(f"\nFound {len(files)} sampled files to combine")

    # Check for existing progress
    progress = load_combination_progress()
    processed_files = progress['processed_files']
    total_rows = progress['total_rows']

    # Progress without a combined file cannot be resumed: the rows of the
    # listed files are gone, so skipping them would silently lose data.
    if processed_files and not os.path.exists(combined_file):
        print("\nFound previous progress but no combined file - starting over")
        processed_files = []
        total_rows = 0

    if processed_files:
        print(f"\nFound previous progress: {len(processed_files)}/{len(files)} files combined")
        print(f"Current combined rows: {total_rows:,}")
        response = input("Would you like to (R)esume, (S)tart over, or (C)ancel? [R/S/C]: ").strip().lower()
        if response == 'c':
            return None
        elif response == 's':
            processed_files = []
            total_rows = 0
            if os.path.exists(combined_file):
                os.remove(combined_file)

    try:
        # If resuming, load existing combined file
        if processed_files and os.path.exists(combined_file):
            print("\nLoading existing combined file...")
            combined_df = pd.read_parquet(combined_file)
        else:
            combined_df = None

        # Process remaining files
        remaining_files = [f for f in files if f not in processed_files]
        print(f"\nProcessing {len(remaining_files)} remaining files...")

        for file in tqdm(remaining_files, desc="Combining files"):
            try:
                print(f"\nProcessing: {file}")

                # Read file
                df = pd.read_parquet(f"{SAMPLE_PATH}/{file}")
                print(f"File size: {len(df):,} reviews")

                # Combine
                if combined_df is None:
                    candidate_df = df
                else:
                    candidate_df = pd.concat([combined_df, df], ignore_index=True)
                print(f"Combined size: {len(candidate_df):,} reviews")

                # Write the combined file BEFORE recording the file as done:
                # if the write fails, the progress must not claim the file
                # is already part of the combined dataset.
                candidate_df.to_parquet(combined_file)
                combined_df = candidate_df
                total_rows = len(combined_df)
                print(f"Saved combined file: {total_rows:,} reviews")

                # Save progress
                processed_files.append(file)
                save_combination_progress(processed_files, total_rows)

                del df, candidate_df
                gc.collect()

            except Exception as e:
                print(f"Error processing {file}: {str(e)}")
                print("Progress saved - you can resume from here")
                return None

        return combined_df

    except Exception as e:
        print(f"\nError during combination: {str(e)}")
        print("Progress saved - you can resume later")
        return None

def create_data_splits(test_size=0.1, val_size=0.1, random_state=42):
    """Create stratified train/val/test splits from the combined dataset.

    When raw splits already exist the user is asked whether to keep them.
    Otherwise the combined parquet file in SAMPLE_PATH is split in two
    steps, both stratified on the ``rating`` column: first the test set is
    carved off, then the remainder is divided into train and validation.
    Each split is written to ``<name>_raw.parquet`` and read back to report
    the stored row count.

    Args:
        test_size: Fraction of the full dataset used for the test split.
        val_size: Fraction of the full dataset used for the validation split.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        A ``(train, val, test)`` tuple of DataFrames, or ``None`` when the
        combined dataset file does not exist.

    Raises:
        ValueError: If the fractions are not each in (0, 1) or do not leave
            room for a training set.
    """
    # Validate early: test_size == 1 would divide by zero below, and a sum
    # >= 1 would leave no rows for training.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size and val_size must each be in (0, 1) and sum to less than 1 "
            f"(got test_size={test_size!r}, val_size={val_size!r})"
        )

    split_paths = {
        name: f"{SAMPLE_PATH}/{name}_raw.parquet"
        for name in ('train', 'val', 'test')
    }

    # Check for existing splits
    if check_existing_splits():
        response = input("\nWould you like to (K)eep existing splits or create (N)ew ones? [K/N]: ").strip().lower()
        if response == 'k':
            # Only reuse the splits when all three files are really there;
            # loading a partial set would raise FileNotFoundError.
            missing = [name for name, path in split_paths.items()
                       if not os.path.exists(path)]
            if not missing:
                return (
                    pd.read_parquet(split_paths['train']),
                    pd.read_parquet(split_paths['val']),
                    pd.read_parquet(split_paths['test'])
                )
            print(f"Cannot keep existing splits - missing: {', '.join(missing)}")

    print("\nCreating new splits...")

    # Load combined dataset
    combined_file = f"{SAMPLE_PATH}/amazon_reviews_sampled_combined.parquet"
    if not os.path.exists(combined_file):
        print("Error: Combined dataset not found. Please run combine_sampled_files() first.")
        return None

    df = pd.read_parquet(combined_file)
    print(f"Initial dataset size: {len(df):,}")

    # First split
    train_val, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['rating']
    )
    print("\nAfter first split:")
    print(f"Train_val size: {len(train_val):,}")
    print(f"Test size: {len(test):,}")
    print(f"Total: {len(train_val) + len(test):,}")

    # Second split. val_size is expressed relative to the full dataset, so it
    # is rescaled to the share of the rows that remain after the test split.
    val_size_adjusted = val_size / (1 - test_size)
    train, val = train_test_split(
        train_val,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=train_val['rating']
    )
    print("\nAfter second split:")
    print(f"Train size: {len(train):,}")
    print(f"Val size: {len(val):,}")
    print(f"Test size: {len(test):,}")
    print(f"Total: {len(train) + len(val) + len(test):,}")

    # Save splits with size verification
    print("\nSaving splits...")
    for name, data in [('train', train), ('val', val), ('test', test)]:
        output_file = split_paths[name]
        before_save = len(data)
        data.to_parquet(output_file)
        # Read the file back so the reported count reflects what is on disk
        after_save = len(pd.read_parquet(output_file))
        print(f"{name}: {before_save:,} → {after_save:,} reviews")

    return train, val, test

def verify_splits():
    """Check that the three raw split files exist, load, and have the needed columns.

    For each of train/val/test the matching ``<name>_raw.parquet`` file in
    SAMPLE_PATH is read, its row count is printed, and the presence of the
    ``text``, ``rating`` and ``category`` columns is verified. Problems are
    printed as they are found.

    Returns:
        bool: True when every split was found, could be read, and contains
        all required columns; False otherwise.
    """
    print("\nVerifying splits...")

    required_columns = {'text', 'rating', 'category'}
    all_valid = True
    total_reviews = 0

    for split_name in ('train', 'val', 'test'):
        path = f"{SAMPLE_PATH}/{split_name}_raw.parquet"

        # Guard: a missing file is reported and the next split is checked
        if not os.path.exists(path):
            print(f"✗ Missing {split_name} split")
            all_valid = False
            continue

        try:
            frame = pd.read_parquet(path)
            row_count = len(frame)
            total_reviews += row_count
            print(f"✓ {split_name}: {row_count:,} reviews")

            # Raw data only needs the required columns to be present
            absent_columns = required_columns - set(frame.columns)
            if absent_columns:
                print(f"✗ {split_name}: Missing columns: {absent_columns}")
                all_valid = False

        except Exception as exc:
            print(f"✗ Error reading {split_name}: {exc}")
            all_valid = False

    if all_valid:
        print("\n✓ All splits verified successfully")
        print(f"Total reviews across splits: {total_reviews:,}")

    return all_valid

def preprocess_dataset():
    """Clean the raw train/val/test splits and write processed versions.

    For every split whose ``<name>_raw.parquet`` file exists in SAMPLE_PATH:

    * rows with a rating that is not greater than 0, with null text, or
      with text of four characters or fewer are removed;
    * duplicate (text, rating, category) rows are dropped;
    * the category value ``Unknown`` is replaced by ``[UNKNOWN]``;
    * a ``processed_text`` column is added (HTML tags removed, URLs and
      e-mail addresses replaced by placeholders, whitespace normalised) and
      rows whose processed text is empty are removed;
    * a ``sentiment`` column is derived from the rating
      (1-2 Negative, 3 Neutral, 4-5 Positive).

    The result is saved as ``<name>_processed.parquet``. If all three
    processed files already exist the user is asked whether to keep them.

    Note: ``Series.progress_apply`` is used, so ``tqdm.pandas()`` must have
    been called before this function runs.

    Returns:
        dict: Maps split name (``train``/``val``/``test``) to its processed
        DataFrame. Splits whose raw file is missing are absent from the dict.
    """

    def check_existing_processed_splits():
        """Check if processed splits already exist"""
        files = ['train_processed.parquet', 'val_processed.parquet', 'test_processed.parquet']
        existing = all(os.path.exists(f"{SAMPLE_PATH}/{f}") for f in files)
        if existing:
            print("\nFound existing processed splits:")
            for f in files:
                size = len(pd.read_parquet(f"{SAMPLE_PATH}/{f}"))
                print(f"{f}: {size:,} reviews")
        return existing

    if check_existing_processed_splits():
        response = input("\nWould you like to (K)eep existing processed splits or create (N)ew ones? [K/N]: ").strip().lower()
        if response == 'k':
            return {
                'train': pd.read_parquet(f"{SAMPLE_PATH}/train_processed.parquet"),
                'val': pd.read_parquet(f"{SAMPLE_PATH}/val_processed.parquet"),
                'test': pd.read_parquet(f"{SAMPLE_PATH}/test_processed.parquet")
            }

    # Compile the patterns once; preprocess_text runs once per review
    html_tag_re = re.compile(r'<[^>]+>')
    url_re = re.compile(r'http[s]?://\S+')
    email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
    whitespace_re = re.compile(r'\s+')

    def preprocess_text(text):
        """Preprocess review text"""
        # Convert to string
        text = str(text)

        # Remove HTML
        text = html_tag_re.sub(' ', text)

        # Replace URLs and emails (URLs first, so an address inside a URL
        # is covered by the URL placeholder)
        text = url_re.sub('[URL]', text)
        text = email_re.sub('[EMAIL]', text)

        # Normalize whitespace
        text = whitespace_re.sub(' ', text)
        return text.strip()

    sentiment_by_rating = {
        1: 'Negative',
        2: 'Negative',
        3: 'Neutral',
        4: 'Positive',
        5: 'Positive'
    }

    # Process each split
    splits = ['train', 'val', 'test']
    processed_splits = {}
    total_initial = 0
    total_final = 0

    for split_name in splits:
        print(f"\nLoading {split_name} split...")
        raw_file = f"{SAMPLE_PATH}/{split_name}_raw.parquet"

        if not os.path.exists(raw_file):
            print(f"Error: {raw_file} not found!")
            continue

        df = pd.read_parquet(raw_file)
        total_initial += len(df)

        print(f"Processing {split_name} split...")
        print(f"Initial size: {len(df):,}")

        # Remove invalid data. The rating filter is applied to every split:
        # it is a no-op where all ratings are valid, and it keeps rows that
        # cannot be mapped to a sentiment out of val/test as well.
        df = df[df['rating'] > 0]
        df = df[df['text'].notna()]  # Remove null text
        df = df[df['text'].str.len() > 4]  # Remove very short text

        # Remove duplicates. The explicit copy detaches the result from the
        # filtered frames above so the column assignments below operate on
        # an independent frame (no chained-assignment warnings).
        df = df.drop_duplicates(subset=['text', 'rating', 'category'], keep='first').copy()

        # Add unknown category token
        df.loc[df['category'] == 'Unknown', 'category'] = '[UNKNOWN]'

        # Preprocess text
        print("Preprocessing text...")
        df['processed_text'] = df['text'].progress_apply(preprocess_text)

        # Add sentiment labels (depends only on the rating, so it is set
        # before the final row filter to avoid assigning on a slice)
        df['sentiment'] = df['rating'].map(sentiment_by_rating)

        # Remove empty text after preprocessing
        df = df[df['processed_text'].str.len() > 0]

        print(f"Final size: {len(df):,}")
        print("Final rating distribution:")
        print(df['rating'].value_counts(normalize=True).sort_index())
        print("\nSentiment distribution:")
        print(df['sentiment'].value_counts(normalize=True))

        # Save processed split
        output_file = f"{SAMPLE_PATH}/{split_name}_processed.parquet"
        df.to_parquet(output_file)
        print(f"Saved to: {output_file}")

        processed_splits[split_name] = df
        total_final += len(df)

        # Only drops the local name; the frame stays alive in processed_splits
        del df
        gc.collect()

    # Print final statistics
    total_removed = total_initial - total_final
    # Guard against division by zero when no raw split could be loaded
    removed_pct = total_removed / total_initial * 100 if total_initial else 0.0
    print("\nFinal Statistics:")
    print(f"Total reviews before preprocessing: {total_initial:,}")
    print(f"Total reviews after preprocessing: {total_final:,}")
    print(f"Total reviews removed: {total_removed:,} ({removed_pct:.1f}%)")

    return processed_splits

# Enable tqdm for pandas (provides Series.progress_apply used in preprocessing)
tqdm.pandas()

# Run the pipeline: sample -> combine -> split -> verify -> preprocess.
# Each step only runs when the previous one produced a result; every early
# stop is reported so the user can see where the pipeline ended.
if __name__ == "__main__":
    if input("\nRun data preparation pipeline? (yes/no): ").strip().lower() in ('yes', 'y'):
        try:
            # Step 1: Create sample dataset
            print("\nStep 1: Creating sample dataset...")
            total_sampled = create_sample_dataset()

            if total_sampled is None:
                print("\nPipeline stopped: sampling was cancelled or failed")
            else:
                # Step 2: Combine sampled files
                print("\nStep 2: Combining sampled files...")
                combined_df = combine_sampled_files()

                if combined_df is None:
                    print("\nPipeline stopped: combining was cancelled or failed")
                else:
                    # Step 3: Create splits
                    print("\nStep 3: Creating train/val/test splits...")
                    split_result = create_data_splits()

                    # create_data_splits returns None when the combined file
                    # is missing; unpacking None would raise a TypeError.
                    if split_result is None:
                        print("\nPipeline stopped: splits could not be created")
                    else:
                        train, val, test = split_result

                        # Step 4: Verify splits
                        if not verify_splits():
                            print("\n⚠️ Warning: Some splits may be incomplete or corrupted")
                            print("Please check the issues above and resolve them before proceeding")
                        else:
                            # Step 5: Preprocess splits
                            print("\nStep 5: Preprocessing splits...")
                            processed_splits = preprocess_dataset()

                            # preprocess_dataset omits splits whose raw file
                            # was missing, so only claim success for a full set.
                            if len(processed_splits) == 3:
                                print("\n🎉 Data preparation completed successfully!")
                            else:
                                print("\n⚠️ Warning: Not all splits were preprocessed")
                                print("Please check the issues above and resolve them before proceeding")

        except KeyboardInterrupt:
            print("\n\nProcess interrupted by user")
        except Exception as e:
            print(f"\n\nError during processing: {str(e)}")
        finally:
            print("\nCleaning up memory...")
            gc.collect()


✓ Google Drive mounted successfully
✓ Backup directory: /content/drive/MyDrive/amazon_reviews_backup
✓ Sample directory: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent
Found 34 parquet files to process


Run data preparation pipeline? (yes/no): yes

Step 1: Creating sample dataset...

Found existing combined file: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent/amazon_reviews_sampled_combined.parquet
Would you like to (K)eep it or create (N)ew sample? [K/N]: K

Step 2: Combining sampled files...

Found existing combined file: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent/amazon_reviews_sampled_combined.parquet
Contains 16,681,866 reviews
Would you like to (K)eep it or create (N)ew combined file? [K/N]: K

Step 3: Creating train/val/test splits...

Found existing splits:
train_raw.parquet: 13,345,492 reviews
val_raw.parquet: 1,668,187 reviews
test_raw.parquet: 1,668,187 reviews

Would you like to (K)eep existing splits or crea

100%|██████████| 11896753/11896753 [06:27<00:00, 30678.75it/s]


Final size: 11,896,753
Final rating distribution:
rating
1.0    0.114896
2.0    0.049886
3.0    0.070137
4.0    0.124744
5.0    0.640337
Name: proportion, dtype: float64

Sentiment distribution:
sentiment
Positive    0.765081
Negative    0.164782
Neutral     0.070137
Name: proportion, dtype: float64
Saved to: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent/train_processed.parquet

Loading val split...
Processing val split...
Initial size: 1,668,187
Preprocessing text...


100%|██████████| 1573506/1573506 [00:52<00:00, 29743.73it/s]


Final size: 1,573,506
Final rating distribution:
rating
1.0    0.110005
2.0    0.050611
3.0    0.071405
4.0    0.127973
5.0    0.640005
Name: proportion, dtype: float64

Sentiment distribution:
sentiment
Positive    0.767979
Negative    0.160616
Neutral     0.071405
Name: proportion, dtype: float64
Saved to: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent/val_processed.parquet

Loading test split...
Processing test split...
Initial size: 1,668,187
Preprocessing text...


100%|██████████| 1508019/1508019 [00:48<00:00, 30801.46it/s]


Final size: 1,508,019
Final rating distribution:
rating
1.0    0.104980
2.0    0.052796
3.0    0.074464
4.0    0.133478
5.0    0.634282
Name: proportion, dtype: float64

Sentiment distribution:
sentiment
Positive    0.767760
Negative    0.157776
Neutral     0.074464
Name: proportion, dtype: float64
Saved to: /content/drive/MyDrive/amazon_reviews_backup/sampled_data_3percent/test_processed.parquet

Final Statistics:
Total reviews before preprocessing: 16,681,866
Total reviews after preprocessing: 14,978,278
Total reviews removed: 1,703,588 (10.2%)

🎉 Data preparation completed successfully!

Cleaning up memory...
