In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from data_preprocessing import NusaXSentimentDataProcessor


In [11]:
def analyze_data(x_train, y_train, x_val, y_val, x_test, y_test, data_processor):
    """
    Print a text report describing the preprocessed train/validation/test splits.

    The report covers, in order: array shapes, the label mapping, the per-class
    label distribution of every split, vocabulary information, and up to five
    decoded training samples.

    Args:
        x_train, x_val, x_test: Token-id arrays exposing ``.shape``; rows of
            ``x_train`` must support boolean-mask indexing (e.g. NumPy arrays).
            Ids ``<= 0`` are excluded when a sample's tokens are displayed.
        y_train, y_val, y_test: One-dimensional arrays of non-negative class
            indices exposing ``.shape``.
        data_processor: Object providing ``label_mapping`` (label -> index),
            ``get_vocabulary_size()`` and ``vectorize_layer.get_vocabulary()``.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n===== Data Analysis =====")

    print("\n--- Data Shapes ---")
    print(f"Training data: {x_train.shape}")
    print(f"Training labels: {y_train.shape}")
    print(f"Validation data: {x_val.shape}")
    print(f"Validation labels: {y_val.shape}")
    print(f"Test data: {x_test.shape}")
    print(f"Test labels: {y_test.shape}")

    print("\n--- Label Mapping ---")
    print("Label mapping:")
    for label, idx in data_processor.label_mapping.items():
        print(f"  {label} -> {idx}")

    print("\n--- Label Distribution ---")
    split_labels = {
        "Train": np.asarray(y_train).astype(int),
        "Val": np.asarray(y_val).astype(int),
        "Test": np.asarray(y_test).astype(int),
    }
    # Size every histogram to the largest class index seen in ANY split so that
    # indexing by class is always in range, even when a split lacks a class.
    num_classes = max(
        (int(labels.max()) + 1 for labels in split_labels.values() if labels.size),
        default=0,
    )
    split_counts = {
        name: np.bincount(labels, minlength=num_classes)
        for name, labels in split_labels.items()
    }

    idx_to_label = {v: k for k, v in data_processor.label_mapping.items()}

    for class_idx in range(num_classes):
        label_name = idx_to_label.get(class_idx, f"Unknown-{class_idx}")
        parts = []
        for name, counts in split_counts.items():
            total = len(split_labels[name])
            # An empty split has no meaningful share; report 0% instead of dividing by zero.
            share = counts[class_idx] / total if total else 0.0
            parts.append(f"{name}={counts[class_idx]} ({share:.2%})")
        print(f"Class {class_idx} ({label_name}): " + ", ".join(parts))

    print("\n--- Vocabulary Information ---")
    vocab_size = data_processor.get_vocabulary_size()
    print(f"Vocabulary size: {vocab_size}")

    vocab = data_processor.vectorize_layer.get_vocabulary()
    print("Top 20 vocabulary items:")
    for i, word in enumerate(vocab[:20]):
        print(f"  {i}: {word}")

    print("\n--- Sample Data ---")
    num_samples = min(5, len(x_train))

    for i in range(num_samples):
        print(f"\nSample {i+1}:")
        label_idx = int(y_train[i])
        label_name = idx_to_label.get(label_idx, f"Unknown-{label_idx}")
        print(f"Label: {label_idx} ({label_name})")
        tokens = x_train[i]
        # Keep only positive ids; zeros are dropped from the displayed sequence.
        non_zero_tokens = tokens[tokens > 0]
        print(f"Sequence length: {len(non_zero_tokens)}")
        print("Tokens:", non_zero_tokens[:10], "..." if len(non_zero_tokens) > 10 else "")
        if len(vocab) > 1:
            print("First few tokens decoded:")
            for token_idx in non_zero_tokens[:10]:
                # Skip ids that fall outside the vocabulary list.
                if token_idx < len(vocab):
                    print(f"  {token_idx}: {vocab[token_idx]}")
    

    


In [12]:
# Directory handed to the data processor (relative to the notebook's working directory).
data_dir = '../indonesian'
data_processor = NusaXSentimentDataProcessor(data_dir)

print("Preparing data...")
# prepare_data() yields three (features, labels) pairs: train, validation, test.
train_split, val_split, test_split = data_processor.prepare_data()
x_train, y_train = train_split
x_val, y_val = val_split
x_test, y_test = test_split

# Print the full text report for the freshly prepared splits.
analyze_data(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
    data_processor,
)


Preparing data...
Unique labels found: {'negative', 'positive', 'neutral'}
Train data: 500 samples
Validation data: 100 samples
Test data: 400 samples

===== Data Analysis =====

--- Data Shapes ---
Training data: (500, 100)
Training labels: (500,)
Validation data: (100, 100)
Validation labels: (100,)
Test data: (400, 100)
Test labels: (400,)

--- Label Mapping ---
Label mapping:
  negative -> 0
  neutral -> 1
  positive -> 2

--- Label Distribution ---
Class 0 (negative): Train=192 (38.40%), Val=38 (38.00% if i < len(val_label_dist) else 0.0), Test=153 (38.25% if i < len(test_label_dist) else 0.0)
Class 1 (neutral): Train=119 (23.80%), Val=24 (24.00% if i < len(val_label_dist) else 0.0), Test=96 (24.00% if i < len(test_label_dist) else 0.0)
Class 2 (positive): Train=189 (37.80%), Val=38 (38.00% if i < len(val_label_dist) else 0.0), Test=151 (37.75% if i < len(test_label_dist) else 0.0)

--- Vocabulary Information ---
Vocabulary size: 2836
Top 20 vocabulary items:
  0: 
  1: [UNK]
  2: