# **Loading and Preprocessing Data with TensorFlow**

In [3]:
# TensorFlow Data Loading dan Preprocessing Demo - FIXED VERSION
# Demonstrasi lengkap untuk Data API, TFRecord Format, dan Feature Preprocessing

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.datasets import fetch_california_housing
import os
import json

print("TensorFlow Version:", tf.__version__)

# ============================
# 1. DATA PREPARATION
# ============================

def prepare_sample_data():
    """Menyiapkan data sampel dengan categorical dan numerical features"""
    print("\n=== PREPARING SAMPLE DATA ===")

    # Load California housing dataset
    housing = fetch_california_housing()

    # Create synthetic categorical features untuk demo
    np.random.seed(42)
    n_samples = len(housing.data)

    # Categorical features
    house_type = np.random.choice(['apartment', 'house', 'condo', 'townhouse'], n_samples)
    location_type = np.random.choice(['urban', 'suburban', 'rural'], n_samples)

    # Create DataFrame
    feature_names = list(housing.feature_names) + ['house_type', 'location_type']
    data = np.column_stack([housing.data, house_type, location_type])

    df = pd.DataFrame(data, columns=feature_names)
    df['target'] = housing.target

    # Convert numerical columns back to float
    for col in housing.feature_names:
        df[col] = df[col].astype(float)

    print(f"Dataset shape: {df.shape}")
    print(f"Features: {feature_names}")
    print("\nFirst 5 rows:")
    print(df.head())

    return df, list(housing.feature_names)

# ============================
# 2. CSV DATA PIPELINE
# ============================

def create_csv_pipeline(df, numerical_features):
    """Membuat pipeline untuk membaca data dari CSV files"""
    print("\n=== CREATING CSV PIPELINE ===")

    # Split data
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

    # Save to CSV files
    os.makedirs('data', exist_ok=True)
    train_df.to_csv('data/train.csv', index=False)
    val_df.to_csv('data/val.csv', index=False)
    test_df.to_csv('data/test.csv', index=False)

    print(f"Train size: {len(train_df)}")
    print(f"Validation size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")

    return train_df, val_df, test_df

def parse_csv_line(line, feature_names, categorical_features):
    """Parse CSV line dengan handling categorical dan numerical features"""
    # Define defaults for each column
    defaults = []
    for col in feature_names + ['target']:
        if col in categorical_features:
            defaults.append('')  # String default
        else:
            defaults.append(0.0)  # Float default

    # Parse CSV
    fields = tf.io.decode_csv(line, record_defaults=defaults)

    # Separate features and target
    features = {}
    for i, col in enumerate(feature_names):
        features[col] = fields[i]

    target = fields[-1]
    return features, target

def create_csv_dataset(filepath, feature_names, categorical_features, batch_size=32):
    """Membuat dataset dari CSV file"""
    dataset = tf.data.TextLineDataset(filepath)
    dataset = dataset.skip(1)  # Skip header
    dataset = dataset.map(
        lambda line: parse_csv_line(line, feature_names, categorical_features),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

# ============================
# 3. TFRECORD FORMAT
# ============================

def create_tfrecord_example(row, feature_names, categorical_features):
    """Membuat TFRecord example dari satu baris data"""
    features = {}

    for col in feature_names:
        if col in categorical_features:
            # Categorical feature as bytes
            features[col] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[str(row[col]).encode('utf-8')])
            )
        else:
            # Numerical feature as float
            features[col] = tf.train.Feature(
                float_list=tf.train.FloatList(value=[float(row[col])])
            )

    # Target
    features['target'] = tf.train.Feature(
        float_list=tf.train.FloatList(value=[float(row['target'])])
    )

    example = tf.train.Example(
        features=tf.train.Features(feature=features)
    )
    return example

def write_tfrecords(df, filename, feature_names, categorical_features):
    """Menulis DataFrame ke TFRecord file"""
    print(f"\nWriting {len(df)} examples to {filename}")

    with tf.io.TFRecordWriter(filename) as writer:
        for _, row in df.iterrows():
            example = create_tfrecord_example(row, feature_names, categorical_features)
            writer.write(example.SerializeToString())

def parse_tfrecord_example(example_proto, feature_names, categorical_features):
    """Parse TFRecord example"""
    feature_description = {}

    for col in feature_names:
        if col in categorical_features:
            feature_description[col] = tf.io.FixedLenFeature([], tf.string)
        else:
            feature_description[col] = tf.io.FixedLenFeature([], tf.float32)

    feature_description['target'] = tf.io.FixedLenFeature([], tf.float32)

    parsed_features = tf.io.parse_single_example(example_proto, feature_description)

    # Separate features and target
    features = {k: v for k, v in parsed_features.items() if k != 'target'}
    target = parsed_features['target']

    return features, target

def create_tfrecord_dataset(filepath, feature_names, categorical_features, batch_size=32):
    """Membuat dataset dari TFRecord file"""
    dataset = tf.data.TFRecordDataset(filepath)
    dataset = dataset.map(
        lambda x: parse_tfrecord_example(x, feature_names, categorical_features),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    dataset = dataset.shuffle(1000)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

# ============================
# 4. PREPROCESSING LAYERS
# ============================

def create_preprocessing_layers(train_df, feature_names, categorical_features, numerical_features):
    """Membuat preprocessing layers untuk berbagai jenis features"""
    print("\n=== CREATING PREPROCESSING LAYERS ===")

    preprocessing_layers = {}

    # Numerical features - Normalization
    for col in numerical_features:
        # Calculate mean and std for normalization
        mean_val = train_df[col].mean()
        std_val = train_df[col].std()
        preprocessing_layers[f'{col}_norm'] = (mean_val, std_val)

    # Categorical features - Vocabularies
    vocabularies = {}
    for col in categorical_features:
        vocab = sorted(train_df[col].unique())
        vocabularies[col] = vocab
        print(f"{col} vocabulary: {vocab}")

    return preprocessing_layers, vocabularies

# ============================
# 5. MODEL BUILDING
# ============================

def create_model_with_preprocessing(train_df, vocabularies, numerical_features, categorical_features, use_embedding=True):
    """Membuat model dengan preprocessing terintegrasi"""
    print(f"\n=== CREATING MODEL (Embedding: {use_embedding}) ===")

    # Input layers
    inputs = {}
    for col in numerical_features:
        inputs[col] = keras.layers.Input(shape=(), name=col, dtype=tf.float32)

    for col in categorical_features:
        inputs[col] = keras.layers.Input(shape=(), name=col, dtype=tf.string)

    # Preprocessing
    preprocessed_features = []

    # Numerical features - normalization
    for col in numerical_features:
        # Create normalization layer
        normalizer = keras.layers.Normalization(axis=None)
        normalizer.adapt(train_df[col].values.reshape(-1, 1))
        normalized = normalizer(inputs[col])

        # FIXED: Ensure consistent shape by expanding dimensions
        normalized = keras.layers.Reshape((1,))(normalized)
        preprocessed_features.append(normalized)

    # Categorical features
    for col in categorical_features:
        vocab = vocabularies[col]

        # FIXED: Use keras.layers.StringLookup instead of keras.utils.StringLookup
        lookup = keras.layers.StringLookup(vocabulary=vocab, output_mode='int')
        categorical_encoded = lookup(inputs[col])

        if use_embedding:
            # Embedding
            embedding_dim = min(50, len(vocab) // 2 + 1)
            embedded = keras.layers.Embedding(
                input_dim=len(vocab) + 1,
                output_dim=embedding_dim,
                name=f'{col}_embedding'
            )(categorical_encoded)
            # FIXED: Flatten the embedding to ensure consistent shape
            embedded = keras.layers.Flatten()(embedded)
            preprocessed_features.append(embedded)
        else:
            # One-hot encoding
            one_hot = keras.layers.CategoryEncoding(
                num_tokens=len(vocab) + 1,
                output_mode='one_hot'
            )(categorical_encoded)
            # FIXED: Ensure consistent shape for one-hot encoded features
            one_hot = keras.layers.Flatten()(one_hot)
            preprocessed_features.append(one_hot)

    # Combine all features
    if len(preprocessed_features) > 1:
        combined = keras.layers.concatenate(preprocessed_features)
    else:
        combined = preprocessed_features[0]

    # Dense layers
    x = keras.layers.Dense(128, activation='relu')(combined)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(64, activation='relu')(x)
    x = keras.layers.Dropout(0.2)(x)
    output = keras.layers.Dense(1, name='output')(x)

    model = keras.Model(inputs=inputs, outputs=output)
    return model

# ============================
# 6. ALTERNATIVE APPROACH WITH FEATURE COLUMNS (LEGACY)
# ============================

def create_feature_columns(vocabularies, numerical_features, categorical_features, use_embedding=True):
    """Membuat feature columns untuk model (legacy approach)"""
    print(f"\n=== CREATING FEATURE COLUMNS (Embedding: {use_embedding}) ===")

    feature_columns = []

    # Numerical features
    for col in numerical_features:
        feature_columns.append(tf.feature_column.numeric_column(col))

    # Categorical features
    for col in categorical_features:
        vocab = vocabularies[col]
        categorical_col = tf.feature_column.categorical_column_with_vocabulary_list(
            col, vocab
        )

        if use_embedding:
            # Use embedding for categorical features
            embedding_dim = min(50, len(vocab) // 2 + 1)
            feature_columns.append(
                tf.feature_column.embedding_column(categorical_col, embedding_dim)
            )
        else:
            # Use one-hot encoding
            feature_columns.append(
                tf.feature_column.indicator_column(categorical_col)
            )

    return feature_columns

def create_model_with_feature_columns(feature_columns):
    """Membuat model menggunakan feature columns (legacy approach)"""
    print("\n=== CREATING MODEL WITH FEATURE COLUMNS ===")

    # Create DenseFeatures layer
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    # Build model
    model = keras.Sequential([
        feature_layer,
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1)
    ])

    return model

# ============================
# 7. MAIN DEMO FUNCTION
# ============================

def run_demo():
    """Menjalankan demo lengkap"""
    print("="*60)
    print("TENSORFLOW DATA LOADING & PREPROCESSING DEMO")
    print("="*60)

    # 1. Prepare data
    df, numerical_features = prepare_sample_data()
    categorical_features = ['house_type', 'location_type']
    feature_names = numerical_features + categorical_features

    # 2. Create CSV pipeline
    train_df, val_df, test_df = create_csv_pipeline(df, numerical_features)

    # 3. Create TFRecord files
    print("\n=== CREATING TFRECORD FILES ===")
    os.makedirs('data/tfrecords', exist_ok=True)
    write_tfrecords(train_df, 'data/tfrecords/train.tfrecord', feature_names, categorical_features)
    write_tfrecords(val_df, 'data/tfrecords/val.tfrecord', feature_names, categorical_features)
    write_tfrecords(test_df, 'data/tfrecords/test.tfrecord', feature_names, categorical_features)

    # 4. Create datasets
    print("\n=== CREATING DATASETS ===")

    # CSV datasets
    train_csv_ds = create_csv_dataset('data/train.csv', feature_names, categorical_features)
    val_csv_ds = create_csv_dataset('data/val.csv', feature_names, categorical_features)

    # TFRecord datasets
    train_tfr_ds = create_tfrecord_dataset('data/tfrecords/train.tfrecord', feature_names, categorical_features)
    val_tfr_ds = create_tfrecord_dataset('data/tfrecords/val.tfrecord', feature_names, categorical_features)

    # 5. Create preprocessing
    preprocessing_layers, vocabularies = create_preprocessing_layers(
        train_df, feature_names, categorical_features, numerical_features
    )

    # 6. Test both approaches
    for use_embedding in [True, False]:
        approach = "Embedding" if use_embedding else "One-Hot"
        print(f"\n{'='*20} {approach.upper()} APPROACH {'='*20}")

        # Create and compile model
        model = create_model_with_preprocessing(
            train_df, vocabularies, numerical_features, categorical_features, use_embedding
        )

        model.compile(
            optimizer='adam',
            loss='mse',
            metrics=['mae']
        )

        print(f"\nModel Summary ({approach}):")
        model.summary()

        # Train for few epochs (demo purpose)
        print(f"\nTraining model with {approach}...")
        try:
            history = model.fit(
                train_csv_ds,
                validation_data=val_csv_ds,
                epochs=2,
                verbose=1
            )

            # Test with TFRecord data
            print(f"\nEvaluating on TFRecord data...")
            test_loss, test_mae = model.evaluate(val_tfr_ds, verbose=0)
            print(f"Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")
        except Exception as e:
            print(f"Training failed: {e}")

    # 7. Demonstrate data inspection
    print("\n=== DATA INSPECTION ===")
    print("\nCSV Dataset sample:")
    for batch in train_csv_ds.take(1):
        features, targets = batch
        print(f"Batch size: {len(targets)}")
        print(f"Feature keys: {list(features.keys())}")
        for key, value in features.items():
            print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
            if key in categorical_features:
                print(f"    Sample values: {value.numpy()[:3]}")
        break

    print("\nTFRecord Dataset sample:")
    for batch in train_tfr_ds.take(1):
        features, targets = batch
        print(f"Batch size: {len(targets)}")
        print(f"Feature keys: {list(features.keys())}")
        for key, value in features.items():
            print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
        break

    # 8. Demonstrate Feature Columns approach (legacy)
    print("\n=== FEATURE COLUMNS APPROACH (LEGACY) ===")
    try:
        feature_columns = create_feature_columns(
            vocabularies, numerical_features, categorical_features, use_embedding=True
        )
        fc_model = create_model_with_feature_columns(feature_columns)
        fc_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
        print("Feature Columns model created successfully!")
        fc_model.summary()
    except Exception as e:
        print(f"Feature Columns approach failed: {e}")

    print("\n" + "="*60)
    print("DEMO COMPLETED!")
    print("="*60)

    return {
        'train_csv_ds': train_csv_ds,
        'train_tfr_ds': train_tfr_ds,
        'vocabularies': vocabularies,
        'feature_names': feature_names,
        'categorical_features': categorical_features,
        'numerical_features': numerical_features
    }

# Run the demo
if __name__ == "__main__":
    results = run_demo()

TensorFlow Version: 2.18.0
TENSORFLOW DATA LOADING & PREPROCESSING DEMO

=== PREPARING SAMPLE DATA ===
Dataset shape: (20640, 11)
Features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'house_type', 'location_type']

First 5 rows:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude house_type location_type  target  
0    -122.23      condo      suburban   4.526  
1    -122.22  townhouse         urban   3.585  
2    -122.24  apartment         urban   3.521  
3    -122.25      condo      suburban   3.413  
4    -122.25      


Training model with Embedding...
Epoch 1/2
Training failed: Graph execution error:

Detected at node functional_1/concatenate_1_1/concat defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_


Training model with One-Hot...
Epoch 1/2
Training failed: Graph execution error:

Detected at node functional_1_1/concatenate_2_1/concat defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.
Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


Batch size: 32
Feature keys: ['AveBedrms', 'AveOccup', 'AveRooms', 'HouseAge', 'Latitude', 'Longitude', 'MedInc', 'Population', 'house_type', 'location_type']
  AveBedrms: shape=(32,), dtype=<dtype: 'float32'>
  AveOccup: shape=(32,), dtype=<dtype: 'float32'>
  AveRooms: shape=(32,), dtype=<dtype: 'float32'>
  HouseAge: shape=(32,), dtype=<dtype: 'float32'>
  Latitude: shape=(32,), dtype=<dtype: 'float32'>
  Longitude: shape=(32,), dtype=<dtype: 'float32'>
  MedInc: shape=(32,), dtype=<dtype: 'float32'>
  Population: shape=(32,), dtype=<dtype: 'float32'>
  house_type: shape=(32,), dtype=<dtype: 'string'>
  location_type: shape=(32,), dtype=<dtype: 'string'>

=== FEATURE COLUMNS APPROACH (LEGACY) ===

=== CREATING FEATURE COLUMNS (Embedding: True) ===

=== CREATING MODEL WITH FEATURE COLUMNS ===
Feature Columns approach failed: module 'keras._tf_keras.keras.layers' has no attribute 'DenseFeatures'

DEMO COMPLETED!
