# Simplified Deep Learning Features
Creates sequence data for LSTM, TFT and other DL models with simplified CPU processing


In [1]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Starting simplified DL feature engineering...")


Starting simplified DL feature engineering...


In [2]:
def detect_environment():
    try:
        import google.colab
        from google.colab import drive
        drive.mount('/content/drive/')
        return 'colab', '/content/drive/MyDrive/fcst'
    except ImportError:
        return 'local', '..'

def install_cudf_colab():
    """Install cuDF in Google Colab if not available"""
    try:
        import cudf
        print("✓ cuDF already available")
        return True
    except ImportError:
        print("⚠ cuDF not found. Installing in Colab...")
        try:
            import google.colab
            import subprocess
            import sys

            # Install cuDF for CUDA 11
            subprocess.check_call([
                sys.executable, "-m", "pip", "install",
                "cudf-cu11", "--extra-index-url=https://pypi.nvidia.com"
            ])
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "cupy-cuda11x"
            ])

            print("✓ cuDF installed successfully")
            print("⚠ Please restart runtime and run again")
            return False
        except:
            print("⚠ Failed to install cuDF")
            return False

def setup_gpu():
    """Setup GPU for deep learning feature engineering with cuDF support"""
    gpu_info = {'available': False, 'type': None, 'device': None, 'cudf_available': False}

    # Check cuDF first (for GPU dataframes)
    try:
        import cudf
        import cupy as cp
        gpu_info['cudf_available'] = True
        gpu_info['available'] = True
        gpu_info['type'] = 'cudf'
        print(f"✓ cuDF available - GPU dataframe operations enabled")
        print(f"✓ GPU memory: {cp.cuda.runtime.memGetInfo()[1] / 1e9:.1f} GB")
    except ImportError:
        print("⚠ cuDF not available, trying PyTorch...")

        # Try to install in Colab
        environment, _ = detect_environment()
        if environment == 'colab':
            if install_cudf_colab():
                return setup_gpu()  # Retry after installation

    # Fallback to PyTorch
    if not gpu_info['available']:
        try:
            import torch
            if torch.cuda.is_available():
                gpu_info['available'] = True
                gpu_info['type'] = 'cuda'
                gpu_info['device'] = torch.device('cuda')
                print(f"✓ CUDA GPU: {torch.cuda.get_device_name(0)}")
            elif torch.backends.mps.is_available():
                gpu_info['available'] = True
                gpu_info['type'] = 'mps'
                gpu_info['device'] = torch.device('mps')
                print("✓ Apple MPS GPU available")
            else:
                print("⚠ GPU available but not CUDA/MPS compatible")
                gpu_info['device'] = torch.device('cpu')
        except ImportError:
            print("⚠ PyTorch not available, using CPU only")
            gpu_info['device'] = None

    return gpu_info

environment, base_path = detect_environment()
gpu_info = setup_gpu()

print(f"Environment: {environment}")
print(f"GPU support: {gpu_info['type'] if gpu_info['available'] else 'CPU only'}")
print(f"cuDF available: {gpu_info['cudf_available']}")


Mounted at /content/drive/
✓ cuDF available - GPU dataframe operations enabled
✓ GPU memory: 42.5 GB
Environment: colab
GPU support: cudf
cuDF available: True


In [3]:
print("\nLoading preprocessed data...")

# Load the unified data from feature_engineering_final.py
baseline_file = f'{base_path}/data/features/baseline_statistical_full.parquet'
if os.path.exists(baseline_file):
    print("Loading unified dataset...")
    df = pd.read_parquet(baseline_file)
else:
    # Fallback to separate files
    print("Loading from separate train/test files...")
    train_df = pd.read_parquet(f'{base_path}/data/preprocessed/train_with_target.parquet')
    test_df = pd.read_parquet(f'{base_path}/data/preprocessed/test_with_target.parquet')

    # Combine and mark splits
    df = pd.concat([train_df, test_df], ignore_index=True)
    train_indices = set(zip(train_df['client_id'], train_df['category'], pd.to_datetime(train_df['date'])))
    df['split'] = df.apply(
        lambda row: 'train' if (row['client_id'], row['category'], pd.to_datetime(row['date'])) in train_indices
        else 'test', axis=1
    )

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['client_id', 'category', 'date'])

print(f"Dataset: {len(df):,} records")
print(f"Train: {(df['split'] == 'train').sum():,}, Test: {(df['split'] == 'test').sum():,}")



Loading preprocessed data...
Loading unified dataset...
Dataset: 2,298,956 records
Train: 1,836,701, Test: 462,255


In [4]:
print("\n=== CREATING SIMPLIFIED DL SEQUENCES ===")

# Use 20% sample for speed
print("Using 20% sample for faster processing...")
df_sample = df.sample(frac=0.2, random_state=42).sort_values(['client_id', 'category', 'date'])
print(f"Sample size: {len(df_sample):,} records (from {len(df):,} total)")

def create_simplified_sequences(df, lookback=52, min_series_length=60):
    """Create simplified sequences without redundant features"""
    print(f"Creating simplified sequences (lookback={lookback}, min_length={min_series_length})...")

    sequences = []

    for (client_id, category), group in df.groupby(['client_id', 'category']):
        group = group.sort_values('date')

        if len(group) < min_series_length:
            continue

        for i in range(lookback, len(group)):
            sequence_data = group.iloc[i-lookback:i].copy()
            current_row = group.iloc[i].copy()

            seq_record = {
                'client_id': client_id,
                'category': category,
                'date': current_row['date'],
                'split': current_row['split'],

                # SINGLE TARGET
                'target': current_row['target'],

                # PREDICTION TIME CONTEXT
                'pred_month': current_row['date'].month,
                'pred_quarter': current_row['date'].quarter,

                # STATIC USER FEATURES
                'yearly_income': current_row.get('yearly_income', 0),
                'total_debt': current_row.get('total_debt', 0),
                'credit_score': current_row.get('credit_score', 0),
                'current_age': current_row.get('current_age', 0),
            }

            # HISTORICAL SEQUENCE (amount + month only, no redundant features)
            for j in range(lookback):
                week_data = sequence_data.iloc[j]
                seq_record[f'hist_amount_{j}'] = week_data['amount']
                seq_record[f'hist_month_{j}'] = week_data['date'].month

            # BASIC SEQUENCE STATISTICS (essential only)
            amounts = sequence_data['amount']
            seq_record['hist_mean'] = amounts.mean()
            seq_record['hist_std'] = amounts.std()
            seq_record['hist_trend'] = (amounts.iloc[-1] - amounts.iloc[0]) / lookback

            sequences.append(seq_record)

    print(f"✓ Created {len(sequences):,} sequences")
    return pd.DataFrame(sequences)

# Create simplified sequences
dl_sequences = create_simplified_sequences(df_sample, lookback=52, min_series_length=60)

print(f"DL sequences created: {len(dl_sequences):,} records")
print(f"Train sequences: {(dl_sequences['split'] == 'train').sum():,}")
print(f"Test sequences: {(dl_sequences['split'] == 'test').sum():,}")



=== CREATING SIMPLIFIED DL SEQUENCES ===
Using 20% sample for faster processing...
Sample size: 459,791 records (from 2,298,956 total)
Creating simplified sequences (lookback=52, min_length=60)...
✓ Created 159,659 sequences
DL sequences created: 159,659 records
Train sequences: 81,815
Test sequences: 77,844


In [5]:
def gpu_normalize_batch(data_matrix, means, stds, gpu_info):
    """Batch normalize multiple columns on GPU if available"""
    if gpu_info['cudf_available']:
        return _normalize_cudf(data_matrix, means, stds)
    elif gpu_info['available'] and len(data_matrix) > 10000:
        return _normalize_torch(data_matrix, means, stds, gpu_info)
    else:
        return (data_matrix - means) / (stds + 1e-8)

def _normalize_cudf(data_matrix, means, stds):
    """Normalization using cuDF/cupy"""
    try:
        import cudf
        import cupy as cp

        # Convert to cupy arrays
        cp_data = cp.asarray(data_matrix)
        cp_means = cp.asarray(means)
        cp_stds = cp.asarray(stds)

        # Normalize on GPU
        normalized = (cp_data - cp_means) / (cp_stds + 1e-8)

        # Return to CPU
        result = cp.asnumpy(normalized)
        print(f"  ✓ cuDF GPU normalization: {data_matrix.shape} -> {result.shape}")
        return result

    except Exception as e:
        print(f"  ⚠ cuDF normalization failed: {e}, using CPU")
        return (data_matrix - means) / (stds + 1e-8)

def _normalize_torch(data_matrix, means, stds, gpu_info):
    """Normalization using PyTorch"""
    try:
        import torch

        tensor_data = torch.tensor(data_matrix, device=gpu_info['device'], dtype=torch.float32)
        tensor_means = torch.tensor(means, device=gpu_info['device'], dtype=torch.float32)
        tensor_stds = torch.tensor(stds, device=gpu_info['device'], dtype=torch.float32)

        normalized = (tensor_data - tensor_means) / (tensor_stds + 1e-8)

        result = normalized.cpu().numpy()
        print(f"  ✓ PyTorch GPU normalization: {data_matrix.shape} -> {result.shape}")
        return result

    except Exception as e:
        print(f"  ⚠ PyTorch normalization failed: {e}, using CPU")
        return (data_matrix - means) / (stds + 1e-8)


In [6]:
print("\n=== GPU-ACCELERATED NORMALIZATION ===")

if len(dl_sequences) > 0:
    # Identify columns for normalization
    numeric_cols = []

    # Historical sequence features (updated naming)
    for i in range(52):
        numeric_cols.extend([f'hist_amount_{i}', f'hist_month_{i}'])

    # Static user features
    static_features = ['yearly_income', 'total_debt', 'credit_score', 'current_age']
    numeric_cols.extend(static_features)

    # Historical sequence statistics
    hist_stats = ['hist_mean', 'hist_std', 'hist_trend']
    numeric_cols.extend(hist_stats)

    # Filter existing columns
    numeric_cols = [col for col in numeric_cols if col in dl_sequences.columns]

    print(f"Normalizing {len(numeric_cols)} numeric columns...")
    print(f"Features: {len([c for c in numeric_cols if 'hist_amount' in c])} hist_amount, "
          f"{len([c for c in numeric_cols if 'hist_month' in c])} hist_month, "
          f"{len([c for c in numeric_cols if c in static_features])} static, "
          f"{len([c for c in numeric_cols if c in hist_stats])} statistics")

    # Compute normalization stats from training data only
    train_mask = dl_sequences['split'] == 'train'
    train_data = dl_sequences.loc[train_mask, numeric_cols]

    train_means = train_data.mean().values
    train_stds = train_data.std().values

    # Handle zero std (constant columns)
    train_stds = np.where(train_stds == 0, 1.0, train_stds)

    # Apply GPU normalization to full dataset
    all_data = dl_sequences[numeric_cols].values

    print(f"Data shape: {all_data.shape}")
    print(f"GPU type: {gpu_info['type'] if gpu_info['available'] else 'CPU'}")

    normalized_data = gpu_normalize_batch(all_data, train_means, train_stds, gpu_info)

    # Add normalized columns
    for i, col in enumerate(numeric_cols):
        dl_sequences[f'{col}_norm'] = normalized_data[:, i]

    print(f"✓ Normalized {len(numeric_cols)} features using training statistics")



=== GPU-ACCELERATED NORMALIZATION ===
Normalizing 111 numeric columns...
Features: 52 hist_amount, 52 hist_month, 4 static, 3 statistics
Data shape: (159659, 111)
GPU type: cudf
  ✓ cuDF GPU normalization: (159659, 111) -> (159659, 111)
✓ Normalized 111 features using training statistics


In [7]:
print("\n=== PREPARING TRAINING DATA ===")

def prepare_training_data(df):
    """Separate features (X) from targets (y) for proper model training"""

    # FEATURES (X) - everything except target
    feature_cols = []

    # Historical sequence features
    for i in range(52):
        feature_cols.extend([f'hist_amount_{i}_norm', f'hist_month_{i}'])

    # Static features
    feature_cols.extend(['yearly_income_norm', 'total_debt_norm', 'credit_score_norm', 'current_age_norm'])

    # Sequence statistics
    feature_cols.extend(['hist_mean_norm', 'hist_std_norm', 'hist_trend_norm'])

    # Prediction context
    feature_cols.extend(['pred_month', 'pred_quarter'])

    # Filter existing columns
    feature_cols = [col for col in feature_cols if col in df.columns]

    # TARGETS (y)
    targets = df['target'].values

    # FEATURES (X)
    features = df[feature_cols].values

    # METADATA
    metadata = df[['client_id', 'category', 'date', 'split']].copy()

    print(f"✓ Features (X): {features.shape} - {len(feature_cols)} columns")
    print(f"✓ Targets (y): {targets.shape}")
    print(f"✓ Train samples: {(df['split'] == 'train').sum():,}")
    print(f"✓ Test samples: {(df['split'] == 'test').sum():,}")

    return features, targets, metadata, feature_cols

# Prepare training data
X, y, metadata, feature_names = prepare_training_data(dl_sequences)

# Split train/test
train_mask = metadata['split'] == 'train'
test_mask = metadata['split'] == 'test'

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"\nTrain set: X_train{X_train.shape}, y_train{y_train.shape}")
print(f"Test set: X_test{X_test.shape}, y_test{y_test.shape}")



=== PREPARING TRAINING DATA ===
✓ Features (X): (159659, 113) - 113 columns
✓ Targets (y): (159659,)
✓ Train samples: 81,815
✓ Test samples: 77,844

Train set: X_train(81815, 113), y_train(81815,)
Test set: X_test(77844, 113), y_test(77844,)


In [8]:
print("\n=== CREATING SPECIALIZED DL FORMATS ===")

# 1. LSTM Format: (samples, timesteps, features)
def create_lstm_format(df):
    """Create LSTM-ready sequences with simplified features"""
    lstm_data = []

    for idx, row in df.iterrows():
        # Create sequence matrix (52 timesteps x 2 features)
        sequence = []
        for i in range(52):
            timestep = [
                row[f'hist_amount_{i}_norm'],
                row[f'hist_month_{i}'] / 12.0,  # Normalize month
            ]
            sequence.append(timestep)

        lstm_data.append({
            'client_id': row['client_id'],
            'category': row['category'],
            'date': row['date'],
            'split': row['split'],
            'sequence': sequence,  # 52 x 2 array
            'target': row['target'],
            # Static features
            'yearly_income_norm': row.get('yearly_income_norm', 0),
            'total_debt_norm': row.get('total_debt_norm', 0),
            'credit_score_norm': row.get('credit_score_norm', 0),
            'current_age_norm': row.get('current_age_norm', 0),
            # Prediction context
            'pred_month': row['pred_month'],
            'pred_quarter': row['pred_quarter'],
            # Sequence stats
            'hist_mean_norm': row.get('hist_mean_norm', 0),
            'hist_std_norm': row.get('hist_std_norm', 0),
            'hist_trend_norm': row.get('hist_trend_norm', 0),
        })

    return pd.DataFrame(lstm_data)

# 2. TFT Format: flat features with time index
def create_tft_format(df):
    """Create TFT-ready format with simplified features"""
    tft_data = []

    for idx, row in df.iterrows():
        # Create one record per timestep in the sequence
        for i in range(52):
            tft_record = {
                'series_id': f"{row['client_id']}_{row['category']}",
                'time_idx': i,
                'split': row['split'],

                # Time-varying features (historical)
                'amount': row[f'hist_amount_{i}_norm'],
                'month': row[f'hist_month_{i}'],

                # Static features (same for all timesteps)
                'yearly_income': row.get('yearly_income_norm', 0),
                'total_debt': row.get('total_debt_norm', 0),
                'credit_score': row.get('credit_score_norm', 0),
                'current_age': row.get('current_age_norm', 0),
            }
            tft_data.append(tft_record)

        # Add target record (what we're predicting)
        target_record = {
            'series_id': f"{row['client_id']}_{row['category']}",
            'time_idx': 52,  # Next timestep
            'split': row['split'],
            'target': row['target'],

            # Use prediction context
            'month': row['pred_month'],

            # Static features
            'yearly_income': row.get('yearly_income_norm', 0),
            'total_debt': row.get('total_debt_norm', 0),
            'credit_score': row.get('credit_score_norm', 0),
            'current_age': row.get('current_age_norm', 0),
        }
        tft_data.append(target_record)

    return pd.DataFrame(tft_data)

# Create specialized formats
lstm_format = create_lstm_format(dl_sequences)
tft_format = create_tft_format(dl_sequences)

print(f"LSTM format: {len(lstm_format):,} sequences")
print(f"TFT format: {len(tft_format):,} records")



=== CREATING SPECIALIZED DL FORMATS ===
LSTM format: 159,659 sequences
TFT format: 8,461,927 records


In [9]:
def extract_lstm_arrays(lstm_df, split='train'):
    """Extract LSTM sequences as numpy arrays for training"""
    subset = lstm_df[lstm_df['split'] == split]

    # Sequences: (samples, timesteps, features)
    sequences = np.array([seq for seq in subset['sequence'].values])

    # Targets
    targets = subset['target'].values

    # Static features (can be concatenated or used separately)
    static_features = subset[['yearly_income_norm', 'total_debt_norm', 'credit_score_norm',
                             'current_age_norm', 'pred_month', 'pred_quarter',
                             'hist_mean_norm', 'hist_std_norm', 'hist_trend_norm']].values

    print(f"{split.title()} - Sequences: {sequences.shape}, Targets: {targets.shape}, Static: {static_features.shape}")
    return sequences, targets, static_features

# Example usage for LSTM
print("\nLSTM arrays:")
lstm_train_seq, lstm_train_y, lstm_train_static = extract_lstm_arrays(lstm_format, 'train')
lstm_test_seq, lstm_test_y, lstm_test_static = extract_lstm_arrays(lstm_format, 'test')



LSTM arrays:
Train - Sequences: (81815, 52, 2), Targets: (81815,), Static: (81815, 9)
Test - Sequences: (77844, 52, 2), Targets: (77844,), Static: (77844, 9)


In [10]:
print("\n=== SAVING DL FEATURES ===")

output_dir = f'{base_path}/data/features'
os.makedirs(output_dir, exist_ok=True)

# 1. Full sequences with all features
dl_sequences.to_csv(f'{output_dir}/dl_sequences_full.csv', index=False)
dl_sequences.to_parquet(f'{output_dir}/dl_sequences_full.parquet', index=False)

# 2. Train/test splits for convenience
dl_train = dl_sequences[dl_sequences['split'] == 'train'].copy()
dl_test = dl_sequences[dl_sequences['split'] == 'test'].copy()

dl_train.to_csv(f'{output_dir}/dl_sequences_train.csv', index=False)
dl_test.to_csv(f'{output_dir}/dl_sequences_test.csv', index=False)
dl_train.to_parquet(f'{output_dir}/dl_sequences_train.parquet', index=False)
dl_test.to_parquet(f'{output_dir}/dl_sequences_test.parquet', index=False)

# 3. LSTM format
lstm_format.to_parquet(f'{output_dir}/dl_lstm_format.parquet', index=False)

# 4. TFT format
tft_format.to_parquet(f'{output_dir}/dl_tft_format.parquet', index=False)

# 5. Save normalization stats for inference
normalization_stats = {}
if len(numeric_cols) > 0:
    for i, col in enumerate(numeric_cols):
        normalization_stats[f'{col}_mean'] = float(train_means[i])
        normalization_stats[f'{col}_std'] = float(train_stds[i])

normalization_stats['dl_info'] = {
    'lookback_length': 52,
    'sequence_features': 2,  # hist_amount, hist_month
    'static_features': 4,    # yearly_income, total_debt, credit_score, current_age
    'prediction_features': 2, # pred_month, pred_quarter
    'sequence_statistics': 3, # hist_mean, hist_std, hist_trend
    'total_sequences': len(dl_sequences),
    'train_sequences': len(dl_train),
    'test_sequences': len(dl_test),
    'data_leakage_fixed': True,
    'targets_separated': True,
}

with open(f'{output_dir}/dl_normalization_stats.json', 'w') as f:
    json.dump(normalization_stats, f, indent=2)



=== SAVING DL FEATURES ===


In [11]:
print("\n=== SIMPLIFIED DL FEATURE ENGINEERING SUMMARY ===")
print(f"Environment: {environment}")
print(f"Processing: CPU-based with 20% sample")
print(f"Lookback window: 52 weeks")
print(f"Sequence features: 2 per timestep (amount, month)")

print(f"\nDataset statistics:")
print(f"  Sample size: {len(df_sample):,} records (20% of total)")
print(f"  Total sequences: {len(dl_sequences):,}")
print(f"  Train sequences: {len(dl_train):,}")
print(f"  Test sequences: {len(dl_test):,}")
print(f"  Features per sequence: {len([c for c in dl_sequences.columns if c.endswith('_norm')])} normalized")

print(f"\nFiles created:")
print(f"1. Full sequences:")
print(f"   - dl_sequences_full.parquet (all features)")
print(f"2. Train/test splits:")
print(f"   - dl_sequences_train.parquet")
print(f"   - dl_sequences_test.parquet")
print(f"3. Model-specific formats:")
print(f"   - dl_lstm_format.parquet (for LSTM models)")
print(f"   - dl_tft_format.parquet (for TFT models)")
print(f"4. Reference:")
print(f"   - dl_normalization_stats.json")

print(f"\n🎯 UNIFIED TEST DATES:")
print(f"✓ All DL models predict the same {len(dl_test):,} test sequences")
print(f"✓ Same test period as baseline/ML models")

print(f"\n🔒 DATA LEAKAGE PREVENTION:")
print(f"✓ Single target separated from input features")
print(f"✓ No 'current period' features in inputs")
print(f"✓ Only historical data (hist_) used for prediction")
print(f"✓ Static features constant over time")
print(f"✓ Prediction time features (pred_) for context only")

print(f"\n📊 SIMPLIFIED FEATURE STRUCTURE:")
print(f"✓ Historical sequence: {52 * 2:,} features (52 weeks × 2 per week)")
print(f"✓ Static user features: 4 features")
print(f"✓ Sequence statistics: 3 features")
print(f"✓ Prediction time context: 2 features")
print(f"✓ Total input features: {52 * 2 + 4 + 3 + 2:,}")

print(f"\n⚡ PERFORMANCE:")
print(f"✓ 20% sample for faster processing")
print(f"✓ Removed redundant features")
print(f"✓ Parquet format for fast I/O")
print(f"✓ Proper temporal constraints (no data leakage)")

print(f"\n🚀 Ready for model training!")
print(f"   - LSTM: Use extract_lstm_arrays() function")
print(f"   - TFT: Use dl_tft_format.parquet")
print(f"   - Custom: Use X_train, y_train arrays")

print(f"\n💡 TRAINING EXAMPLES:")
print(f"   # LSTM: lstm_train_seq{lstm_train_seq.shape}, lstm_train_y{lstm_train_y.shape}")
print(f"   # Dense: X_train{X_train.shape}, y_train{y_train.shape}")
print(f"   # Target properly separated for supervised learning")


=== SIMPLIFIED DL FEATURE ENGINEERING SUMMARY ===
Environment: colab
Processing: CPU-based with 20% sample
Lookback window: 52 weeks
Sequence features: 2 per timestep (amount, month)

Dataset statistics:
  Sample size: 459,791 records (20% of total)
  Total sequences: 159,659
  Train sequences: 81,815
  Test sequences: 77,844
  Features per sequence: 111 normalized

Files created:
1. Full sequences:
   - dl_sequences_full.parquet (all features)
2. Train/test splits:
   - dl_sequences_train.parquet
   - dl_sequences_test.parquet
3. Model-specific formats:
   - dl_lstm_format.parquet (for LSTM models)
   - dl_tft_format.parquet (for TFT models)
4. Reference:
   - dl_normalization_stats.json

🎯 UNIFIED TEST DATES:
✓ All DL models predict the same 77,844 test sequences
✓ Same test period as baseline/ML models

🔒 DATA LEAKAGE PREVENTION:
✓ Single target separated from input features
✓ No 'current period' features in inputs
✓ Only historical data (hist_) used for prediction
✓ Static feature

In [15]:
dl_sequences_train

NameError: name 'dl_sequences_train' is not defined