# MPP Data Upload Notebook

This notebook processes and uploads Maximum Power Point (MPP) tracking data from text files to the TimescaleDB database.

## Purpose

- Scan directories for MPP data files (output_board{X}_channel{Y}.txt files)
- Parse the data into structured format
- Upload the data to the TimescaleDB database
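- Avoid duplicate data entries: a file is skipped when the database already holds measurements for the same board and channel within the file's time range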

## Prerequisites

- Running TimescaleDB instance (configured in docker-compose.yml)
- Access to directory containing MPP data files
- Environment variables configured in .env file (for database connection)

## 1. Setup and Imports

Import required libraries and install any missing dependencies.

In [1]:
# Install required packages if not already installed
!pip install psycopg2-binary sqlalchemy pandas tqdm pathlib python-dotenv ipynb-path
import psycopg2

In [2]:
import ipynb_path
import os
import traceback

# Switch the working directory to the folder containing this notebook.
# Best effort: a failure is reported together with its traceback and the
# notebook carries on with whatever working directory is currently active.
try:
    notebook_file_path = ipynb_path.get()
    print(f"Successfully got notebook path: {notebook_file_path}")

    notebook_dir = os.path.split(notebook_file_path)[0]
    print(f"Notebook directory: {notebook_dir}")

    os.chdir(notebook_dir)
    print(f"Successfully changed CWD to: {os.getcwd()}")
except Exception as e:
    print(f"An error occurred: {e}", "\nFull traceback:", sep="\n")
    traceback.print_exc()

In [3]:
# Core data processing libraries
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from pathlib import Path

# Database libraries
from sqlalchemy import create_engine, text

# Progress tracking
from tqdm.notebook import tqdm

# Environment variables
from dotenv import load_dotenv

# Logging
import logging
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')

## 2. Configuration

Load configuration from environment variables or use defaults.

In [4]:
# Read settings from the .env file located two directory levels above the
# current working directory.
dotenv_path = Path("../..") / ".env"
load_dotenv(dotenv_path)

# Connection parameters: every value is taken from the environment when it is
# set there and otherwise falls back to the default given here.
DB_CONFIG = dict(
    host=os.environ.get('POSTGRES_HOST', 'timescaledb'),  # container service name
    port=int(os.environ.get('POSTGRES_PORT', 5432)),
    database=os.environ.get('POSTGRES_DB', 'perocube'),
    user=os.environ.get('POSTGRES_USER', 'postgres'),
    password=os.environ.get('POSTGRES_PASSWORD', 'postgres'),
)

# Show where we are going to connect; the password is left out on purpose.
print(f"Database connection: {DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']} as {DB_CONFIG['user']}")

# Absolute path of the data directory, resolved against the working directory.
ROOT_DIRECTORY = os.fspath(Path("sample_data/datasets/PeroCube-sample-data").resolve())

# Regex for MPP file names; group 1 is the board number, group 2 the channel.
MPP_FILE_PATTERN = r"output_board(\d+)_channel(\d+)"

# Number of rows written per database batch.
BATCH_SIZE = 5000

# Settings consumed by the validation helpers.
VALIDATION_CONFIG = {
    # Master switch for validation
    'enabled': False,
    # Drop rows containing NaN values when validation runs
    'remove_nan': True,
    # Optional check of measurements against the physical ranges below
    'validate_ranges': False,
    'ranges': {
        'voltage': {'min': 0, 'max': 100},   # V
        'current': {'min': -1, 'max': 100},  # mA
        'power': {'min': 0, 'max': 1000},    # mW
    },
}

def print_validation_config():
    """Write the active validation settings to stdout."""
    cfg = VALIDATION_CONFIG

    print("\nData Validation Configuration:")
    print(f"- Validation enabled: {cfg['enabled']}")
    print(f"- Remove NaN values: {cfg['remove_nan']}")

    # Range limits only matter when both switches are on.
    if not (cfg['enabled'] and cfg['validate_ranges']):
        print("\nPhysical value validation is disabled")
        return

    print("\nPhysical value ranges:")
    for name, bounds in cfg['ranges'].items():
        print(f"- {name}: {bounds['min']} to {bounds['max']}")

## 3. Utility Functions

Helper functions for database connection and data validation.

In [5]:
def create_db_connection(config=DB_CONFIG):
    """
    Create a SQLAlchemy database engine from configuration and verify it.

    Args:
        config: Dictionary with the keys 'user', 'password', 'host', 'port'
            and 'database'.

    Returns:
        SQLAlchemy engine instance. The full connection URL, password
        included, is attached to it as ``engine.connection_string``.

    Raises:
        Exception: Any error raised while building the engine or running the
            test query is logged and re-raised unchanged.
    """
    # Local import: keeps this function independent of additional
    # module-level imports.
    from sqlalchemy.engine import URL

    try:
        # Assemble the URL from its components rather than interpolating them
        # into a string. Credentials containing characters such as '@', ':'
        # or '/' would otherwise yield a malformed or misparsed URL.
        url = URL.create(
            drivername="postgresql",
            username=config['user'],
            password=config['password'],
            host=config['host'],
            port=config['port'],
            database=config['database'],
        )
        engine = create_engine(url)

        # Keep the string form available for external access. It is rendered
        # with the real password, so treat it as a secret.
        engine.connection_string = url.render_as_string(hide_password=False)

        # Test the connection with a trivial round trip.
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
            logging.info(f"Database connection successful: {config['host']}:{config['port']}/{config['database']}")
        return engine
    except Exception as e:
        logging.error(f"Database connection failed: {str(e)}")
        raise

def validate_mpp_data(df, config=None):
    """
    Validate MPP data according to the specified configuration.

    The input DataFrame is never modified; a cleaned copy is returned.

    Args:
        df: DataFrame containing MPP measurements.
        config: Dictionary with the keys 'enabled', 'remove_nan',
            'validate_ranges' and 'ranges'. When omitted, the module-level
            VALIDATION_CONFIG is looked up at call time, so a configuration
            that was re-created after this function was defined is honoured.

    Returns:
        Tuple ``(cleaned_df, stats)``. ``stats`` holds 'initial_count',
        'final_count' and 'removed', a dict mapping each reason to the number
        of ROWS dropped for it. For an empty input 'removed' is an empty dict.
    """
    if config is None:
        config = VALIDATION_CONFIG

    if df.empty:
        return df, {'initial_count': 0, 'final_count': 0, 'removed': {}}

    removed = {
        'nan_values': 0,
        'voltage_range': 0,
        'current_range': 0,
        'power_range': 0,
    }
    stats = {
        'initial_count': len(df),
        'final_count': None,
        'removed': removed,
    }

    # Always ensure timestamp is in UTC. assign() builds a new frame, so the
    # caller's DataFrame keeps its original column.
    if 'timestamp' in df.columns:
        df = df.assign(timestamp=pd.to_datetime(df['timestamp'], utc=True))

    # Remove rows containing NaN if configured; count rows, not NaN cells,
    # so the figure matches what was actually dropped.
    if config['remove_nan']:
        rows_before = len(df)
        df = df.dropna()
        removed['nan_values'] = rows_before - len(df)

    # Apply physical value validation if enabled.
    check_ranges = config['enabled'] and config['validate_ranges']
    if check_ranges:
        for measure, limits in config['ranges'].items():
            if measure not in df.columns:
                continue
            in_range = df[measure].between(limits['min'], limits['max'])
            removed[f'{measure}_range'] = int((~in_range).sum())
            df = df[in_range]

    stats['final_count'] = len(df)

    # Log validation results.
    logging.info("Validation statistics:")
    logging.info(f"Initial records: {stats['initial_count']}")
    if config['remove_nan']:
        logging.info(f"Removed NaN values: {removed['nan_values']}")
    if check_ranges:
        for measure in config['ranges']:
            # .get(): a configured measure without a matching column has no
            # counter and must not abort the summary.
            dropped = removed.get(f'{measure}_range', 0)
            if dropped > 0:
                logging.info(f"Removed {measure} out of range: {dropped}")
    logging.info(f"Final records: {stats['final_count']}")

    return df, stats

def check_existing_data(engine, board, channel, timestamps):
    """
    Report whether mpp_measurement already holds rows for a board/channel
    inside the time span covered by the given timestamps.

    Args:
        engine: SQLAlchemy engine
        board: Board number
        channel: Channel number
        timestamps: List of timestamps to check

    Returns:
        False for an empty timestamp list; otherwise True when at least one
        row lies between the smallest and the largest timestamp (inclusive).
    """
    # Nothing to look up for an empty batch.
    if not timestamps:
        return False

    # Only the outer bounds are probed, not every individual timestamp.
    bounds = {
        "min_timestamp": min(timestamps),
        "max_timestamp": max(timestamps),
        "board_id": board,
        "channel_id": channel,
    }

    overlap_query = text("""
        SELECT COUNT(*)
        FROM mpp_measurement
        WHERE timestamp BETWEEN :min_timestamp AND :max_timestamp
          AND tracking_channel_board = :board_id
          AND tracking_channel_channel = :channel_id
    """)

    with engine.connect() as conn:
        existing_rows = conn.execute(overlap_query, bounds).scalar()

    # Any hit at all counts as "data exists".
    return existing_rows > 0

In [6]:
def ensure_tracking_channels_exist(engine, channels):
    """
    Make sure each (board, channel) pair has a row in mpp_tracking_channel.

    Every pair is looked up first and inserted only when the lookup returns
    no row. All inserts are committed together after the loop.

    Args:
        engine: SQLAlchemy engine
        channels: Iterable of (board, channel) tuples to check/create

    Returns:
        Set of the (board, channel) tuples that were inserted

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    try:
        newly_created = set()
        with engine.connect() as conn:
            for board, channel in channels:
                key = {"board": board, "channel": channel}

                # Look the pair up before deciding whether to insert it.
                existing_row = conn.execute(
                    text("""
                    SELECT 1 
                    FROM mpp_tracking_channel 
                    WHERE board = :board AND channel = :channel
                    """),
                    key
                ).fetchone()

                if not existing_row:
                    conn.execute(
                        text("""
                        INSERT INTO mpp_tracking_channel (board, channel) 
                        VALUES (:board, :channel)
                        """),
                        key
                    )
                    newly_created.add((board, channel))
                    logging.info(f"Created tracking channel: board {board}, channel {channel}")

            # A single commit covers every insert made in the loop.
            conn.commit()
        return newly_created
    except Exception as e:
        logging.error(f"Error ensuring tracking channels exist: {str(e)}")
        raise

## 4. MPP Data Processing Function

In [7]:
def process_mpp_files(root_dir, engine, pattern=MPP_FILE_PATTERN, batch_size=BATCH_SIZE, validate=None):
    """
    Find MPP data files below ``root_dir`` and upload them to the
    ``mpp_measurement`` table.

    The directory tree is walked recursively. A directory is considered when
    any component of its path starts with 'data'; inside it, every file whose
    name matches ``pattern`` is read as tab-separated text with the columns
    timestamp, power, current, voltage, tagged with its board and channel,
    and appended to the table. A file is skipped when it is empty, when
    validation removes all of its rows, or when check_existing_data reports
    existing rows for its board, channel and time span.

    Each file is written inside a single transaction, so a failure part-way
    through leaves no partial rows behind (which the existing-data check
    would otherwise mistake for a completed upload on the next run).

    Args:
        root_dir: The root directory to start the search from.
        engine:   SQLAlchemy engine for database connection.
        pattern:  Regex pattern to match MPP files; group 1 is the board
                  number, group 2 the channel number.
        batch_size: Number of rows to insert in one batch.
        validate:   Whether to perform data validation. When None (the
                    default), VALIDATION_CONFIG['enabled'] is read at call
                    time, so changes made to the configuration after this
                    function was defined take effect.

    Returns:
        Dictionary with statistics about the processing. 'end_time' and
        'duration_seconds' are present only when the file loop was reached.
    """
    if validate is None:
        validate = VALIDATION_CONFIG['enabled']

    # Statistics to return
    stats = {
        'files_processed': 0,
        'files_skipped': 0,
        'files_error': 0,
        'rows_inserted': 0,
        'start_time': datetime.now(timezone.utc),
        'total_files': 0,
        'channels_created': 0
    }

    # Convert to Path object for better path handling
    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    # Compile the regex pattern once; it is applied to every file name.
    pattern_compiled = re.compile(pattern)

    # First, collect all matching filepaths and required channels
    matching_files = []
    required_channels = set()
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): this inspects every component of the walked path,
        # including the components of root_dir itself and its ancestors, so
        # a root below e.g. ".../datasets/..." makes every directory match.
        # Confirm whether only directories below root_dir were meant.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if not match:
                continue
            board = int(match.group(1))
            channel = int(match.group(2))
            required_channels.add((board, channel))
            matching_files.append((Path(dirpath) / filename, board, channel))

    stats['total_files'] = len(matching_files)
    logging.info(f"Found {len(matching_files)} MPP data files to process")

    # Ensure all required tracking channels exist
    logging.info(f"Ensuring {len(required_channels)} tracking channels exist...")
    try:
        created_channels = ensure_tracking_channels_exist(engine, required_channels)
        stats['channels_created'] = len(created_channels)
        if created_channels:
            logging.info(f"Created {len(created_channels)} new tracking channels")
    except Exception as e:
        logging.error(f"Failed to ensure tracking channels exist: {str(e)}")
        return stats

    # Now, process each file
    with tqdm(total=len(matching_files), desc="Processing MPP Files") as pbar:
        for filepath, board, channel in matching_files:
            try:
                logging.info(f"Processing MPP file: {filepath} (Board: {board}, Channel: {channel})")

                # Read the file into a pandas DataFrame
                df = pd.read_csv(filepath, sep='\t',
                                 names=['timestamp', 'power', 'current', 'voltage'])

                if df.empty:
                    logging.warning(f"Empty file: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                # Ensure timestamp is in UTC format
                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

                # Add board and channel information to the DataFrame
                df['tracking_channel_board'] = board
                df['tracking_channel_channel'] = channel

                # Validate data if enabled
                if validate:
                    df, validation_stats = validate_mpp_data(df)
                    if df.empty:
                        logging.warning(f"All data filtered during validation: {filepath}")
                        stats['files_skipped'] += 1
                        continue

                # Check for existing data. check_existing_data only looks at
                # the smallest and largest timestamp, so pass just those two
                # instead of materialising every timestamp as a Python list.
                time_bounds = [df['timestamp'].min(), df['timestamp'].max()]
                if check_existing_data(engine, board, channel, time_bounds):
                    logging.info(f"Data already exists for {filepath}. Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                # Upload the whole file in one transaction, written in
                # batches of batch_size rows: either all rows are committed
                # or, on an error, none are.
                total_rows = len(df)
                with engine.begin() as conn:
                    df.to_sql('mpp_measurement', conn, if_exists='append',
                              index=False, chunksize=batch_size)

                stats['rows_inserted'] += total_rows
                stats['files_processed'] += 1
                logging.info(f"Successfully uploaded {total_rows} rows from {filepath}")

            except Exception as e:
                # One bad file must not stop the run; count it and move on.
                logging.error(f"Error processing {filepath}: {str(e)}")
                stats['files_error'] += 1
            finally:
                # Advance the bar exactly once per file, whatever the outcome.
                pbar.update(1)

    # Calculate duration
    stats['end_time'] = datetime.now(timezone.utc)
    stats['duration_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds()

    logging.info(f"Processing complete. Processed {stats['files_processed']} files, "
                 f"skipped {stats['files_skipped']} files, "
                 f"errors in {stats['files_error']} files. "
                 f"Created {stats['channels_created']} tracking channels. "
                 f"Inserted {stats['rows_inserted']} data points in {stats['duration_seconds']:.2f} seconds.")

    return stats

## 5. Execute the Data Upload Process

In [8]:
# Open the engine used by the cells below. A failure is logged and then
# re-raised, so execution of this cell stops with the original error.
try:
    engine = create_db_connection()
    logging.info("Database connection established successfully")
except Exception as exc:
    logging.error(f"Failed to connect to database: {exc!s}")
    raise

In [9]:
# engine.connection_string

## Configure Data Validation

Review and adjust the validation settings before processing data.

In [10]:
# Print the validation configuration as it currently stands, i.e. before any
# of the optional overrides below are applied.
print_validation_config()

# To change validation settings, uncomment and adjust the lines below.
# VALIDATION_CONFIG['enabled'] = True
# VALIDATION_CONFIG['validate_ranges'] = True
# VALIDATION_CONFIG['ranges']['voltage']['max'] = 150  # Adjust range if needed

In [11]:
# Run the data processing with the configured root directory.
# `stats` receives the statistics dictionary returned by process_mpp_files;
# the results summary further down reads it.
print(f"Starting MPP data processing from directory: {ROOT_DIRECTORY}")
stats = process_mpp_files(ROOT_DIRECTORY, engine)

## 6. Results Summary

After processing the MPP data files, here's a summary of what was accomplished:

In [12]:
def format_duration(seconds):
    """Format a duration in seconds as a human-readable string.

    Args:
        seconds: Duration in seconds (int or float). Negative values are
            treated as zero.

    Returns:
        "Hh Mm S.Ss", "Mm S.Ss" or "S.Ss", depending on the magnitude, with
        seconds shown to one decimal place.
    """
    # Round to the displayed precision BEFORE splitting into units. Rounding
    # only the seconds part afterwards can print "60.0s" (e.g. for 59.97)
    # instead of carrying over into the next minute or hour.
    total = round(max(float(seconds), 0.0), 1)

    hours = int(total // 3600)
    minutes = int((total % 3600) // 60)
    secs = total % 60

    if hours > 0:
        return f"{hours}h {minutes}m {secs:.1f}s"
    if minutes > 0:
        return f"{minutes}m {secs:.1f}s"
    return f"{secs:.1f}s"

def format_number(n):
    """Return *n* rendered with commas as thousands separators."""
    return format(n, ",")

# Show the processing statistics, or a hint when no run produced any.
if 'stats' not in locals():
    print("❌ No statistics available - processing may have failed")
    print("   Please check the logs above for errors.")
else:
    # Per-file counters, printed as right-aligned values after fixed labels.
    print("📊 File Processing")
    print("━━━━━━━━━━━━━━━")
    for label, key in (
        ("📁 Total files found:           ", 'total_files'),
        ("✅ Successfully processed:      ", 'files_processed'),
        ("⏭️  Skipped (existing/empty):    ", 'files_skipped'),
        ("❌ Errors during processing:    ", 'files_error'),
    ):
        print(f"{label}{format_number(stats.get(key, 0)):>10}")

    inserted = stats.get('rows_inserted', 0)
    print("\n📈 Data Statistics")
    print("━━━━━━━━━━━━━━━")
    print(f"📝 Data points inserted:        {format_number(inserted):>10}")

    # Timing figures exist only when the run got as far as the file loop.
    if 'duration_seconds' in stats:
        elapsed = stats['duration_seconds']
        duration = format_duration(elapsed)
        print("\n⚡ Performance Metrics")
        print("━━━━━━━━━━━━━━━━━━")
        print(f"⏱️  Total processing time:      {duration:>10}")

        if inserted > 0 and elapsed > 0:
            throughput = inserted / elapsed
            print(f"🚀 Processing speed:           {format_number(int(throughput)):>10} rows/sec")

    # Database verification: any failure here is reported, not raised.
    try:
        with engine.connect() as conn:
            total_count = conn.execute(text("SELECT COUNT(*) FROM mpp_measurement")).scalar()

            print("\n🗄️  Database Status")
            print("━━━━━━━━━━━━━━━━")
            print(f"💾 Total records in database:  {format_number(total_count):>10}")

            new_channels = stats.get('channels_created', 0)
            if new_channels > 0:
                print(f"🔌 New channels created:       {format_number(new_channels):>10}")
    except Exception as e:
        print("\n⚠️  Could not verify database status:")
        print(f"   {str(e)}")