# MPP Data Upload Notebook

This notebook processes and uploads Maximum Power Point (MPP) tracking data from text files to the TimescaleDB database.

## Purpose

- Scan directories for MPP data files (output_board{X}_channel{Y}.txt files)
- Parse the data into structured format
- Upload the data to the TimescaleDB database
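- Avoid duplicate data entries: a file is skipped when the database already holds rows for the same board and channel within that file's timestamp range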

## Prerequisites

- Running TimescaleDB instance (configured in docker-compose.yml)
- Access to directory containing MPP data files
- Environment variables configured in .env file (for database connection)

## 1. Setup and Imports

Import the required libraries. If an import fails because a package is missing, run the install cell that follows and then re-run the import cell.

In [None]:
# Core data processing libraries
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from pathlib import Path

# Database libraries
from sqlalchemy import create_engine, text

# Progress tracking
from tqdm.notebook import tqdm

# Environment variables
from dotenv import load_dotenv

# Logging
import logging
# Configure the root logger: records at INFO level and above are emitted,
# each formatted as "<time> - <level> - <message>".
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
# Install required packages if not already installed
!pip install psycopg2-binary sqlalchemy pandas tqdm pathlib python-dotenv
import psycopg2

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 5.5 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


## 2. Configuration

Load configuration from environment variables or use defaults.

In [None]:
# Populate the process environment from the project's .env file.
# The path is relative, so it is resolved against the current working
# directory of the kernel.
dotenv_path = Path("../../.env")
load_dotenv(dotenv_path)

# Regex used to recognise MPP files; group 1 is the board number and
# group 2 is the channel number.
MPP_FILE_PATTERN = r"output_board(\d+)_channel(\d+)"

# Number of rows written per database batch
BATCH_SIZE = 5000

# Set to False to upload the parsed rows without running data validation
VALIDATE_DATA = True

# Connection settings: each value comes from the environment, falling back
# to the default shown when the variable is not set.
DB_CONFIG = dict(
    host=os.environ.get('DB_HOST', 'timescaledb'),
    port=int(os.environ.get('DB_PORT', 5432)),
    database=os.environ.get('DB_NAME', 'perocube'),
    user=os.environ.get('DB_USER', 'postgres'),
    password=os.environ.get('DB_PASSWORD', 'postgres'),
)

# Show where we are going to connect; the password is deliberately left out
print(f"Database connection: {DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']} as {DB_CONFIG['user']}")

# Directory that is scanned for MPP data files
ROOT_DIRECTORY = os.environ.get('DEFAULT_DATA_DIR', "../../sample_data/datasets/PeroCube-sample-data")

## 3. Utility Functions

Helper functions for database connection and data validation.

In [None]:
def create_db_connection(config=DB_CONFIG):
    """
    Create a SQLAlchemy database engine from configuration and verify it.

    The user name and password are percent-encoded before being placed in the
    connection URL, so credentials containing characters such as ``@``, ``:``,
    ``/`` or ``%`` do not corrupt the URL.

    Args:
        config: Dictionary containing database connection parameters
            ('user', 'password', 'host', 'port', 'database').

    Returns:
        SQLAlchemy engine instance on which a ``SELECT 1`` probe succeeded.

    Raises:
        Exception: Whatever error engine creation or the probe query raised;
            it is logged and then re-raised unchanged.
    """
    # Function-scope import: only this function needs it.
    from urllib.parse import quote

    engine = None
    try:
        # safe='' also encodes '/', which would otherwise be left untouched.
        user = quote(str(config['user']), safe='')
        password = quote(str(config['password']), safe='')
        connection_string = (
            f"postgresql://{user}:{password}"
            f"@{config['host']}:{config['port']}/{config['database']}"
        )
        engine = create_engine(connection_string)

        # Test the connection; the result of the probe itself is not needed.
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        logging.info(f"Database connection successful: {config['host']}:{config['port']}/{config['database']}")
        return engine
    except Exception as e:
        logging.error(f"Database connection failed: {str(e)}")
        # Release the connection pool of an engine that will never be returned.
        if engine is not None:
            engine.dispose()
        raise

def validate_mpp_data(df):
    """
    Clean a frame of MPP measurements and drop rows that fail basic checks.

    Steps, in order: rows containing NaN are removed, the 'timestamp' column
    (if present) is converted to timezone-aware UTC datetimes, and rows whose
    'voltage', 'current' or 'power' (if present) fall outside the assumed
    ranges are removed. The number of removed rows is logged.

    Args:
        df: DataFrame containing MPP measurements

    Returns:
        Cleaned and validated DataFrame (the input itself when it is empty)
    """
    if df.empty:
        return df

    rows_before = len(df)

    # Incomplete rows go first
    cleaned = df.dropna()

    # Normalise timestamps to UTC
    if 'timestamp' in cleaned.columns:
        cleaned['timestamp'] = pd.to_datetime(cleaned['timestamp'], utc=True)

    # (column, inclusive lower bound, exclusive upper bound) - assumed ranges
    assumed_ranges = (
        ('voltage', 0, 100),
        ('current', -1, 100),
        ('power', 0, 1000),
    )
    for column, lower, upper in assumed_ranges:
        if column not in cleaned.columns:
            continue
        values = cleaned[column]
        cleaned = cleaned[(values >= lower) & (values < upper)]

    # Report how many rows did not survive
    rows_after = len(cleaned)
    if rows_after < rows_before:
        logging.info(f"Filtered out {rows_before - rows_after} invalid records")

    return cleaned

def check_existing_data(engine, board, channel, timestamps):
    """
    Check if data already exists in the database for given parameters.

    Only the smallest and largest timestamp are used: the function reports
    whether ANY row for the given board and channel lies between them
    (inclusive). It does not check each timestamp individually.

    Args:
        engine: SQLAlchemy engine
        board: Board number
        channel: Channel number
        timestamps: Timestamps to check. Any sized collection is accepted
            (list, tuple, pandas Series, ...); None or an empty collection
            means there is nothing to check.

    Returns:
        Boolean indicating if data exists
    """
    # len() instead of truthiness: `not series` raises for pandas/numpy inputs.
    if timestamps is None or len(timestamps) == 0:
        return False

    # For efficiency, just check the min and max timestamps
    min_timestamp = min(timestamps)
    max_timestamp = max(timestamps)

    # EXISTS lets the database stop at the first matching row instead of
    # counting every row in the range.
    query = text("""
        SELECT EXISTS (
            SELECT 1
            FROM mpp_measurement
            WHERE timestamp BETWEEN :min_timestamp AND :max_timestamp
              AND tracking_channel_board = :board_id
              AND tracking_channel_channel = :channel_id
        )
    """)

    # Execute the query
    with engine.connect() as conn:
        result = conn.execute(query, {
            "min_timestamp": min_timestamp,
            "max_timestamp": max_timestamp,
            "board_id": board,
            "channel_id": channel
        })
        exists = result.scalar()

    return bool(exists)

## 4. MPP Data Processing Function

In [None]:
def process_mpp_files(root_dir, engine, pattern=MPP_FILE_PATTERN, batch_size=BATCH_SIZE, validate=VALIDATE_DATA):
    """
    Find MPP data files below ``root_dir``, parse them and upload them to the DB.

    A file is considered when its name matches ``pattern`` and its directory
    path contains a component starting with 'data'. Each file is read as a
    tab-separated table with the columns timestamp, power, current, voltage;
    board and channel numbers are taken from regex groups 1 and 2 of the
    file name. A file is skipped when it is empty, when validation removes
    all of its rows, or when ``check_existing_data`` reports existing rows for
    its board, channel and timestamp range. All rows of one file are written
    in a single transaction, so a failed upload leaves no partial data behind.

    Args:
        root_dir: The root directory to start the search from.
        engine:   SQLAlchemy engine for database connection.
        pattern:  Regex pattern to match MPP files.
        batch_size: Number of rows to insert in one batch.
        validate:   Whether to perform data validation.

    Returns:
        Dictionary with statistics about the processing: 'files_processed',
        'files_skipped', 'files_error', 'rows_inserted', 'start_time',
        'total_files' and, unless ``root_dir`` does not exist, 'end_time'
        and 'duration_seconds'.
    """
    # Statistics to return
    stats = {
        'files_processed': 0,
        'files_skipped': 0,
        'files_error': 0,
        'rows_inserted': 0,
        'start_time': datetime.now(timezone.utc),
        'total_files': 0
    }

    # Convert to Path object for better path handling
    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    # First, collect all matching files together with their regex match
    matching_files = _find_mpp_files(root_path, re.compile(pattern))

    stats['total_files'] = len(matching_files)
    logging.info(f"Found {len(matching_files)} MPP data files to process")

    # Now, process the collected files
    with tqdm(total=len(matching_files), desc="Processing MPP Files") as pbar:
        for filepath, match in matching_files:
            try:
                # Extract board and channel numbers
                board = int(match.group(1))
                channel = int(match.group(2))

                logging.info(f"Processing MPP file: {filepath} (Board: {board}, Channel: {channel})")

                # Read the file into a pandas DataFrame
                df = pd.read_csv(filepath, sep='\t',
                                 names=['timestamp', 'power', 'current', 'voltage'])

                if df.empty:
                    logging.warning(f"Empty file: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                # Ensure timestamp is in UTC format
                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

                # Add board and channel information to the DataFrame
                df['tracking_channel_board'] = board
                df['tracking_channel_channel'] = channel

                # Validate data if enabled
                if validate:
                    df = validate_mpp_data(df)
                    if df.empty:
                        logging.warning(f"All data filtered during validation: {filepath}")
                        stats['files_skipped'] += 1
                        continue

                # Check for existing data
                if check_existing_data(engine, board, channel, df['timestamp'].tolist()):
                    logging.info(f"Data already exists for {filepath}. Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                # One transaction per file: if any batch fails, everything
                # written for this file is rolled back. Otherwise a later run
                # would see the partial rows, treat the file as already
                # uploaded and never insert the rest.
                total_rows = len(df)
                with engine.begin() as conn:
                    df.to_sql('mpp_measurement', conn, if_exists='append',
                              index=False, chunksize=batch_size)

                stats['rows_inserted'] += total_rows
                stats['files_processed'] += 1
                logging.info(f"Successfully uploaded {total_rows} rows from {filepath}")

            except pd.errors.EmptyDataError:
                # read_csv raises this for a file without any content; such a
                # file is skipped like any other empty file, not an error.
                logging.warning(f"Empty file: {filepath}")
                stats['files_skipped'] += 1

            except Exception as e:
                # Deliberately broad: one bad file must not stop the whole run.
                logging.error(f"Error processing {filepath}: {str(e)}")
                stats['files_error'] += 1

            finally:
                # Exactly one progress tick per file, whatever the outcome.
                pbar.update(1)

    # Calculate duration
    stats['end_time'] = datetime.now(timezone.utc)
    stats['duration_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds()

    logging.info(f"Processing complete. Processed {stats['files_processed']} files, "
                 f"skipped {stats['files_skipped']} files, "
                 f"errors in {stats['files_error']} files. "
                 f"Inserted {stats['rows_inserted']} data points in {stats['duration_seconds']:.2f} seconds.")

    return stats


def _find_mpp_files(root_path, pattern_compiled):
    """Return (filepath, regex match) pairs for all MPP files below root_path."""
    found = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): this inspects every component of dirpath, including
        # those of root_path itself, so a root such as ".../datasets/..."
        # makes every sub-directory qualify - confirm this is intended.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if match:
                found.append((Path(dirpath) / filename, match))
    return found

## 5. Execute the Data Upload Process

In [None]:
# Open the database engine used by the remaining cells. A failure is logged
# and re-raised so that the notebook stops here instead of continuing
# without a connection.
try:
    engine = create_db_connection()
except Exception as exc:
    logging.error(f"Failed to connect to database: {str(exc)}")
    raise
else:
    logging.info("Database connection established successfully")

In [None]:
# Run the data processing with the configured root directory.
# The returned statistics are kept in `stats` at notebook scope because the
# Results Summary cell reads them.
print(f"Starting MPP data processing from directory: {ROOT_DIRECTORY}")
stats = process_mpp_files(ROOT_DIRECTORY, engine)

Processing MPP Files:   0%|          | 0/1897 [00:00<?, ?it/s]

## 6. Results Summary

In [None]:
# Report what the processing run did. `stats` only exists when the
# processing cell has been executed, hence the lookup by name.
if 'stats' not in locals():
    print("No statistics available. Processing may have failed.")
else:
    summary_lines = [
        "\nProcessing Summary:",
        f"- Total files found: {stats.get('total_files', 0)}",
        f"- Files successfully processed: {stats.get('files_processed', 0)}",
        f"- Files skipped (empty or existing data): {stats.get('files_skipped', 0)}",
        f"- Files with errors: {stats.get('files_error', 0)}",
        f"- Total data points inserted: {stats.get('rows_inserted', 0)}",
    ]
    print("\n".join(summary_lines))

    # Timing is absent when processing returned before it was computed
    if 'duration_seconds' in stats:
        print(f"- Processing time: {stats['duration_seconds']:.2f} seconds")
        # Throughput is only meaningful for a non-empty, non-instant run
        if stats.get('rows_inserted', 0) > 0 and stats.get('duration_seconds', 0) > 0:
            throughput = stats['rows_inserted'] / stats['duration_seconds']
            print(f"- Throughput: {throughput:.2f} rows/second")

# Cross-check against the table itself; any failure here is only reported
try:
    with engine.connect() as conn:
        total_count = conn.execute(text("SELECT COUNT(*) FROM mpp_measurement")).scalar()
        print(f"\nTotal records in mpp_measurement table: {total_count}")
except Exception as exc:
    print(f"Could not query database: {str(exc)}")