# Temperature & Irradiance Data Upload Notebook

This notebook processes and uploads temperature and irradiance sensor data from text files to the TimescaleDB database.

## Purpose

- Scan directories for temperature and irradiance data files
- Register sensor metadata in the database
- Upload measurement data to the TimescaleDB database
- Avoid duplicate data entries

## Prerequisites

- Running TimescaleDB instance (configured in docker-compose.yml)
- Access to directory containing temperature and irradiance data files
- Environment variables configured in .env file (for database connection)

## 1. Setup and Imports

Import the required libraries. If an import fails because a package is missing, run the install cell below first and then re-run this cell.

In [None]:
# Core data processing libraries
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, timezone
from pathlib import Path

# Database libraries
from sqlalchemy import create_engine, text

# Progress tracking
from tqdm.notebook import tqdm

# Environment variables
from dotenv import load_dotenv

# Logging
import logging
# Configure the root logger once for the whole notebook: INFO level and above,
# each message prefixed with its timestamp and severity.
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')



In [None]:
# Install required packages if not already installed
!pip install psycopg2-binary sqlalchemy pandas tqdm pathlib python-dotenv
import psycopg2

## 2. Configuration

Load configuration from environment variables or use defaults.

In [None]:
# Pull settings from a .env file located two levels above the current working
# directory (relative paths are resolved against the working directory).
dotenv_path = Path("..") / ".." / ".env"
load_dotenv(dotenv_path)

# Connection settings: every value can be overridden through the environment,
# otherwise the default shown here is used.
DB_CONFIG = dict(
    host=os.environ.get('DB_HOST', 'timescaledb'),
    port=int(os.environ.get('DB_PORT', 5432)),
    database=os.environ.get('DB_NAME', 'perocube'),
    user=os.environ.get('DB_USER', 'postgres'),
    password=os.environ.get('DB_PASSWORD', 'postgres'),
)

# Show where we are going to connect; the password is deliberately left out
print(f"Database connection: {DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']} as {DB_CONFIG['user']}")

# Directory that is scanned for data files
ROOT_DIRECTORY = os.environ.get('DEFAULT_DATA_DIR', "../../sample_data/datasets/PeroCube-sample-data")

# Regular expressions applied to file names; the capture groups carry the
# sensor number and channel (irradiance) or the device ID (temperature)
IRRADIANCE_FILE_PATTERN = r"PT-(\d+)_channel_(\d+)"
TEMPERATURE_FILE_PATTERN = r"m7004_ID_(\w+)"

# Number of rows written per insert batch
BATCH_SIZE = 5000

# Switch for the validation step applied before upload
VALIDATE_DATA = True

# Database table names
IRRADIANCE_SENSOR_TABLE = 'irradiance_sensor'
IRRADIANCE_MEASUREMENT_TABLE = 'irradiance_measurement'
TEMPERATURE_SENSOR_TABLE = 'temperature_sensor'
TEMPERATURE_MEASUREMENT_TABLE = 'temperature_measurement'

## 3. Utility Functions

Helper functions for database connection and data validation.

In [None]:
def create_db_connection(config=DB_CONFIG):
    """
    Create a SQLAlchemy database engine from configuration and verify it works.

    The connection URL is assembled with ``URL.create`` instead of string
    formatting, so user names and passwords containing characters such as
    ``@``, ``:``, ``/`` or ``%`` are escaped correctly.

    Args:
        config: Dictionary with the keys 'user', 'password', 'host', 'port'
            and 'database'

    Returns:
        SQLAlchemy engine instance

    Raises:
        Exception: Whatever error engine creation or the test query raised;
            it is logged and then re-raised unchanged.
    """
    # Local import: the notebook's import cell only pulls in create_engine and text.
    from sqlalchemy.engine import URL

    try:
        connection_url = URL.create(
            drivername="postgresql",
            username=config['user'],
            password=config['password'],
            host=config['host'],
            port=config['port'],
            database=config['database'],
        )
        engine = create_engine(connection_url)
        # Engines connect lazily, so run a trivial query to fail fast on bad settings
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        logging.info(f"Database connection successful: {config['host']}:{config['port']}/{config['database']}")
        return engine
    except Exception as e:
        logging.error(f"Database connection failed: {str(e)}")
        raise

def validate_irradiance_data(df):
    """
    Clean an irradiance DataFrame before it is uploaded.

    Rows containing any NaN are discarded, the 'timestamp' column (if present)
    is converted to timezone-aware UTC datetimes, and rows whose 'irradiance'
    (if present) lies outside the half-open interval [0, 1500) are removed.
    The number of removed rows is logged when it is non-zero.

    Args:
        df: DataFrame containing irradiance measurements

    Returns:
        The cleaned DataFrame; an empty input is returned as-is
    """
    if df.empty:
        return df

    rows_before = len(df)

    # Incomplete rows go first
    cleaned = df.dropna()

    # Normalise timestamps to UTC
    if 'timestamp' in cleaned.columns:
        cleaned['timestamp'] = pd.to_datetime(cleaned['timestamp'], utc=True)

    # Keep only values inside the accepted range: 0 <= irradiance < 1500 W/m²
    if 'irradiance' in cleaned.columns:
        non_negative = cleaned['irradiance'] >= 0
        below_limit = cleaned['irradiance'] < 1500
        cleaned = cleaned[non_negative & below_limit]

    # Report how much was thrown away
    rows_removed = rows_before - len(cleaned)
    if rows_removed > 0:
        logging.info(f"Filtered out {rows_removed} invalid irradiance records")

    return cleaned

def validate_temperature_data(df):
    """
    Clean a temperature DataFrame before it is uploaded.

    Rows containing any NaN are discarded, the 'timestamp' column (if present)
    is converted to timezone-aware UTC datetimes, and rows whose 'temperature'
    (if present) lies outside the closed interval [-50, 100] are removed.
    The number of removed rows is logged when it is non-zero.

    Args:
        df: DataFrame containing temperature measurements

    Returns:
        The cleaned DataFrame; an empty input is returned as-is
    """
    if df.empty:
        return df

    rows_before = len(df)

    # Incomplete rows go first
    cleaned = df.dropna()

    # Normalise timestamps to UTC
    if 'timestamp' in cleaned.columns:
        cleaned['timestamp'] = pd.to_datetime(cleaned['timestamp'], utc=True)

    # Keep only values inside the accepted outdoor range: -50°C to 100°C inclusive
    if 'temperature' in cleaned.columns:
        not_too_cold = cleaned['temperature'] >= -50
        not_too_hot = cleaned['temperature'] <= 100
        cleaned = cleaned[not_too_cold & not_too_hot]

    # Report how much was thrown away
    rows_removed = rows_before - len(cleaned)
    if rows_removed > 0:
        logging.info(f"Filtered out {rows_removed} invalid temperature records")

    return cleaned

def check_existing_data(engine, table, sensor_id_col, sensor_id, timestamp, id_field='id'):
    """
    Check if a row already exists for the given sensor at the given timestamp.

    Args:
        engine: SQLAlchemy engine
        table: Table name (interpolated into the SQL text; pass trusted names only)
        sensor_id_col: Column name for sensor ID (interpolated, trusted names only)
        sensor_id: Sensor ID value (bound as a query parameter)
        timestamp: Timestamp to check (bound as a query parameter)
        id_field: Name of the column selected by the lookup (interpolated,
            trusted names only)

    Returns:
        True if a matching row exists. False if none exists or if no usable
        timestamp was supplied (None, NaT/NaN or any other falsy value); in
        that case the database is not queried.
    """
    # Missing values are tested explicitly instead of relying on truthiness alone,
    # so a NaT/NaN timestamp is never sent to the database.
    if timestamp is None or pd.isna(timestamp) or not timestamp:
        return False

    # Identifiers cannot be bound as parameters, so they are formatted into the
    # statement; the values being compared are bound.
    query = text(f"""
        SELECT {id_field}
        FROM {table}
        WHERE timestamp = :timestamp
          AND {sensor_id_col} = :sensor_id
        LIMIT 1
    """)

    with engine.connect() as conn:
        row = conn.execute(query, {
            "timestamp": timestamp,
            "sensor_id": sensor_id
        }).fetchone()

    # A returned row means the data point is already stored
    return row is not None

def get_sensor_id(engine, sensor_table, name_col, name_val, channel_col=None, channel_val=None):
    """
    Look up the ID of a sensor in the database.

    The ID is read from the column named ``<sensor_table>_id``. The lookup
    matches on the name column and, when both a channel column and a channel
    value are given, on the channel as well. Nothing is inserted: a missing
    sensor is logged as a warning and reported as None.

    Args:
        engine: SQLAlchemy engine
        sensor_table: Sensor table name
        name_col: Column name for sensor name
        name_val: Sensor name value
        channel_col: Column name for channel (optional)
        channel_val: Channel value (optional)

    Returns:
        Sensor ID of the first matching row, or None if there is no match
    """
    # Decide between the name-only lookup and the name + channel lookup
    if not channel_col or channel_val is None:
        query = text(f"""
            SELECT {sensor_table}_id FROM {sensor_table} 
            WHERE {name_col} = :name_val
        """)
        params = {"name_val": name_val}
    else:
        query = text(f"""
            SELECT {sensor_table}_id FROM {sensor_table} 
            WHERE {name_col} = :name_val AND {channel_col} = :channel_val
        """)
        params = {"name_val": name_val, "channel_val": channel_val}

    # Fetch at most the first matching row
    with engine.connect() as conn:
        found = conn.execute(query, params).fetchone()

    if not found:
        logging.warning(f"Sensor not found: {name_val} in {sensor_table}")
        return None

    # The ID is the only selected column
    return found[0]

## 4. Sensor Registration Functions

In [None]:
def register_irradiance_sensors(root_dir, engine, pattern=IRRADIANCE_FILE_PATTERN):
    """
    Scan directories for irradiance sensor files and register unique sensors in the database.

    A sensor is identified by its name ("PT-<number>") and channel, both taken
    from the file name. Sensors already present in the database are counted but
    left untouched; new ones are inserted with empty installation metadata.
    A failed insert is logged and does not stop the remaining registrations.

    Args:
        root_dir: The root directory to start the search from
        engine: SQLAlchemy engine for database connection
        pattern: Regex pattern to match irradiance files; group 1 is the sensor
            number, group 2 the channel

    Returns:
        Dictionary with the keys 'sensors_found', 'sensors_registered' and
        'sensors_existing'. All counts are zero if root_dir does not exist.
    """
    stats = {
        'sensors_found': 0,
        'sensors_registered': 0,
        'sensors_existing': 0
    }

    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    pattern_compiled = re.compile(pattern)

    # Unique sensors keyed by (name, channel). A dict de-duplicates and keeps
    # discovery order without rebuilding a DataFrame for every new sensor.
    sensors = {}
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): the parts include the components of root_dir itself, so a
        # root such as ".../datasets/..." makes every directory qualify - confirm
        # that this is intended.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if not match:
                continue
            sensor_name = "PT-" + match.group(1)
            channel = int(match.group(2))  # Ensure channel is an integer
            sensors.setdefault((sensor_name, channel), {
                'name': sensor_name,
                'channel': channel,
                'date_installed': None,      # Would need additional data source
                'location': None,            # Would need additional data source
                'installation_angle': None   # Would need additional data source
            })
    stats['sensors_found'] = len(sensors)

    # Both statements are loop-invariant, so build them once
    select_query = text(f"""
        SELECT {IRRADIANCE_SENSOR_TABLE}_id FROM {IRRADIANCE_SENSOR_TABLE}
        WHERE name = :name AND channel = :channel
    """)
    insert_query = text(f"""
        INSERT INTO {IRRADIANCE_SENSOR_TABLE} (name, channel, date_installed, location, installation_angle)
        VALUES (:name, :channel, :date_installed, :location, :installation_angle)
    """)

    for sensor in sensors.values():
        # A failure of the existence check propagates to the caller
        with engine.connect() as conn:
            exists = conn.execute(
                select_query,
                {"name": sensor['name'], "channel": sensor['channel']}
            ).fetchone() is not None

        if exists:
            stats['sensors_existing'] += 1
            continue

        try:
            with engine.connect() as conn:
                conn.execute(insert_query, sensor)
                conn.commit()
        except Exception as e:
            logging.error(f"Failed to register sensor {sensor['name']} channel {sensor['channel']}: {str(e)}")
        else:
            stats['sensors_registered'] += 1
            logging.info(f"Registered irradiance sensor: {sensor['name']} channel {sensor['channel']}")

    logging.info(f"Found {stats['sensors_found']} unique irradiance sensors")
    logging.info(f"Registered {stats['sensors_registered']} new irradiance sensors")
    logging.info(f"Found {stats['sensors_existing']} existing irradiance sensors")

    return stats

def register_temperature_sensors(root_dir, engine, pattern=TEMPERATURE_FILE_PATTERN):
    """
    Scan directories for temperature sensor files and register unique sensors in the database.

    A sensor is identified by the device ID taken from the file name. Sensors
    already present in the database are counted but left untouched; new ones
    are inserted with empty installation metadata. A failed insert is logged
    and does not stop the remaining registrations.

    Args:
        root_dir: The root directory to start the search from
        engine: SQLAlchemy engine for database connection
        pattern: Regex pattern to match temperature files; group 1 is the device ID

    Returns:
        Dictionary with the keys 'sensors_found', 'sensors_registered' and
        'sensors_existing'. All counts are zero if root_dir does not exist.
    """
    stats = {
        'sensors_found': 0,
        'sensors_registered': 0,
        'sensors_existing': 0
    }

    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    pattern_compiled = re.compile(pattern)

    # Unique sensors keyed by device ID. A dict de-duplicates and keeps
    # discovery order without rebuilding a DataFrame for every new sensor.
    sensors = {}
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): the parts include the components of root_dir itself, so a
        # root such as ".../datasets/..." makes every directory qualify - confirm
        # that this is intended.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if not match:
                continue
            device_id = match.group(1)
            sensors.setdefault(device_id, {
                'device_id': device_id,
                'date_installed': None,  # Would need additional data source
                'location': None         # Would need additional data source
            })
    stats['sensors_found'] = len(sensors)

    # Both statements are loop-invariant, so build them once
    select_query = text(f"""
        SELECT {TEMPERATURE_SENSOR_TABLE}_id FROM {TEMPERATURE_SENSOR_TABLE}
        WHERE device_id = :device_id
    """)
    insert_query = text(f"""
        INSERT INTO {TEMPERATURE_SENSOR_TABLE} (device_id, date_installed, location)
        VALUES (:device_id, :date_installed, :location)
    """)

    for sensor in sensors.values():
        # A failure of the existence check propagates to the caller
        with engine.connect() as conn:
            exists = conn.execute(
                select_query,
                {"device_id": sensor['device_id']}
            ).fetchone() is not None

        if exists:
            stats['sensors_existing'] += 1
            continue

        try:
            with engine.connect() as conn:
                conn.execute(insert_query, sensor)
                conn.commit()
        except Exception as e:
            logging.error(f"Failed to register temperature sensor {sensor['device_id']}: {str(e)}")
        else:
            stats['sensors_registered'] += 1
            logging.info(f"Registered temperature sensor: {sensor['device_id']}")

    logging.info(f"Found {stats['sensors_found']} unique temperature sensors")
    logging.info(f"Registered {stats['sensors_registered']} new temperature sensors")
    logging.info(f"Found {stats['sensors_existing']} existing temperature sensors")

    return stats

Data written to staging_irradiance_sensor table.


## 5. Data Processing Functions

In [None]:
def process_irradiance_files(root_dir, engine, pattern=IRRADIANCE_FILE_PATTERN, batch_size=BATCH_SIZE, validate=VALIDATE_DATA):
    """
    Process irradiance data files and upload to database.

    Each matching file is read as tab-separated text with the columns
    timestamp, raw_reading and irradiance. A file is skipped when its sensor is
    not registered, when no usable rows remain, or when its last data point is
    already stored. All batches of one file are written in a single
    transaction, so a failure part-way through leaves no partial upload behind
    that a later run would duplicate. Errors in one file are logged and counted
    and do not stop the remaining files.

    Args:
        root_dir: The root directory to start the search from
        engine: SQLAlchemy engine for database connection
        pattern: Regex pattern to match irradiance files; group 1 is the sensor
            number, group 2 the channel
        batch_size: Number of rows to insert in one batch
        validate: Whether to perform data validation

    Returns:
        Dictionary with the keys 'files_processed', 'files_skipped',
        'files_error', 'rows_inserted', 'start_time' and 'total_files'.
        'end_time' and 'duration_seconds' are added unless root_dir does not
        exist.
    """
    stats = {
        'files_processed': 0,
        'files_skipped': 0,
        'files_error': 0,
        'rows_inserted': 0,
        'start_time': datetime.now(timezone.utc),
        'total_files': 0
    }

    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    pattern_compiled = re.compile(pattern)

    # Collect every matching file up front (with its regex match) so the
    # progress bar knows the total and names are not matched a second time.
    matching_files = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): the parts include the components of root_dir itself, so a
        # root such as ".../datasets/..." makes every directory qualify - confirm
        # that this is intended.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if match:
                matching_files.append((Path(dirpath) / filename, match))

    stats['total_files'] = len(matching_files)
    logging.info(f"Found {len(matching_files)} irradiance data files to process")

    with tqdm(total=len(matching_files), desc="Processing Irradiance Files") as pbar:
        for filepath, match in matching_files:
            try:
                sensor_name = "PT-" + match.group(1)
                channel = int(match.group(2))

                logging.info(f"Processing irradiance file: {filepath} (Sensor: {sensor_name}, Channel: {channel})")

                sensor_id = get_sensor_id(
                    engine,
                    IRRADIANCE_SENSOR_TABLE,
                    'name', sensor_name,
                    'channel', channel
                )
                if sensor_id is None:
                    logging.warning(f"Sensor not found: {sensor_name} (Channel: {channel}). Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                df = pd.read_csv(
                    filepath,
                    sep='\t',
                    names=['timestamp', 'raw_reading', 'irradiance']
                )
                if df.empty:
                    logging.warning(f"Empty file: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                # Ensure timestamp is in UTC format
                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
                df['irradiance_sensor_id'] = sensor_id

                # Rows without a timestamp can never be stored (the column is
                # NOT NULL), so drop them even when validation is switched off.
                has_timestamp = df['timestamp'].notna()
                if not has_timestamp.all():
                    logging.warning(f"Dropping {int((~has_timestamp).sum())} rows without a timestamp: {filepath}")
                    df = df[has_timestamp]
                if df.empty:
                    logging.warning(f"No rows with a valid timestamp: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                if validate:
                    df = validate_irradiance_data(df)
                    if df.empty:
                        logging.warning(f"All data filtered during validation: {filepath}")
                        stats['files_skipped'] += 1
                        continue

                # The last data point serves as the marker for "file already uploaded"
                last_timestamp = df['timestamp'].iloc[-1]
                data_exists = check_existing_data(
                    engine,
                    IRRADIANCE_MEASUREMENT_TABLE,
                    'irradiance_sensor_id',
                    sensor_id,
                    last_timestamp
                )
                if data_exists:
                    logging.info(f"Data already exists for {filepath}. Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                # One transaction per file: committed when every batch succeeded,
                # rolled back as a whole if any batch fails.
                total_rows = len(df)
                with engine.begin() as conn:
                    for start in range(0, total_rows, batch_size):
                        df.iloc[start:start + batch_size].to_sql(
                            IRRADIANCE_MEASUREMENT_TABLE,
                            conn,
                            if_exists='append',
                            index=False
                        )

                stats['rows_inserted'] += total_rows
                stats['files_processed'] += 1
                logging.info(f"Successfully uploaded {total_rows} rows from {filepath}")

            except Exception as e:
                logging.error(f"Error processing {filepath}: {str(e)}")
                stats['files_error'] += 1
            finally:
                # Exactly one progress tick per file, whatever the outcome
                pbar.update(1)

    stats['end_time'] = datetime.now(timezone.utc)
    stats['duration_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds()

    logging.info(f"Processing complete. Processed {stats['files_processed']} files, "
                 f"skipped {stats['files_skipped']} files, "
                 f"errors in {stats['files_error']} files. "
                 f"Inserted {stats['rows_inserted']} data points in {stats['duration_seconds']:.2f} seconds.")

    return stats

def process_temperature_files(root_dir, engine, pattern=TEMPERATURE_FILE_PATTERN, batch_size=BATCH_SIZE, validate=VALIDATE_DATA):
    """
    Process temperature data files and upload to database.

    Each matching file is read as tab-separated text with the columns
    timestamp and temperature. A file is skipped when its sensor is not
    registered, when no usable rows remain, or when its last data point is
    already stored. All batches of one file are written in a single
    transaction, so a failure part-way through leaves no partial upload behind
    that a later run would duplicate. Errors in one file are logged and counted
    and do not stop the remaining files.

    Args:
        root_dir: The root directory to start the search from
        engine: SQLAlchemy engine for database connection
        pattern: Regex pattern to match temperature files; group 1 is the device ID
        batch_size: Number of rows to insert in one batch
        validate: Whether to perform data validation

    Returns:
        Dictionary with the keys 'files_processed', 'files_skipped',
        'files_error', 'rows_inserted', 'start_time' and 'total_files'.
        'end_time' and 'duration_seconds' are added unless root_dir does not
        exist.
    """
    stats = {
        'files_processed': 0,
        'files_skipped': 0,
        'files_error': 0,
        'rows_inserted': 0,
        'start_time': datetime.now(timezone.utc),
        'total_files': 0
    }

    root_path = Path(root_dir)
    if not root_path.exists():
        logging.error(f"Root directory does not exist: {root_dir}")
        return stats

    pattern_compiled = re.compile(pattern)

    # Collect every matching file up front (with its regex match) so the
    # progress bar knows the total and names are not matched a second time.
    matching_files = []
    for dirpath, _dirnames, filenames in os.walk(root_path):
        # NOTE(review): the parts include the components of root_dir itself, so a
        # root such as ".../datasets/..." makes every directory qualify - confirm
        # that this is intended.
        if not any(part.startswith("data") for part in Path(dirpath).parts):
            continue
        for filename in filenames:
            match = pattern_compiled.search(filename)
            if match:
                matching_files.append((Path(dirpath) / filename, match))

    stats['total_files'] = len(matching_files)
    logging.info(f"Found {len(matching_files)} temperature data files to process")

    with tqdm(total=len(matching_files), desc="Processing Temperature Files") as pbar:
        for filepath, match in matching_files:
            try:
                device_id = match.group(1)

                logging.info(f"Processing temperature file: {filepath} (Device ID: {device_id})")

                sensor_id = get_sensor_id(
                    engine,
                    TEMPERATURE_SENSOR_TABLE,
                    'device_id', device_id
                )
                if sensor_id is None:
                    logging.warning(f"Temperature sensor not found: {device_id}. Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                # Assuming format with timestamp and temperature columns
                df = pd.read_csv(
                    filepath,
                    sep='\t',
                    names=['timestamp', 'temperature']
                )
                if df.empty:
                    logging.warning(f"Empty file: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                # Ensure timestamp is in UTC format
                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
                df['temperature_sensor_id'] = sensor_id

                # Rows without a timestamp are dropped even when validation is
                # switched off, so a NaT never reaches the INSERT.
                has_timestamp = df['timestamp'].notna()
                if not has_timestamp.all():
                    logging.warning(f"Dropping {int((~has_timestamp).sum())} rows without a timestamp: {filepath}")
                    df = df[has_timestamp]
                if df.empty:
                    logging.warning(f"No rows with a valid timestamp: {filepath}")
                    stats['files_skipped'] += 1
                    continue

                if validate:
                    df = validate_temperature_data(df)
                    if df.empty:
                        logging.warning(f"All data filtered during validation: {filepath}")
                        stats['files_skipped'] += 1
                        continue

                # The last data point serves as the marker for "file already uploaded"
                last_timestamp = df['timestamp'].iloc[-1]
                data_exists = check_existing_data(
                    engine,
                    TEMPERATURE_MEASUREMENT_TABLE,
                    'temperature_sensor_id',
                    sensor_id,
                    last_timestamp
                )
                if data_exists:
                    logging.info(f"Data already exists for {filepath}. Skipping file.")
                    stats['files_skipped'] += 1
                    continue

                # One transaction per file: committed when every batch succeeded,
                # rolled back as a whole if any batch fails.
                total_rows = len(df)
                with engine.begin() as conn:
                    for start in range(0, total_rows, batch_size):
                        df.iloc[start:start + batch_size].to_sql(
                            TEMPERATURE_MEASUREMENT_TABLE,
                            conn,
                            if_exists='append',
                            index=False
                        )

                stats['rows_inserted'] += total_rows
                stats['files_processed'] += 1
                logging.info(f"Successfully uploaded {total_rows} rows from {filepath}")

            except Exception as e:
                logging.error(f"Error processing {filepath}: {str(e)}")
                stats['files_error'] += 1
            finally:
                # Exactly one progress tick per file, whatever the outcome
                pbar.update(1)

    stats['end_time'] = datetime.now(timezone.utc)
    stats['duration_seconds'] = (stats['end_time'] - stats['start_time']).total_seconds()

    logging.info(f"Processing complete. Processed {stats['files_processed']} files, "
                 f"skipped {stats['files_skipped']} files, "
                 f"errors in {stats['files_error']} files. "
                 f"Inserted {stats['rows_inserted']} data points in {stats['duration_seconds']:.2f} seconds.")

    return stats

## 6. Execute the Data Upload Process

In [None]:
# Open the engine shared by all following cells; a failure is logged and
# re-raised so the notebook run stops here.
try:
    engine = create_db_connection()
except Exception as exc:
    logging.error(f"Failed to connect to database: {str(exc)}")
    raise
else:
    logging.info("Database connection established successfully")

Processing Irradiance Files:   0%|          | 0/34 [00:00<?, ?it/s]

Error processing ./data_20240514/data/PT-104_channel_01.txt: (psycopg2.errors.NotNullViolation) null value in column "timestamp" of relation "irradiance_measurement" violates not-null constraint
DETAIL:  Failing row contains (null, 4428827, 92.46, 90527cac-fcb9-41b0-8c3f-375ef83093b3).

[SQL: INSERT INTO irradiance_measurement (timestamp, raw_reading, irradiance, irradiance_sensor_id) VALUES (%(timestamp__0)s, %(raw_reading__0)s, %(irradiance__0)s, %(irradiance_sensor_id__0)s), (%(timestamp__1)s, %(raw_reading__1)s, %(irradiance__1)s, %(i ... 95309 characters truncated ... 8)s), (%(timestamp__999)s, %(raw_reading__999)s, %(irradiance__999)s, %(irradiance_sensor_id__999)s)]
[parameters: {'irradiance_sensor_id__0': UUID('90527cac-fcb9-41b0-8c3f-375ef83093b3'), 'timestamp__0': datetime.datetime(2024, 4, 24, 20, 57, 59, tzinfo=datetime.timezone.utc), 'raw_reading__0': -107844, 'irradiance__0': -2.251, 'irradiance_sensor_id__1': UUID('90527cac-fcb9-41b0-8c3f-375ef83093b3'), 'timestamp__1': 

In [None]:
# Step 1: make sure every sensor that appears in the data files has a row in
# its sensor table before any measurement is uploaded
print(f"Starting sensor registration from directory: {ROOT_DIRECTORY}")

print("\n1. Registering irradiance sensors...")
irr_sensor_stats = register_irradiance_sensors(root_dir=ROOT_DIRECTORY, engine=engine)

print("\n2. Registering temperature sensors...")
temp_sensor_stats = register_temperature_sensors(root_dir=ROOT_DIRECTORY, engine=engine)

In [None]:
# Step 2: read the irradiance files and write their rows to the database
print("\n3. Processing irradiance data files...")
irr_stats = process_irradiance_files(root_dir=ROOT_DIRECTORY, engine=engine)

In [None]:
# Step 3: read the temperature files and write their rows to the database
print("\n4. Processing temperature data files...")
temp_stats = process_temperature_files(root_dir=ROOT_DIRECTORY, engine=engine)

## 7. Results Summary

In [None]:
# Report what the previous cells did, then cross-check against the database
def _print_file_stats(stats):
    """Print the per-file counters of one processing run."""
    print(f"- Files processed: {stats.get('files_processed', 0)}")
    print(f"- Files skipped: {stats.get('files_skipped', 0)}")
    print(f"- Files with errors: {stats.get('files_error', 0)}")
    print(f"- Rows inserted: {stats.get('rows_inserted', 0)}")
    # The duration is absent when processing stopped before it was measured
    if 'duration_seconds' in stats:
        print(f"- Processing time: {stats['duration_seconds']:.2f} seconds")


print("\n===== UPLOAD SUMMARY =====\n")

# Sensors discovered in the files versus sensors newly written
print("SENSOR REGISTRATION:")
print(f"- Irradiance Sensors: {irr_sensor_stats.get('sensors_found', 0)} found, "
      f"{irr_sensor_stats.get('sensors_registered', 0)} newly registered, "
      f"{irr_sensor_stats.get('sensors_existing', 0)} already existing")
print(f"- Temperature Sensors: {temp_sensor_stats.get('sensors_found', 0)} found, "
      f"{temp_sensor_stats.get('sensors_registered', 0)} newly registered, "
      f"{temp_sensor_stats.get('sensors_existing', 0)} already existing")

# File-level results of both upload runs
print("\nDATA PROCESSING:")
print("Irradiance Data:")
_print_file_stats(irr_stats)

print("\nTemperature Data:")
_print_file_stats(temp_stats)

# Row counts straight from the database; a query failure is reported, not raised
try:
    print("\nDATABASE VERIFICATION:")
    with engine.connect() as conn:
        irradiance_count = conn.execute(
            text(f"SELECT COUNT(*) FROM {IRRADIANCE_MEASUREMENT_TABLE}")).scalar()
        print(f"- Total irradiance measurements: {irradiance_count}")

        temperature_count = conn.execute(
            text(f"SELECT COUNT(*) FROM {TEMPERATURE_MEASUREMENT_TABLE}")).scalar()
        print(f"- Total temperature measurements: {temperature_count}")

        irradiance_sensor_count = conn.execute(
            text(f"SELECT COUNT(*) FROM {IRRADIANCE_SENSOR_TABLE}")).scalar()
        print(f"- Total irradiance sensors: {irradiance_sensor_count}")

        temperature_sensor_count = conn.execute(
            text(f"SELECT COUNT(*) FROM {TEMPERATURE_SENSOR_TABLE}")).scalar()
        print(f"- Total temperature sensors: {temperature_sensor_count}")

except Exception as exc:
    print(f"Could not query database: {str(exc)}")

## 8. Next Steps

After successfully uploading the temperature and irradiance data, you might want to:

1. Analyze correlations between environmental conditions and solar cell performance
2. Set up dashboards to visualize temperature and irradiance trends
3. Create reports comparing different sensors and locations
4. Implement automated data quality monitoring

See the other notebooks in this project for additional data processing and analysis examples.