In [4]:
from sqlalchemy import create_engine, text
import pandas as pd
from tqdm.notebook import tqdm
!pip install psycopg2-binary
import psycopg2

import os
import re
import pandas as pd
from tqdm.notebook import tqdm



# Populate sensor tables

## Irradiance sensor

In [13]:
import os
import re
import pandas as pd
from sqlalchemy import create_engine

# Compile the regex pattern for efficiency
pattern = re.compile(r"PT-(\d+)_channel_(\d+)")
root_dir = "."

# --- Database Connection Setup (Replace with your details) ---
engine = create_engine("postgresql://postgres:password@timescaledb:5432/perocube")


try:
    # First, collect all matching filepaths
    matching_files = list()
    irradiance_sensor = pd.DataFrame(columns=['name', 'channel', 'date_installed', 'location', 'installation_angle'])
    existing_sensors = set()  # Use a set for efficient checking

    for dirpath, dirnames, filenames in os.walk(root_dir):
        if any(dir.startswith("data") for dir in os.path.normpath(dirpath).split(os.sep)):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                if pattern.search(filename):
                    match = pattern.search(filename)
                    sensor_name = "PT-" + match.group(1)
                    channel = int(match.group(2))  # Ensure channel is an integer
                    sensor_key = (sensor_name, channel)  # Create a tuple as a unique key

                    # Check if the sensor and channel combination already exists
                    if sensor_key not in existing_sensors:
                        # Create a dictionary representing the sensor data
                        sensor_data = {
                            'name': sensor_name,
                            'channel': channel,
                            'date_installed': None,  # You'll need to determine this
                            'location': None,        # You'll need to determine this
                            'installation_angle': None  # You'll need to determine this
                        }

                        # Append the data to the DataFrame
                        irradiance_sensor = pd.concat([irradiance_sensor, pd.DataFrame([sensor_data])], ignore_index=True)

                        # Add the sensor key to the set of existing sensors
                        existing_sensors.add(sensor_key)

    # Now 'irradiance_sensor' DataFrame is populated
    # You'll still need to determine how to populate 'date_installed', 'location', and 'installation_angle'

    # --- Write to the staging table ---
    irradiance_sensor.to_sql('staging_irradiance_sensor', engine, if_exists='append', index=False)

    print("Data written to staging_irradiance_sensor table.")

except Exception as e:
    print("An error occurred:", e)

finally:
    engine.dispose()

Data written to staging_irradiance_sensor table.


In [16]:
def process_irradiance_files(root_dir, engine):
    """
    Crawls directories, finds irradiance files matching the pattern,
    reads them into pandas DataFrames, and uploads them to the DB, skipping existing data.

    Args:
        root_dir: The root directory to start the search from.
        engine:   SQLAlchemy engine for database connection.
    """

    # Compile the regex pattern for efficiency
    pattern = re.compile(r"PT-(\d+)_channel_(\d+)")

    # First, collect all matching filepaths
    matching_files = list()
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if any(dir.startswith("data") for dir in os.path.normpath(dirpath).split(os.sep)):
            for filename in filenames:
                filepath = os.path.join(dirpath, filename)
                if pattern.search(filename):
                    matching_files.append(filepath)

    # Now, process the collected filepaths
    with tqdm(total=len(matching_files), desc="Processing Irradiance Files") as pbar:
        for filepath in matching_files:
            try:
                # Extract filename for pattern matching (use os.path.basename)
                filename = os.path.basename(filepath)
                match = pattern.search(filename)

                # Extract sensor name and channel
                sensor_name = "PT-" + match.group(1)
                channel = int(match.group(2))

                # Read the file into a pandas DataFrame
                df = pd.read_csv(filepath, sep='\t',
                                 names=['timestamp', 'raw_reading', 'irradiance'])  # Updated columns
                df.timestamp = pd.to_datetime(df.timestamp, utc=True)  # Ensure UTC timezone

                # --- Get irradiance_sensor_id ---
                # Since sensors are created, directly fetch the id
                query = text("""
                    SELECT irradiance_sensor_id FROM irradiance_sensor WHERE name = :sensor_name AND channel = :channel
                """)
                with engine.connect() as conn:
                    result = conn.execute(query, {"sensor_name": sensor_name, "channel": channel})
                    sensor_row = result.fetchone()

                    if sensor_row:
                        irradiance_sensor_id = sensor_row[0]
                    else:
                        print(f"Irradiance sensor {sensor_name} channel {channel} not found. Skipping file.")
                        pbar.update(1)
                        continue  # Skip to the next file

                # --- Check for Existing Data (Last Timestamp Only) ---
                last_timestamp = df['timestamp'].iloc[-1]  # Get the last timestamp
                sensor_id = irradiance_sensor_id

                # Build a query to check for existing data
                query = text("""
                    SELECT 1
                    FROM irradiance_measurement
                    WHERE timestamp = :last_timestamp
                      AND irradiance_sensor_id = :sensor_id
                """)

                # Execute the query
                with engine.connect() as conn:
                    result = conn.execute(query, {
                        "last_timestamp": last_timestamp,
                        "sensor_id": sensor_id
                    })
                    exists = result.fetchone() is not None  # Check if any row was returned

                if not exists:
                    # Upload the entire DataFrame
                    df['irradiance_sensor_id'] = irradiance_sensor_id
                    df.to_sql('irradiance_measurement', engine, if_exists='append', index=False)


                del df  # Clean up DataFrame
                pbar.update(1)  # Increment progress bar

            except Exception as e:
                print(f"Error processing {filepath}: {e}")  # Print errors
                pbar.update(1)  # Increment progress bar even on error

    return

In [17]:
# --- Example Usage ---
root_directory = "."  # Replace with the actual root directory you want to start from
engine = create_engine("postgresql://postgres:password@timescaledb:5432/perocube")

try:
    process_irradiance_files(root_directory, engine)
    print("Irradiance files processed and uploaded successfully!")

except Exception as e:
    print("An error occurred during processing:", e)

finally:
    engine.dispose()

Processing Irradiance Files:   0%|          | 0/34 [00:00<?, ?it/s]

Error processing ./data_20240514/data/PT-104_channel_01.txt: (psycopg2.errors.NotNullViolation) null value in column "timestamp" of relation "irradiance_measurement" violates not-null constraint
DETAIL:  Failing row contains (null, 4428827, 92.46, 90527cac-fcb9-41b0-8c3f-375ef83093b3).

[SQL: INSERT INTO irradiance_measurement (timestamp, raw_reading, irradiance, irradiance_sensor_id) VALUES (%(timestamp__0)s, %(raw_reading__0)s, %(irradiance__0)s, %(irradiance_sensor_id__0)s), (%(timestamp__1)s, %(raw_reading__1)s, %(irradiance__1)s, %(i ... 95309 characters truncated ... 8)s), (%(timestamp__999)s, %(raw_reading__999)s, %(irradiance__999)s, %(irradiance_sensor_id__999)s)]
[parameters: {'irradiance_sensor_id__0': UUID('90527cac-fcb9-41b0-8c3f-375ef83093b3'), 'timestamp__0': datetime.datetime(2024, 4, 24, 20, 57, 59, tzinfo=datetime.timezone.utc), 'raw_reading__0': -107844, 'irradiance__0': -2.251, 'irradiance_sensor_id__1': UUID('90527cac-fcb9-41b0-8c3f-375ef83093b3'), 'timestamp__1': 