In [1]:
from sqlalchemy import create_engine, text
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
!pip install psycopg2-binary
import psycopg2

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


# Upload all MPP data

In [3]:
import os
import re
import pandas as pd
from tqdm.notebook import tqdm

def process_mpp_files(root_dir, engine):
    """
    Find MPP tracking files below ``root_dir`` and append them to the database.

    ``root_dir`` is walked with ``os.walk``. A directory is considered when at
    least one component of its normalised path starts with ``data`` (this
    includes components that come from ``root_dir`` itself). Inside such a
    directory every file whose name contains ``output_board<N>_channel<M>`` is
    read as a header-less, tab-separated table with the columns ``timestamp``,
    ``power``, ``current`` and ``voltage``; the board and channel numbers from
    the file name are added as ``tracking_channel_board`` and
    ``tracking_channel_channel``.

    A file is appended to the ``mpp_measurement`` table only if that table has
    no row for the file's *last* timestamp on the same board and channel. Only
    that single row is checked, not every row of the file. Files without any
    rows are skipped. An error on one file is printed and the run continues
    with the next file.

    Args:
        root_dir: The root directory to start the search from.
        engine:   SQLAlchemy engine for database connection.

    Returns:
        None. The function is called for its side effect on the database.
    """

    # Compile the regex pattern once; it is used for both discovery and parsing.
    pattern = re.compile(r"output_board(\d+)_channel(\d+)")

    # The existence check is the same for every file, so build it only once.
    exists_query = text("""
        SELECT 1
        FROM mpp_measurement
        WHERE timestamp = :last_timestamp
          AND tracking_channel_board = :board_id
          AND tracking_channel_channel = :channel_id
    """)

    # First, collect all matching filepaths so the progress bar knows the total.
    matching_files = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        path_parts = os.path.normpath(dirpath).split(os.sep)
        if not any(part.startswith("data") for part in path_parts):
            continue
        matching_files.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if pattern.search(filename)
        )

    # Now, process the collected filepaths.
    with tqdm(total=len(matching_files), desc="Processing MPP Files") as pbar:
        for filepath in matching_files:
            try:
                # Board and channel numbers are encoded in the file name.
                match = pattern.search(os.path.basename(filepath))
                board = int(match.group(1))
                channel = int(match.group(2))

                df = pd.read_csv(filepath, sep='\t',
                                 names=['timestamp', 'power', 'current', 'voltage'])
                if df.empty:
                    # Without rows there is no last timestamp to check and
                    # nothing to upload.
                    print(f"Skipping {filepath}: file contains no rows")
                    continue

                df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)  # Ensure UTC timezone
                df['tracking_channel_board'] = board
                df['tracking_channel_channel'] = channel

                # Only the last timestamp is looked up: if it is already in
                # the table the whole file is treated as uploaded.
                last_timestamp = df['timestamp'].iloc[-1]
                with engine.connect() as conn:
                    result = conn.execute(exists_query, {
                        "last_timestamp": last_timestamp,
                        "board_id": board,
                        "channel_id": channel
                    })
                    exists = result.fetchone() is not None

                if not exists:
                    # Upload the entire DataFrame
                    df.to_sql('mpp_measurement', engine, if_exists='append', index=False)

            except Exception as e:
                # Best effort: report the failing file and carry on.
                print(f"Error processing {filepath}: {e}")
            finally:
                # Advance exactly once per file, whether it was uploaded,
                # skipped or failed.
                pbar.update(1)

    return

In [4]:
# Take the connection URL from the PEROCUBE_DB_URL environment variable when it
# is set, so real credentials do not have to be stored in the notebook. The
# previous hard-coded URL stays as the default to keep existing setups working.
engine = create_engine(
    os.environ.get(
        "PEROCUBE_DB_URL",
        "postgresql://postgres:password@timescaledb:5432/perocube",
    )
)

In [None]:
# Start the search in the current working directory; sub-directories are
# walked by process_mpp_files.
root_directory = "."
# NOTE(review): process_mpp_files ends with a bare `return`, so
# `all_dataframes` is always None; the call is made for its database side effect.
all_dataframes = process_mpp_files(root_directory, engine)

Processing MPP Files:   0%|          | 0/1897 [00:00<?, ?it/s]