In [158]:
from datetime import date
from typing import NamedTuple, List

class Action(NamedTuple):
    date: date
    action: str

class Run(NamedTuple):
    date: date
    run: int
    precursor: str
    co_reactant: str
    co_absorbate: str | None
    cycles: int
    process: str
    sequence: str
    T: int
    P: int

ACTIONS: List[Action] = []
RUNS: List[Run] = []

In [159]:
from datetime import datetime
import io
from parse import parse, Result, with_pattern
import string
from typing import cast

def parse_experiment_contents(buffer: io.TextIOBase, global_date: date):
    global_T: int
    global_P: int

    buffer.seek(0)
    
    # Register Globals
    _result = find_read_line(buffer, "T{:ws}={:ws}{T:g}{:ws}{T_unit:l}", True)
    if not "T" in _result:
        raise ValueError("Could not find global temperature!")
    global_T = _result["T"]
    
    _result = find_read_line_aliases(buffer, [
        "P{:ws}={:ws}{P:g}{:ws}{P_unit:l}",
        "Pressure{:ws}={:ws}{P:g}{:ws}{P_unit:l}"
    ], True)
    if not "P" in _result:
        raise ValueError("Could not find global pressure!")
    global_P = _result["P"]

    # Actions
    while True:
        _checkpoint_pre_action = buffer.tell()
        if not find_read_separator(buffer, "_", 28):
            raise ValueError("Could not find action section")

        if occurs_before_separator(buffer, "loaded", "_", 28) != -1:
            action_label = str(read_nonws_line(buffer))  # Cannot be None / EOF if condition is true
            if "loaded" in action_label:
                _label_segments = action_label.partition("loaded")

                action_label = _label_segments[0]
                _action_date_str = _label_segments[2].strip()
                
                _result = parse_aliases_to_dict([
                    "{month:d}/{day:d}/{year:d}",
                    "{month:d}/{day:d}",
                    "{year:4d}{month:2d}{day:2d}",
                ], _action_date_str, True)
            else: 
                _result = find_read_line_aliases(buffer, [
                    "loaded{:ws}{month:d}/{day:d}/{year:d}",
                    "loaded{:ws}{month:d}/{day:d}",
                    "loaded{:ws}{year:4d}{month:2d}{day:2d}",
                ], True)
            
            if len(_result) == 0:
                raise ValueError("Failed to parse load date!")
            
            if "year" not in _result:
                _result["year"] = global_date.year

            action_date: date = datetime(
                year=_result["year"],
                month=_result["month"],
                day=_result["day"],
            )
            # Register Action
            this_action = Action(
                date=action_date,
                action=f"{action_label} loaded"
            )
            ACTIONS.append(this_action)
        else:
            # Not an action, likely a run instead
            print(f"{buffer.tell()}: not an action, likely a run instead")
            buffer.seek(_checkpoint_pre_action)
            break

    # Runs
    while True:
        if find_read_separator(buffer, "_", 28) == -1:
            break 

        _result = find_read_line(buffer, "Run{:ws}{number:d}", True)
        if not "number" in _result:
            break
        
        run_number = _result["number"]

        _result = find_read_line_aliases(buffer, [
            "{cycles:d}{:ws}cycles{:ws}{precursor}|{co_reactant}",
            "{cycles:d}{:ws}cycles{:ws}{precursor}{:ws}and{:ws}{co_reactant}{:s}{co_absorbate}",
        ], True)
        run_cycles = _result["cycles"]
        run_precursor = _result["precursor"]
        run_co_reactant = _result["co_reactant"]
        run_co_absorbate = None
        
        if "co_absorbate" in _result:
            run_co_absorbate = _result["co_absorbate"]

        run_process = read_nonws_line(buffer)
        if not run_process:
            raise ValueError(f"Missing run process")
        _run_proc_partitions = run_process.count("|") 

        run_sequence = read_nonws_line(buffer)
        if not run_sequence:
            raise ValueError(f"Missing run sequence")
        _run_seq_partitions = run_sequence.count("|")
        
        if (_run_proc_partitions != _run_seq_partitions):
            raise ValueError(f"Mismatch in partitions between process and run sequence in run #{run_number}")
        
        # Register Run
        this_run = Run(
            date=global_date,
            run=run_number,
            precursor=run_precursor,
            co_reactant=run_co_reactant,
            co_absorbate=run_co_absorbate,
            cycles=run_cycles,
            process=run_process,
            sequence=run_sequence,
            T=global_T,
            P=global_P
        )
        RUNS.append(this_run)
       
def occurs_before_separator(buffer: io.TextIOBase, key: str, tile: str, threshold: int) -> int:
    checkpoint = buffer.tell()

    while True:
        line = read_nonws_line(buffer)
        if not line:
            return -1
        
        if not key in line:
            continue
            
        result = buffer.tell()
        buffer.seek(checkpoint)
        return result

def find_read_separator(buffer: io.TextIOBase, tile: str, threshold: int) -> int:
    threshold_str = tile * (threshold-1)

    result = find_read_line(buffer, threshold_str + "{remainder:w}", True)
    if "__file_position__" not in result:
        return -1
    
    return result["__file_position__"]

def find_read_line(buffer: io.TextIOBase, pattern: str, accept_ws_pad: bool) -> dict:
    return find_read_line_aliases(buffer, [pattern], accept_ws_pad)

def find_read_line_aliases(buffer: io.TextIOBase, patterns: List[str], accept_ws_pad: bool) -> dict:
    checkpoint = buffer.tell()

    while True:
        line = read_nonws_line(buffer)
        if not line:
            print(f"Could not find line: {patterns}")
            buffer.seek(checkpoint)
            return {}

        result_dict = parse_aliases_to_dict(patterns, line.strip(), accept_ws_pad)
        if len(result_dict) == 0:
            continue
        
        result_dict["__file_position__"] = buffer.tell()
        return result_dict

def parse_aliases_to_dict(patterns: List[str], line: str, accept_ws_pad: bool) -> dict:
    for pattern in patterns:
        result = parse_to_dict(pattern, line, accept_ws_pad)
        if len(result) != 0:
            return result
    return {}

def parse_to_dict(pattern: str, line: str, accept_ws_pad: bool) -> dict:
    @with_pattern(r"\s*")
    def whitespace(text):
        return text


    if (accept_ws_pad):
        pattern = "{:ws}" + pattern

    result = parse(pattern, line, extra_types={"ws": whitespace})
    if not result:
        return {}
    
    result = cast(Result, result) 

    formatter = string.Formatter()
    field_names = [fname for _, fname, _, _ in formatter.parse(pattern) if fname]

    # Check that all named fields are non-empty
    if not all(result.named.get(name) not in [None, ""] for name in field_names):
        print(f"A parameter was missing that fit {pattern}")
        return {}

    result_dict = result.named
    result_dict["__match_pattern__"] = pattern
    return result_dict


def read_nonws_line(buffer: io.TextIOBase) -> str | None: 
    checkpoint = buffer.tell()
    
    while True:
        line = buffer.readline()
        if not line:
            buffer.seek(checkpoint)
            return None

        if not line.strip():
            continue

        return line


In [160]:
from datetime import datetime
import os

def parse_experiment_file(path):
    f = open(path, 'r')
    fn = os.path.basename(path)
    date_str = fn[0:8]
    
    experiment_date: date = datetime.strptime(date_str, "%Y%m%d")
    parse_experiment_contents(f, experiment_date)

In [161]:
import pandas as pd

def update_experiment_df(df: pd.DataFrame):
    for action in ACTIONS:
        action_row = pd.DataFrame([{
            'Date': int(action.date.strftime("%Y%m%d")),
            'Run/Action': action.action,
            'Precursor': None,
            'Co-reactant': None,
            'Co-absorbate': None,
            '# Cycles': None,
            'Process': None,
            'Sequence': None,
            'Furnace T (°C)': None,
            'P (Torr)': None,
            'Crystal substrate': None,
        }])

        df = pd.concat([df, action_row], ignore_index=True)

    for run in RUNS:
        _run_co_absorbate = run.co_absorbate
        if not _run_co_absorbate:
            _run_co_absorbate = ""

        run_row = pd.DataFrame([{
            'Date': int(run.date.strftime("%Y%m%d")),
            'Run/Action': run.run,
            'Precursor': run.precursor,
            'Co-reactant': run.co_reactant,
            'Co-absorbate': None,
            '# Cycles': None,
            'Process': f"{run.cycles} cycles {run.process} {_run_co_absorbate}",
            'Sequence': run.sequence,
            'Furnace T (°C)': run.T,
            'P (Torr)': run.P,
            'Crystal substrate': None,
        }])

        df = pd.concat([df, run_row], ignore_index=True)

    df = df.sort_values(by=['Date', 'Run/Action'], ascending=[True, True])
    return df

In [162]:

# parse_experiment_file('data/20240123.txt')
# parse_experiment_file('data/20240124.txt')
# parse_experiment_file('data/20240125.txt')
# parse_experiment_file('data/20220928_notes.txt')

import glob

for filename in glob.glob(os.path.join('data', '*.txt')):
   print(filename)
   parse_experiment_file(filename)

df = pd.DataFrame()
df = update_experiment_df(df)
df.to_excel('output.xlsx', index=False)

data\20220926_notes.txt
1513: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20220927_notes.txt
1169: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20220928_notes.txt
1167: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20220929_notes.txt
1638: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20220930_notes.txt
1176: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20240123.txt
947: not an action, likely a run instead
Could not find line: ['___________________________{remainder:w}']
data\20240124.txt
643: not an action, likely a run instead
Could not find line: ['Run{:ws}{number:d}']
data\20240125.txt
965: not an action, likely a run instead
Could not find line: ['___________________________{remainder:w}']


  df = pd.concat([df, run_row], ignore_index=True)
