In [10]:
from pathlib import Path
# Sanity check before touching Drive: report the kernel's working directory,
# list what sits in it, and say whether the credentials file is reachable.
print("CWD:", Path.cwd())
print("Here:", [entry.name for entry in Path().iterdir()])
print("Exists?", Path("service_account.json").exists())


CWD: /home/jovyan/Capstone/Air_Q
Here: ['Untitled.ipynb', '.ipynb_checkpoints', 'graphs_multi', 'relative_plots', 'Zone_1_11am_pm25_vs_dist.png', 'Zone_1_2pm_pm25_vs_dist.png', 'Zone_1_7pm_pm25_vs_dist.png', 'Zone_2_11am_pm25_vs_dist.png', 'zones_by_hour_summary.csv', 'CO2_vs_time.png', 'PM25_vs_time.png', 'AQ_298', 'AQ_389', 'relative_plots_dayviews', 'day_views', 'AQ_205', 'Untitled1.ipynb']
Exists? False


In [11]:
import sys, subprocess
# Install the Drive client libraries with the same interpreter that runs this
# kernel, so the packages land in the environment the notebook imports from.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "pydrive2",
        "google-auth-oauthlib",
        "--quiet",
    ]
)


0

In [13]:
# ==== Google Drive (Service Account) — Single Source of Truth ====
# Download all PSP*_LOG_*UTC*.txt from your Capstone Drive folder (by FOLDER_ID)
# into a local cache, then run the pipeline on that cache.
from pathlib import Path
# Leftover diagnostics: working directory, its contents, and whether the
# default credentials filename resolves from here.
print("CWD:", Path.cwd())
print("Here:", [entry.name for entry in Path().iterdir()])
print("Exists?", Path("service_account.json").exists())
from pathlib import Path
from typing import List
import os, sys, subprocess

# --- CONFIG (edit/override as needed) ---
# Drive folder to mirror and the service-account key used to read it; both can
# be overridden through environment variables without editing this cell.
FOLDER_ID = os.environ.get("CAPSTONE_FOLDER_ID", "1YNujfIm14j_lMgDodoV8cvQksWZu_gdy")
SA_FILE = os.environ.get("CAPSTONE_SA_JSON", "service_account.json")
# Local mirror of the downloaded logs, and the sub-folder that receives PNG/CSV output.
LOCAL_CACHE = Path("_gdrive_cache")
OUTPUT_DIR = LOCAL_CACHE.joinpath("outputs")

# --- Dependencies ---
# Make sure PyDrive2 is importable; install it into the running interpreter
# on first use.
try:
    import pydrive2  # noqa: F401
except ModuleNotFoundError:
    import importlib

    subprocess.check_call([sys.executable, "-m", "pip", "install", "pydrive2", "--quiet"])
    # A package installed while the interpreter is running may be hidden by the
    # import system's cached directory listings; drop those caches so the
    # `from pydrive2...` imports that follow can find the new package.
    importlib.invalidate_caches()

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive

def _login_sa(sa_json_path: str) -> GoogleDrive:
    p = Path(sa_json_path)
    if not p.exists():
        raise FileNotFoundError(
            f"Service account JSON not found: {p}\n"
            "Set env var CAPSTONE_SA_JSON with the full path OR place service_account.json next to this script."
        )
    gauth = GoogleAuth()
    gauth.ServiceAccountCredentials(str(p), scope=["https://www.googleapis.com/auth/drive.readonly"])
    return GoogleDrive(gauth)

def _walk_files(drive, root_id: str):
    FOLDER = "application/vnd.google-apps.folder"
    stack = [root_id]
    while stack:
        pid = stack.pop()
        # folders
        for fld in drive.ListFile({'q': f"'{pid}' in parents and mimeType='{FOLDER}' and trashed=false"}).GetList():
            stack.append(fld['id'])
        # files
        for f in drive.ListFile({'q': f"'{pid}' in parents and mimeType!='{FOLDER}' and trashed=false"}).GetList():
            yield f

def _download_logs(drive, folder_id: str, outdir: Path) -> List[Path]:
    """Mirror matching log files from a Drive folder tree into ``outdir``.

    A file is selected when its upper-cased name ends with ``.TXT`` and
    contains ``LOG``, ``PSP`` and ``UTC``. Files already present in ``outdir``
    are not downloaded again.

    Args:
        drive: Drive client passed to ``_walk_files`` and used for
            ``CreateFile({'id': ...}).GetContentFile(path)``.
        folder_id: ID of the Drive folder to walk.
        outdir: Local directory receiving the files (created if missing).

    Returns:
        Sorted list of unique local paths, one per selected file name.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    out: List[Path] = []
    seen = set()
    for meta in _walk_files(drive, folder_id):
        name = meta.get("name") or meta.get("title")
        if not name:
            continue
        up = name.upper()
        if not (up.endswith(".TXT") and ("LOG" in up) and ("PSP" in up) and ("UTC" in up)):
            continue
        # Keep only the final path component so a title containing a path
        # separator cannot point outside (or into a missing sub-folder of) outdir.
        p = outdir / Path(name).name
        # The cache is flat: the same name found in several Drive folders maps
        # to one local file and must be returned once, otherwise callers
        # process the same log twice.
        if p in seen:
            continue
        seen.add(p)
        if not p.exists():
            # Download to a temporary name and rename on success, so an
            # interrupted transfer never leaves a truncated file that later
            # runs would treat as already cached.
            tmp = p.with_name(p.name + ".part")
            try:
                drive.CreateFile({'id': meta['id']}).GetContentFile(str(tmp))
                tmp.replace(p)
            finally:
                if tmp.exists():
                    tmp.unlink()
        out.append(p)
    return sorted(out)

# --- Login + download to cache ---
# Authenticate once, then mirror every matching log into the local cache.
_drive = _login_sa(SA_FILE)
_txt_files_api = _download_logs(_drive, FOLDER_ID, LOCAL_CACHE)
if len(_txt_files_api) == 0:
    # Nothing to analyse: stop the cell here.
    sys.exit("No TXT logs found in the Drive folder (by ID).")

# The pipeline root is the absolute path of the local cache; it must not be
# reassigned further down.
ROOT = LOCAL_CACHE.resolve()

# Make the loader return the downloaded list (do NOT redefine later)
def list_upas_txts(root: Path) -> List[Path]:
    """Return the list of log files downloaded by the Drive header.

    ``root`` is accepted for signature compatibility but is not consulted;
    the result is always the module-level ``_txt_files_api`` list.
    """
    downloaded_logs = _txt_files_api
    return downloaded_logs
# ==== End Drive header ====


# ---------------------- Your original pipeline (adapted) ----------------------
# (Key change: we DO NOT reassign ROOT here; we use the one from the header.)
from io import StringIO
import re
from typing import Optional, Tuple
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Switches controlling which artefacts the pipeline writes.
SAVE_PNG = True
SAVE_CSV = True

# Output locations, all underneath OUTPUT_DIR.
PNG_DIR = OUTPUT_DIR.joinpath("graphs_multi")
CSV_PATH = OUTPUT_DIR.joinpath("zones_by_hour_summary.csv")

# Create the output folders up front so later saves cannot fail on a missing directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
PNG_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------- Name patterns ----------------------
# Full log filename: "<PSPnnn>_LOG_<YYYY-MM-DDTHH_MM_SS>UTC<anything>.txt".
# The named groups expose the device prefix and the timestamp (with "_" in
# place of ":" in the time part).
FNAME_RE = re.compile(
    r"^(?P<prefix>PSP\d+)_LOG_(?P<stamp>\d{4}-\d{2}-\d{2}T\d{2}_\d{2}_\d{2})UTC.*\.txt$",
    re.IGNORECASE,
)
# Device number taken from a path component such as ".../AQ_298".
DEVICE_FOLDER_RE = re.compile(r".*\bAQ_(\d+)\b", re.IGNORECASE)
# "PSP" followed by exactly five digits anywhere in a name. Explicit
# look-arounds are used instead of \b because "_" is a word character: with \b
# the pattern could never match the usual "PSPnnnnn_LOG_..." filenames, where
# the digits are immediately followed by an underscore.
PSP_IN_NAME_RE = re.compile(r"(?<![A-Za-z0-9])(PSP\d{5})(?!\d)", re.IGNORECASE)

# ---------------------- Utilities ----------------------
def safe_relpath(p: Path, root: Path) -> str:
    """Describe ``p`` relative to ``root`` without ever raising.

    Strategies are tried in order: ``Path.relative_to`` on the resolved paths,
    then ``os.path.relpath`` on the resolved paths. If both fail, the bare
    file name is returned.
    """
    strategies = (
        lambda: str(p.resolve().relative_to(root.resolve())),
        lambda: os.path.relpath(p.resolve(), start=root.resolve()),
    )
    for attempt in strategies:
        try:
            return attempt()
        except Exception:
            # Fall through to the next, more permissive strategy.
            continue
    return p.name

def read_upas_sample_log(path: Path) -> Tuple[pd.DataFrame, List[str]]:
    """Parse the table that follows the ``SAMPLE LOG`` marker of a log file.

    The file is read as UTF-8 (undecodable bytes ignored). The column header
    is the first line containing ``DateTimeUTC`` within the 24 lines after the
    marker; the line right after the header is skipped and everything after
    that is parsed as CSV. Values of ``-9999`` are replaced with NaN.

    Returns:
        ``(df, lines)``: the parsed table and all lines of the file.

    Raises:
        RuntimeError: If the marker or the column header cannot be found.
    """
    all_lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
    try:
        marker_at = all_lines.index("SAMPLE LOG")
    except ValueError:
        raise RuntimeError(f"'SAMPLE LOG' not found in {path.name}")

    search_window = range(marker_at + 1, min(marker_at + 25, len(all_lines)))
    header_at = next((k for k in search_window if "DateTimeUTC" in all_lines[k]), None)
    if header_at is None:
        raise RuntimeError(f"Could not find column headers in {path.name}")

    # Row layout: header, then one line that is skipped, then the data rows.
    first_data_row = header_at + 2
    body = "\n".join(all_lines[first_data_row:])
    table = pd.read_csv(StringIO(f"{all_lines[header_at]}\n{body}"))
    table = table.replace([-9999.0, -9999], np.nan)
    return table, all_lines

def device_from_path_or_file(path: Path, lines: List[str]) -> str:
    """Work out a device identifier for a log file.

    Sources are tried in this order:
      1. the number captured by ``DEVICE_FOLDER_RE`` in the parent directory,
      2. the upper-cased match of ``PSP_IN_NAME_RE`` in the file name,
      3. the first ``UPASserial`` line among the first 200 lines (the digits
         of its second field, or that field stripped if it has no digits),
      4. the part of the file stem before the first underscore.
    """
    folder_hit = DEVICE_FOLDER_RE.search(str(path.parent))
    if folder_hit is not None:
        return folder_hit.group(1)

    name_hit = PSP_IN_NAME_RE.search(path.name)
    if name_hit is not None:
        return name_hit.group(1).upper()

    # Only the first "UPASserial" line is considered, even if it is unusable.
    serial_line = next((ln for ln in lines[:200] if ln.startswith("UPASserial")), None)
    if serial_line is not None:
        fields = serial_line.split(",")
        if len(fields) > 1:
            only_digits = "".join(filter(str.isdigit, fields[1]))
            return only_digits or fields[1].strip()

    return path.stem.split("_")[0]

def parse_absolute_time(df: pd.DataFrame) -> pd.Series:
    """Build a per-row time series from the best available column.

    Preference order:
      1. a column whose name contains ``DateTimeUTC`` (strict
         ``%Y-%m-%dT%H:%M:%S`` format, UTC-aware),
      2. the ``UnixTime`` column read as seconds (UTC-aware),
      3. a column whose name contains ``DateTimeLocal`` (returned even if
         nothing parses),
      4. an integer row counter when none of the columns exists.

    Options 1 and 2 are used only if at least one value parses.
    """
    def _column_containing(token):
        return next((col for col in df.columns if token in col), None)

    utc_name = _column_containing("DateTimeUTC")
    if utc_name:
        parsed = pd.to_datetime(df[utc_name], format="%Y-%m-%dT%H:%M:%S", errors="coerce", utc=True)
        if parsed.notna().any():
            return parsed

    if "UnixTime" in df.columns:
        seconds = pd.to_numeric(df["UnixTime"], errors="coerce")
        parsed = pd.to_datetime(seconds, unit="s", utc=True)
        if parsed.notna().any():
            return parsed

    local_name = _column_containing("DateTimeLocal")
    if local_name:
        return pd.to_datetime(df[local_name], errors="coerce")

    return pd.Series(pd.RangeIndex(len(df)), index=df.index, dtype="int64")

def start_local_time(lines: List[str], df: pd.DataFrame, path: Path) -> Optional[pd.Timestamp]:
    """Find the local start time of a sampling run.

    Sources, in order:
      1. the first ``StartDateTimeLocal,`` line within the 80 lines starting
         at the ``SAMPLE SUMMARY`` marker,
      2. the first parseable value of a column whose name contains
         ``DateTimeLocal``,
      3. the timestamp embedded in the file name (per ``FNAME_RE``).

    Returns ``None`` when no source yields a valid timestamp.
    """
    try:
        anchor = lines.index("SAMPLE SUMMARY")
        window = lines[anchor:anchor + 80]
        # Only the first matching line counts, whether or not it parses.
        entry = next((ln for ln in window if ln.startswith("StartDateTimeLocal,")), None)
        if entry is not None:
            stamp = pd.to_datetime(entry.split(",")[1].strip(), errors="coerce")
            if pd.notna(stamp):
                return stamp
    except ValueError:
        pass

    local_name = next((col for col in df.columns if "DateTimeLocal" in col), None)
    if local_name:
        local_times = pd.to_datetime(df[local_name], errors="coerce")
        if local_times.notna().any():
            return local_times.dropna().iloc[0]

    name_match = FNAME_RE.match(path.name)
    if name_match:
        from_name = pd.to_datetime(name_match.group("stamp").replace("_", ":"), errors="coerce")
        if pd.notna(from_name):
            return from_name
    return None

def hour_bucket_label(dt: Optional[pd.Timestamp]) -> str:
    """Format a timestamp as its nearest clock hour, e.g. ``"2 PM"``.

    The timestamp is rounded to the hour and rendered on a 12-hour clock
    without a leading zero. ``None`` or a missing value gives ``"?"``.
    """
    is_missing = dt is None or pd.isna(dt)
    if is_missing:
        return "?"
    nearest_hour = dt.round("h")
    padded = nearest_hour.strftime("%I %p")
    return padded.lstrip("0").upper()

def date_label(dt: Optional[pd.Timestamp]) -> str:
    """Format a timestamp as ``YYYY-MM-DD``.

    ``None`` or a missing value gives ``"unknown-date"``.
    """
    has_value = dt is not None and not pd.isna(dt)
    return dt.strftime("%Y-%m-%d") if has_value else "unknown-date"

# ---------------------- Load & prepare ----------------------
txt_files = list_upas_txts(ROOT)
print(f"Found {len(txt_files)} TXT files under {ROOT.resolve()}.")

# `rows` collects one long-format frame per usable file; `summary_rows` gets
# one record per file that has a PM2_5MC column and at least one timestamp.
rows = []
summary_rows = []

for path in txt_files:
    # Display name of the file, reused in messages and in both outputs.
    rel_name = safe_relpath(path, ROOT)

    try:
        df, lines = read_upas_sample_log(path)
    except Exception as e:
        print(f"Skipping {rel_name}: {e}")
        continue

    if "PM2_5MC" not in df.columns:
        print(f"Skipping {rel_name}: PM2_5MC missing.")
        continue

    ts_abs = parse_absolute_time(df)
    pm = pd.to_numeric(df["PM2_5MC"], errors="coerce")

    if not ts_abs.notna().any():
        print(f"Skipping {rel_name}: no valid timestamps.")
        continue
    t0 = ts_abs.dropna().iloc[0]

    # Minutes elapsed since the first valid timestamp of this file. When the
    # difference has no datetime accessor, the relative time is left as NaN.
    t_rel = (ts_abs - t0)
    if hasattr(t_rel, "dt"):
        t_rel_min = t_rel.dt.total_seconds() / 60.0
    else:
        t_rel_min = pd.Series(np.nan, index=df.index)

    dev = device_from_path_or_file(path, lines)
    start_loc = start_local_time(lines, df, path)
    dlabel = date_label(start_loc)
    hlabel = hour_bucket_label(start_loc)

    # Per-file attributes shared by the summary record and the data frame.
    file_info = {
        "file": rel_name,
        "device": dev,
        "date": dlabel,
        "hour_bucket": hlabel,
    }
    summary_rows.append(dict(file_info))

    # Keep only samples that have both a relative time and a PM value.
    ok = pd.Series(t_rel_min).notna() & pm.notna()
    if not ok.any():
        continue
    rows.append(pd.DataFrame({
        **file_info,
        "t_rel_min": t_rel_min[ok].to_numpy(),
        "PM2_5MC": pm[ok].to_numpy(),
    }))

if not rows:
    raise SystemExit("No valid PM2_5MC + time series found.")

data = pd.concat(rows, ignore_index=True)
summary = (
    pd.DataFrame(summary_rows)
    .drop_duplicates()
    .sort_values(["date", "hour_bucket", "device"])
)

# ---------------------- Plot by (date, hour) ----------------------
# One figure per (date, hour bucket); each file in the group is drawn as its
# own line against minutes since that file's first sample.
if SAVE_PNG:
    for (date_k, hour_k), g in data.groupby(["date", "hour_bucket"], sort=True):
        fig, ax = plt.subplots(figsize=(10, 6))
        for fname, gg in g.groupby("file"):
            dev = gg["device"].iloc[0]
            x = gg["t_rel_min"].to_numpy()
            y = gg["PM2_5MC"].to_numpy()
            if len(x) == 0:
                continue
            # Draw the samples in chronological order.
            order = np.argsort(x)
            x, y = x[order], y[order]
            label = f"{dev} · start ~{hour_k}"
            ax.plot(x, y, label=label, linewidth=1.6)
            # Star marks the first sample of the series.
            ax.plot(x[0], y[0], marker="*", markersize=9)
        ax.set_xlabel("Relative time from start (min)")
        ax.set_ylabel("PM2.5 (µg/m³)")
        ax.set_title(f"PM2.5 vs time (aligned) · {date_k} · ~{hour_k}")
        ax.legend(loc="best", fontsize=9)
        fig.tight_layout()
        # The hour label is "?" when the start time is unknown; "?" is not a
        # valid filename character on every platform, so spell it out.
        hour_tag = hour_k.replace(' ', '').replace('?', 'unknown')
        outfile = PNG_DIR / f"{date_k}_{hour_tag}_pm25_vs_time.png"
        fig.savefig(outfile, dpi=150)
        plt.show()
        # Release the figure explicitly so figures do not pile up in memory
        # across iterations.
        plt.close(fig)

# ---------------------- Save summary ----------------------
# Persist the per-file summary when requested, then echo it in full.
if SAVE_CSV:
    summary.to_csv(CSV_PATH, index=False)
    print(f"\nSummary saved to: {CSV_PATH}")

print("\nSummary:", summary.to_string(index=False), sep="\n")


CWD: /home/jovyan/Capstone/Air_Q
Here: ['Untitled.ipynb', '.ipynb_checkpoints', 'graphs_multi', 'relative_plots', 'Zone_1_11am_pm25_vs_dist.png', 'Zone_1_2pm_pm25_vs_dist.png', 'Zone_1_7pm_pm25_vs_dist.png', 'Zone_2_11am_pm25_vs_dist.png', 'zones_by_hour_summary.csv', 'CO2_vs_time.png', 'PM25_vs_time.png', 'AQ_298', 'AQ_389', 'relative_plots_dayviews', 'day_views', 'AQ_205', 'Untitled1.ipynb']
Exists? False


FileNotFoundError: Service account JSON not found: service_account.json
Set env var CAPSTONE_SA_JSON with the full path OR place service_account.json next to this script.