In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
def parse_sheet_year_month(sheet_name):
    """Split a ``YYMM`` sheet name into a ``(year, month)`` pair of ints.

    The first two characters are read as a two-digit year and everything
    after them as the month. Two-digit years below 50 are placed in the
    2000s; 50 and above are placed in the 1900s.
    """
    two_digit_year, mm = int(sheet_name[:2]), int(sheet_name[2:])
    century = 1900 if two_digit_year >= 50 else 2000
    return century + two_digit_year, mm

In [3]:
def clean_time_string(value):
    """Reduce a text cell to the first clock time it contains.

    Non-string values are returned unchanged. For strings, arrows,
    parentheses and degree signs are replaced by spaces, then the first
    ``H:MM``, ``HH:MM`` or ``HH:MM:SS`` token is returned. A string with
    no such token yields the empty string.
    """
    if isinstance(value, str):
        # Blank out decoration characters before looking for the time
        stripped = re.sub(r"[↑↓()°]", " ", value)

        found = re.search(r"\b\d{1,2}:\d{2}(?::\d{2})?\b", stripped)
        return found.group(0) if found else ""

    return value

In [4]:
def load_single_sheet(path, sheet_name):
    """Load one monthly sheet and return it keyed by calendar date.

    The sheet name supplies the year and month (via
    ``parse_sheet_year_month``); the first retained column supplies the
    day of month.

    Parameters
    ----------
    path : str, path-like or pandas.ExcelFile
        Workbook to read; handed straight to ``pd.read_excel``.
    sheet_name : str
        Name of the sheet, expected in ``YYMM`` form.

    Returns
    -------
    pandas.DataFrame
        One row per numeric day entry, with a leading ``Date`` column
        followed by the remaining sheet columns. Text cells are passed
        through ``clean_time_string``.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` when a day number does not form
        a valid date for the sheet's year and month.
    """
    year, month = parse_sheet_year_month(sheet_name)

    # Use second row as header
    raw = pd.read_excel(path, sheet_name=sheet_name, header=1)

    # Drop unnamed/empty columns. Labels are cast to str first because a
    # header cell holding a number or a date would otherwise break the
    # .str accessor (or leave NaN in the mask, which cannot be inverted).
    is_unnamed = raw.columns.astype(str).str.contains("Unnamed")
    # .copy() so the assignments below act on an independent frame rather
    # than a slice of `raw` (avoids SettingWithCopyWarning).
    df = raw.loc[:, ~is_unnamed].copy()

    # First column = day number; non-numeric entries become NaN and are dropped
    day_col = df.columns[0]
    df[day_col] = pd.to_numeric(df[day_col], errors="coerce")
    df = df.dropna(subset=[day_col]).copy()

    # Build proper Date column
    df["Date"] = pd.to_datetime({
        "year": year,
        "month": month,
        "day": df[day_col].astype(int),
    })

    df = df.drop(columns=[day_col])  # remove day column

    # Clean text columns. Both object and string dtypes are checked so the
    # cleaning still runs when pandas stores text under a dedicated string
    # dtype instead of object.
    for col in df.columns:
        column = df[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            df[col] = column.apply(clean_time_string)

    # Place date first
    cols = ["Date"] + [c for c in df.columns if c != "Date"]
    return df[cols]

In [5]:
def load_all_sheets(path):
    """Load every four-digit (``YYMM``) sheet of a workbook into one frame.

    Parameters
    ----------
    path : str or path-like
        Location of the Excel workbook.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching sheets, concatenated in workbook sheet order
        with a fresh integer index.

    Raises
    ------
    ValueError
        If the workbook has no sheet whose name is exactly four digits.
    """
    frames = []

    # The context manager closes the workbook handle even if a sheet fails
    # to load. The open handle is also passed on to the sheet loader so the
    # file is not re-opened from disk for every sheet.
    with pd.ExcelFile(path) as xls:
        for sheet_name in xls.sheet_names:
            if re.fullmatch(r"\d{4}", sheet_name):  # e.g. 1112, 1201, 1202
                frames.append(load_single_sheet(xls, sheet_name))

    if not frames:
        # pd.concat([]) would raise a less helpful ValueError here
        raise ValueError(
            f"No sheets named like YYMM (four digits) found in {path!r}"
        )

    return pd.concat(frames, ignore_index=True)

In [6]:
# Combine every four-digit (YYMM) sheet of the workbook into a single frame
final_df = load_all_sheets("data/Edinburgh-daytime.xlsx")


In [7]:
# Promote Date to a DatetimeIndex. The guard keeps this cell re-runnable:
# once final_df is indexed by Date there is no "Date" column left, and
# repeating the conversion would raise KeyError.
if "Date" in final_df.columns:
    final_df = final_df.assign(Date=pd.to_datetime(final_df["Date"])).set_index("Date")

# Upsample to per-minute rows, carrying each day's values forward.
# NOTE(review): the resampled range stops at the last Date value itself, so
# the final day contributes only a single row — confirm that is intended.
df_1min = final_df.resample("1min").ffill()

# Reset index back into a normal column
df_1min = df_1min.reset_index().rename(columns={"Date": "Timestamp"})

# Export to CSV
df_1min.to_csv("data/edinburgh-daytime-perminute.csv", index=False)