In [1]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

In [2]:
def parse_sheet_year_month(sheet_name):
    """
    Convert a YYMM sheet name into (year, month).
    Example:
        '1112' -> (2011, 12)
        '1201' -> (2012, 1)
    """
    yy = int(sheet_name[:2])
    mm = int(sheet_name[2:])

    # Years 00â€“49 â†’ 2000s, 50â€“99 â†’ 1900s â€“ adjust if needed
    year = 2000 + yy if yy < 50 else 1900 + yy

    return year, mm

In [7]:
def clean_symbols(df):
    """Remove arrows, parentheses, degrees, and numeric angle values."""
    df = df.replace(r"[â†‘â†“()Â°]", "", regex=True)
    df = df.replace(r"\s*\d+\s*", "", regex=True)  # remove trailing angle numbers
    df = df.map(lambda x: x.strip() if isinstance(x, str) else x)
    return df

In [8]:
def process_single_sheet(df, year, month):
    # First row is junk header; second row is real header
    df.columns = df.iloc[0]          # set second row as headers
    df = df[1:]                      # remove that header row

    # Clean weird symbols
    df = clean_symbols(df)

    # Ensure Day column is numeric
    day_col = df.columns[0]
    df[day_col] = pd.to_numeric(df[day_col], errors="coerce")

    # ðŸ”¥ NEW: Drop rows where day is NaN
    df = df.dropna(subset=[day_col])

    # Build proper Date column
    df["Date"] = pd.to_datetime({
        "year": year,
        "month": month,
        "day": df[day_col].astype(int)
    })

    # Drop old first column
    df = df.drop(columns=[day_col])

    # Reorder Date column first
    cols = ["Date"] + [c for c in df.columns if c != "Date"]
    df = df[cols]

    return df

In [9]:
def load_all_sheets(path):
    xls = pd.ExcelFile(path)
    all_frames = []

    for sheet_name in xls.sheet_names:
        # Skip sheets that aren't year-month format
        if not re.fullmatch(r"\d{4}", sheet_name):
            continue

        year, month = parse_sheet_year_month(sheet_name)

        raw_df = pd.read_excel(path, sheet_name=sheet_name, header=None)

        cleaned = process_single_sheet(raw_df, year, month)
        all_frames.append(cleaned)

    # Combine everything into one DataFrame
    final_df = pd.concat(all_frames, ignore_index=True)
    return final_df

In [10]:
final_df = load_all_sheets("data/Edinburgh-daytime.xlsx")

# Export cleaned dataset
output_path = "data/edinburgh-daytime-cleaned.csv"
final_df.to_csv(output_path, index=False)

print(f"Export complete: {output_path}")
print(final_df.head(10))

Export complete: data/edinburgh-daytime-cleaned.csv
0       Date Sunrise/Sunset NaN NaN       NaN       NaN       NaN      NaN  \
0 2011-12-01              :   :  âˆ’:  18:02:00  17:17:00  16:29:00   147.52   
1 2011-12-02              :   :  âˆ’:  18:01:00  17:16:00  16:28:00  147.495   
2 2011-12-03              :   :  âˆ’:  18:01:00  17:16:00  16:28:00  147.471   
3 2011-12-04              :   :  âˆ’:  18:00:00  17:15:00  16:27:00  147.447   
4 2011-12-05              :   :  âˆ’:  18:00:00  17:15:00  16:27:00  147.425   
5 2011-12-06              :   :  âˆ’:  18:00:00  17:14:00  16:26:00  147.403   
6 2011-12-07              :   :  âˆ’:  17:59:00  17:14:00  16:26:00  147.381   
7 2011-12-08              :   :  âˆ’:  17:59:00  17:14:00  16:25:00  147.361   
8 2011-12-09              :   :  âˆ’:  17:59:00  17:14:00  16:25:00  147.342   
9 2011-12-10              :   :  âˆ’:  17:59:00  17:14:00  16:25:00  147.323   

0 Daylength NaN  ...       NaN       NaN      NaN Solar Noon NaN NaN 