In [14]:
import os
import pandas as pd

# ==============================
# CONFIG
# ==============================

data_root = '/Volumes/thesisDrive/Permafrost Data TP/daily/Permafrost data/Daily'
years_required = set(range(1979, 2019))  # 1979–2018 inclusive

files_to_check = [
    #"frost.txt",
    "G_deform.txt",
    "ice.txt",
    "liquid.txt",
    "Mean_Soil_Data.txt",
    "moist.txt",
    "temp.txt",
    "water.txt",
    "wflow.txt"
]

# ==============================
# FUNCTION TO EXTRACT YEARS
# ==============================

def extract_years(file_path, filename):
    """
    Extract years from different file formats.

    Handles:
    - Files with 'YR' header somewhere in file
    - G_deform.txt (no header; year is column index 2)
    """

    try:
        # ---- Special case: G_deform (no header row) ----
        if filename == "G_deform.txt":
            df = pd.read_csv(
                file_path,
                sep=r"\s+|\t+",
                engine="python",
                header=None
            )

            if df.shape[1] < 3:
                return None, "Unexpected column structure"

            years = set(
                pd.to_numeric(df.iloc[:, 2], errors="coerce")
                .dropna()
                .astype(int)
            )

            return years, None

        # ---- All other files ----
        with open(file_path, "r", encoding="latin-1") as f:
            lines = f.readlines()

        header_index = None

        # Find first line containing 'YR'
        for i, line in enumerate(lines):
            if "YR" in line:
                header_index = i
                break

        if header_index is None:
            return None, "No YR column found"

        df = pd.read_csv(
            file_path,
            sep=r"\s+|\t+",
            engine="python",
            skiprows=header_index
        )

        # Clean column names (remove quotes, spaces)
        df.columns = df.columns.str.replace('"', '').str.strip()

        if "YR" not in df.columns:
            return None, "YR column not parsed correctly"

        years = set(
            pd.to_numeric(df["YR"], errors="coerce")
            .dropna()
            .astype(int)
        )

        return years, None
    except pd.errors.EmptyDataError:
        return None, "File is empty or corrupted"
    except Exception as e:
        return None, f"Read error: {e}"


# ==============================
# MAIN LOOP
# ==============================

print("\nStarting folder check...\n")
start_from = "100.263346096_38.321391739"  # <-- folder to start from

all_folders = sorted([
    f for f in os.listdir(data_root)
    if os.path.isdir(os.path.join(data_root, f))
])

if start_from in all_folders:
    start_index = all_folders.index(start_from)
    folders_to_process = all_folders[start_index:]
else:
    raise ValueError(f"Start folder '{start_from}' not found.")

i = 0
for folder_name in folders_to_process:
    folder_path = os.path.join(data_root, folder_name)
    if i == 100:
        print(f"Checking folder: {folder_name}")

    if not os.path.isdir(folder_path):
        continue

    missing_files = []
    missing_years_by_file = {}

    for fname in files_to_check:
        file_path = os.path.join(folder_path, fname)

        # 1️⃣ Check missing file
        if not os.path.exists(file_path):
            missing_files.append(fname)
            continue

        # 2️⃣ Extract years
        years_in_file, error = extract_years(file_path, fname)

        if error:
            missing_years_by_file[fname] = error
            continue

        # 3️⃣ Check required years
        if not years_required.issubset(years_in_file):
            missing_years = sorted(years_required - years_in_file)
            missing_years_by_file[fname] = missing_years

    # Only print folders that have issues
    if missing_files or missing_years_by_file:
        print(f"Folder: {folder_name}")

        if missing_files:
            print("  Missing files:")
            for f in missing_files:
                print(f"    - {f}")

        if missing_years_by_file:
            print("  Missing years / errors:")
            for f, issue in missing_years_by_file.items():
                print(f"    - {f}: {issue}")

        print("\n" + "-" * 60 + "\n")
    i += 1

print("Check complete!")



Starting folder check...

Folder: 100.263346096_38.321391739
  Missing files:
    - temp.txt
  Missing years / errors:
    - liquid.txt: Read error: 'utf-8' codec can't decode byte 0xc0 in position 8: invalid start byte
    - Mean_Soil_Data.txt: No YR column found
    - moist.txt: No YR column found
    - water.txt: No YR column found
    - wflow.txt: No YR column found

------------------------------------------------------------

Folder: 100.263346096_38.421391739
  Missing files:
    - G_deform.txt
    - ice.txt
    - liquid.txt
    - Mean_Soil_Data.txt
    - moist.txt
    - temp.txt
    - water.txt
    - wflow.txt

------------------------------------------------------------

Folder: 100.263346096_38.521391739
  Missing years / errors:
    - G_deform.txt: Read error: 'utf-8' codec can't decode byte 0x9d in position 0: invalid start byte
    - ice.txt: No YR column found
    - liquid.txt: [1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 199