In [30]:
import os
import pickle
from datetime import datetime, timedelta
from collections import Counter
import pandas as pd

def load_cube_from_picklefile(filepath):
    """Read a pickled object back from disk.

    Parameters
    ----------
    filepath : str or path-like
        Location of the pickle file; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the pickle (the callers in this
        notebook treat it as a cube exposing ``coord('time')``).

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while unpickling, so only
    point this at files you trust.
    """
    with open(filepath, 'rb') as pickle_stream:
        unpickled_cube = pickle.load(pickle_stream)
    return unpickled_cube

def calculate_total_timesteps_per_month(year, month, interval_minutes=30):
    """Count the timesteps of a fixed interval that start within a calendar month.

    A timestep is counted when its start time ``t`` satisfies
    ``month_start <= t < next_month_start``, with the first timestep placed
    exactly at midnight on the first day of the month.

    Parameters
    ----------
    year : int
        Calendar year.
    month : int
        Calendar month, 1-12 (``datetime`` raises ``ValueError`` otherwise).
    interval_minutes : int or float, optional
        Spacing between consecutive timesteps in minutes. Defaults to 30,
        which gives 48 timesteps per day.

    Returns
    -------
    int
        Number of timesteps in the month (e.g. 1488 for a 31-day month at
        30-minute spacing).

    Raises
    ------
    ValueError
        If ``interval_minutes`` is not strictly positive, or if
        ``year``/``month`` do not form a valid date.
    """
    if interval_minutes <= 0:
        raise ValueError("interval_minutes must be a positive number of minutes")

    start_date = datetime(year, month, 1)
    # December rolls over into January of the following year.
    if month == 12:
        end_date = datetime(year + 1, 1, 1)
    else:
        end_date = datetime(year, month + 1, 1)

    step = timedelta(minutes=interval_minutes)
    # Closed-form replacement for stepping through the month one interval at
    # a time: ceil((end - start) / step). Floor-dividing the *negative* span
    # and negating the result yields the ceiling using exact integer
    # arithmetic, so intervals that do not divide the month evenly still
    # count the final partial step, exactly as a stepping loop would.
    return -((start_date - end_date) // step)

# Default root of the cached yearly cubes; override via ``base_dir`` when the
# data lives somewhere else.
_DEFAULT_NIMROD_CACHE_DIR = "/nfs/a319/gy17m2a/PhD/datadir/cache/nimrod_30mins_2.2km"

# Column layout shared by every per-year table and by the empty result.
_MISSING_TABLE_COLUMNS = ['Missing Count', 'Total Timesteps', 'Percentage Missing']


def _expected_times_for_year(year):
    """Return every expected 30-minute timestamp of ``year`` as ``datetime`` objects.

    The series runs from 00:12:30 on 1 January to 23:42:30 on 31 December,
    both inclusive.
    """
    first = datetime(year, 1, 1, 0, 12, 30)
    last = datetime(year, 12, 31, 23, 42, 30)
    step = timedelta(minutes=30)
    n_steps = (last - first) // step + 1
    # Multiply rather than accumulate so each timestamp is derived directly
    # from the first one.
    return [first + k * step for k in range(n_steps)]


def _missing_table_for_year(year, cube_times):
    """Build the per-month missing-timestep table for one year.

    Returns ``None`` when none of the expected timestamps is absent from
    ``cube_times``.
    """
    missing_times = set(_expected_times_for_year(year)) - set(cube_times)
    missing_count_per_month = Counter(t.strftime('%Y-%m') for t in missing_times)
    if not missing_count_per_month:
        return None

    # Denominator for the percentage: how many timesteps each month holds.
    total_timesteps = {
        f"{year}-{month:02d}": calculate_total_timesteps_per_month(year, month)
        for month in range(1, 13)
    }

    table = pd.DataFrame.from_dict(missing_count_per_month, orient='index', columns=['Missing Count'])
    table.index.name = 'Month'
    table = table.sort_index()
    table['Total Timesteps'] = table.index.map(total_timesteps)
    table['Percentage Missing'] = (table['Missing Count'] / table['Total Timesteps']) * 100
    return table


def make_missing_table_all_years(start_year, end_year, filtering_name, base_dir=_DEFAULT_NIMROD_CACHE_DIR):
    """Tabulate, per month, how many expected 30-minute timesteps are missing.

    For each year in ``start_year..end_year`` (inclusive) the pickled cube at
    ``{base_dir}/{filtering_name}/WholeYear/cube_{year}.pkl`` is loaded, its
    ``time`` coordinate is converted to dates, and those dates are compared
    against the full list of expected timestamps for that year.

    Parameters
    ----------
    start_year, end_year : int
        First and last year to process (both inclusive).
    filtering_name : str
        Name of the filtering sub-directory below ``base_dir``.
    base_dir : str, optional
        Root directory of the cached cubes. Defaults to the original NFS
        location used by this notebook.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Month'`` (``'YYYY-MM'`` strings) with columns
        ``'Missing Count'``, ``'Total Timesteps'`` and
        ``'Percentage Missing'``. Only months with at least one missing
        timestep appear; years whose file is absent are reported on stdout
        and skipped. When nothing is missing (or no file is found) an empty
        frame with the same columns is returned.

    Notes
    -----
    The cube files are read with ``load_cube_from_picklefile``, i.e. they are
    unpickled; only use this on cache files you trust. The combined table is
    also printed.
    """
    yearly_tables = []

    for year in range(start_year, end_year + 1):
        pickle_file_filepath = f"{base_dir}/{filtering_name}/WholeYear/cube_{year}.pkl"

        if not os.path.exists(pickle_file_filepath):
            print(f"File not found for year {year}: {pickle_file_filepath}")
            continue

        # Load the cube and convert its numeric time points to date objects.
        full_year_cube = load_cube_from_picklefile(pickle_file_filepath)
        time_coord = full_year_cube.coord('time')
        cube_times = time_coord.units.num2date(time_coord.points)

        year_table = _missing_table_for_year(year, cube_times)
        if year_table is not None:
            yearly_tables.append(year_table)

    # Concatenate once at the end instead of growing a DataFrame inside the
    # loop (which re-copies all earlier rows on every iteration and relies on
    # concatenating with an empty frame).
    if yearly_tables:
        combined_table = pd.concat(yearly_tables)
    else:
        combined_table = pd.DataFrame(
            columns=_MISSING_TABLE_COLUMNS,
            index=pd.Index([], name='Month'),
        )

    # Print the combined table
    print("Combined missing values per month for all years with percentages:")
    print(combined_table)
    return combined_table

# Example usage: build the per-month missing-timestep table for the years
# 2006 to 2020 (both inclusive) from the cached yearly cubes stored under the
# chosen filtering sub-directory. The result is kept in a module-level
# variable so later cells can reuse it.
filtering_name = "filtered_100"  # Replace with your filtering name
combined_missing_table = make_missing_table_all_years(2006, 2020, filtering_name)

Combined missing values per month for all years with percentages:
         Missing Count  Total Timesteps  Percentage Missing
Month                                                      
2006-01             18             1488               1.210
2006-02            483             1344              35.938
2006-03             88             1488               5.914
2006-04            131             1440               9.097
2006-05             12             1488               0.806
...                ...              ...                 ...
2019-11             28             1440               1.944
2019-12              3             1488               0.202
2020-02             53             1392               3.807
2020-03            476             1488              31.989
2020-07             70             1488               4.704

[145 rows x 3 columns]


Month
2006-01    1.210
2006-02   35.938
2006-03    5.914
2006-04    9.097
2006-05    0.806
           ...  
2019-11    1.944
2019-12    0.202
2020-02    3.807
2020-03   31.989
2020-07    4.704
Name: Percentage Missing, Length: 145, dtype: float64

In [37]:
# Save the combined per-month missing-data table as CSV. The path is relative,
# so the file is written to the kernel's current working directory.
combined_missing_table.to_csv("missing_vals.csv")