Import relevant Libaries 

In [1]:
# Base libraries
import pandas as pd
import numpy as np
import os
import math as math
import datetime
from scipy import stats
import prettytable as pr

# Visualizations
import plotly.express as px
import plotly.graph_objects as go

**Helper Functions**

In [27]:
import glob

def _get_data(source):
    # Define the directory path where the CSV files are located
    if source == 'GC_net':
        input_path =r'../data/gc_net/daily_data/'
    elif source == 'promice':
        input_path =r'../data/promice/new_promice/daily_data/'
    else: 
        raise ValueError("Only 'GC Net' & 'Promice' are accepted input values")
    
    pattern = '*.csv'
    #staion_list = ['SCO_L.csv','KAN_L.csv']

    # Use glob to get a list of all files that match the pattern
    filenames = glob.glob(input_path + pattern)

    data_collection = []
    for file in filenames:
        data = pd.read_csv(file, index_col=False)
        data_collection.append(data)
    data = pd.concat(data_collection)

    data = data.reset_index(drop=True)
    return data
df = _get_data('promice')
data = _get_data('promice')
def _load_data(data):
    df_out = df.copy()
    
    return df_out


def _align_GC_PR():
    station = "stid"
    datetime = "Datetime"
    dayofcentury = "DayOfCentury"
    dayofyear = 'DayOfYear'
    return station,datetime,dayofcentury,dayofyear

def _exclude():
    # List of columns to exclude from percentile calculation
    exclude = ['Year', 'MonthOfYear', 'DayOfMonth', 'HourOfDay(UTC)', 
               'DayOfYear', 'LongitudeGPS(degW)','HeightStakes(m)',
               'DayOfCentury', 'WindDirection(d)', 'TiltToEast(d)', 
               'TiltToNorth(d)', 'TimeGPS(hhmmssUTC)', 'LatitudeGPS(degN)', 
               'ElevationGPS(m)', 'HorDilOfPrecGPS', 'LoggerTemperature(C)',
               'FanCurrent(mA)', 'BatteryVoltage(V)', 'Month', 'Day', 'Hour',

              'air_temperature_1_max', 'air_temperature_1_min',
              'wind_speed_u1_max','wind_speed_u2_max',
              'wind_from_direction_1', 'wind_from_direction_2', 
              'height_wind_sensor_1', 'height_wind_sensor_2', 'battery_voltage',
              'shortwave_incoming_radiation_max',
              'shortwave_incoming_radiation_stdev', 'net_radiation_stdev',
              'air_temperature_2_max', 'air_temperature_2_min', 
              'wind_speed_u2_stdev', 'ref_temperature',   'wind_speed_u1_stdev',
              'net_radiation_maximum', 'season', 'year', 'month', 'DayOfYear',
              'DayOfCentury', 'Unnamed: 0']
    return exclude

def _subset_df(date, df, station, datetime, measurement, dayofcentury, dayofyear, aws):
    exclude = _exclude()
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='%Y-%m-%d')
    # Split date input into year, month, day
    year = int(date[0:4])
    month = int(date[5:7])
    day = int(date[8:10])



    date_df = df.loc[(df['Datetime'].dt.year == year) & (df['Datetime'].dt.month == month) & (df['Datetime'].dt.day == day)]

    # Find the day of year
    day =  date_df["DayOfYear"].mean()
    
    # find day of century
    day_century = date_df[dayofcentury].mean() 
    
    #Create a list of relevant columns
    columns = df.select_dtypes(include=[np.number]).columns.difference(exclude)

    ## Subset by measurement
        #df = df[[station,datetime, measurement, dayofcentury,dayofyear]]
    if measurement == "All":
        df = df
    elif measurement in columns:
        # subset by measurement
        df = df[[station,datetime, measurement, dayofcentury, dayofyear, 'year']]
        #Update list of relevant column
        columns = df.select_dtypes(include=[np.number]).columns.difference(exclude)
    else: 
        raise ValueError(f"The input for the variable 'measurement' was not recognizable. Please use one of the following options: {columns}")
    
    ## Subset by station
        #df = df.loc[df[station] == aws]
    #Create a list of unique files (stations) from the dataset 
    unique_files = list(df[station].unique())

    if aws == "All":
        df = df
    elif aws in unique_files:
        # filter by aws
        df = df.loc[df[station] == aws]
        #Update list of unique files (stations) from the dataset 
        unique_files = list(df[station].unique())
        #display(df)
    else: 
        raise ValueError(f"The input for the variable 'aws' was not recognizable. Please use one of the following options: {unique_files}")
    
    return day, df,columns, day_century, unique_files

def _subset_scope(scope, window, df,date, day):
    
    def summarize_df(df,min_values , window):
        #Number by observation period (avoids classifying on year if 30 day period spans across Dec + Jan)
        df['group_number'] = (df
                              .groupby('file',group_keys=False)['Datetime']
                              .apply(lambda x: (x - x.shift(1)).fillna(pd.Timedelta(seconds=0)).dt.days.gt(2).cumsum())
        )
        if window == 'year':
            df['group_number'] = df['year'].astype(int).copy()
            
        grouped = df.groupby(['file', 'group_number'], group_keys=False)
        
        #Adding column so all grouped observations has same year
        df['year'] = grouped['Datetime'].transform(lambda x: x.max().year)

        df['target_date'] = grouped['Datetime'].transform(lambda x: (x ==date).max())
        #display(df.head(20))
        # Specify columns containing numerical values to be averaged
        columns_to_average = (df
            .select_dtypes(exclude=['object'])
            .drop(columns=['Datetime', 'DayOfYear', 'DayOfCentury', 'group_number', 'year', 'target_date'])
            .columns)
        
        # Calculate the number of non-NaN values for each variable within each group
        valid_date_observations = grouped[columns_to_average].apply(lambda x: x.notna().sum() <= min_values)

        # Calculate average per day per station
        df_filtered = grouped[columns_to_average].mean()

        #Remove means with less than 20 observations per day
        df_masked = df_filtered.mask(valid_date_observations, np.nan)

        df_masked =  df_masked.merge(df[['file','group_number','year', 'target_date']].drop_duplicates(), how='left',on=['file','group_number'])
        
        df = df_masked.reset_index(drop=True).drop('group_number',axis=1)
        return df

    #set the range of days
    week = datetime.datetime.strptime(date,'%Y-%m-%d').date().isocalendar()[1]
    month = datetime.datetime.strptime(date,'%Y-%m-%d').month
    sliding_7_day = [x+366 if x < 0 else x for x in np.arange(day-6,day+1).tolist()] 
    sliding_30_day = [x+366 if x < 0 else x for x in np.arange(day-29, day+1).tolist()]
    year = [x for x in np.arange(0, day+1).tolist()]
    
    #exclude data from after the given date
    #df = df.loc[df['Datetime'] <= date]
    
    if scope == "Relative":
        if window == 'day':
        # filter by calender day
            df = df.loc[df['DayOfYear'] == day]
            df = summarize_df(df, 0, window)

        elif window == 'week':
            df = df.loc[df['Datetime'].dt.isocalendar().week.astype(int) == week]
            df = summarize_df(df, 5, window)
        
        elif window == 'sliding_avg_7':
            df = df.loc[df['DayOfYear'].isin(sliding_7_day)]
            df = summarize_df(df, 5, window)
            
        elif window == 'month':
            df = df.loc[(df['Datetime'].dt.month == month)]
            df = summarize_df(df, 24, window)

        elif window == 'sliding_avg_30':
            df = df.loc[df['DayOfYear'].isin(sliding_30_day)]
            df = summarize_df(df, 24, window)
  
        elif window == 'year':
            df = df.loc[df['DayOfYear'].isin(year)]
            df = summarize_df(df, day*0.70, window='year')

        else:
            raise ValueError(f"The input for the variable 'window' was not recognizable. Please use one of the following options: 'day', 'week', 'month', 'year', 'sliding_avg_7' or 'sliding_avg30'")

    elif scope == "Absolute":
        df = df
    else: 
        raise ValueError("The input for the variable 'scope' was not recognizable. Please use one of the following options: 'Relative' , 'Absolute'")
    return df

def _percentiles(df, unique_files, station, columns):

        #Create an empty list to hold the percentile values
    percentiles = []

    print("Calculating Percentiles .... ")
    #Loop through each file in the unique_files list
    for i, file in enumerate(unique_files):
        
        #Calculate the percentile of each numerical column for the specified datetime
        df_file = df[df[station] == file]
        #display(df_file)
        #Create an empty dictionary
        percentile_dict = {}
        for col in columns:
          #Looping through each row of the dataframe
          for index, row in df_file.iterrows():
            #row_date = row[dayofcentury]
            #print('row_date',row_date,'DAY_CENTURY', day_century, 'FILE:', file)
            if row['target_date'] == True and ~ np.isnan(row[col]):
            #if ~ np.isnan(row[col]):
                # Retrieving Value 
                #print('passed')
                value = row[col]
                
                # Excluding NAN's for the calculation
                col_list = df_file[col].dropna().values.tolist()
                
                # Calculate the Percentiles
                percentile = stats.percentileofscore(col_list,value, kind = "mean")
                #Count the number of values 
                count = len(col_list)
                #Median of the number of values 
                median = np.nanmedian(col_list, axis=0)
            
                # Assign file, value and, percentile to dictionary
                percentile_dict[col] = row[col]
                percentile_dict[f"{col}_pcte"] = percentile
                percentile_dict[f"{col}_n"] = count
                percentile_dict[f"{col}_median"] = median
                
        percentile_dict["Station"] = file

          #Add the percentile dictionary to the list
        percentiles.append({'Station': file,**percentile_dict})
    
    print("Finished Calculating Percentiles")
    
    #Create a dataframe from the list of dictionaries
    percentiles_df = pd.DataFrame(percentiles)

    print("Transforming Output...")

    return percentiles_df

def _transform_percentiles(percentiles_df):
    
    # Define a list of all the columns in the original dataframe
    columns_list = percentiles_df.columns
    
    # Split the list into three parts based on which columns have '_pcte', '_n', _median in the name
    century_list = [i for i in columns_list if '_pcte' in i]
    number_list = [i for i in columns_list if '_n' in i]
    median_list = [i for i in columns_list if '_median' in i]
    
    # Select the columns which do not have '_pcte' and '_n'
    value_list = [i for i in columns_list if i not in century_list and i not in number_list and i not in median_list and i not in "Station"] 
    # Build the new dataframe from the lists
    transformed_df = pd.DataFrame(columns=['Station', 'Measurement', 'Percentile', 'Number of Comparison Values', 'Original Value', 'Median'])
    # Loop through each entry in the original dataframe
    for row in percentiles_df.iterrows():
        # Take the Station value and loop through all of the remaining values
        station_val = row[1]['Station']
        for value, century, number, median in zip(value_list, century_list, number_list,median_list):
            # Create a new entry for the transformed_df
            new_entry = [station_val, value, row[1][century], row[1][number], row[1][value], row[1][median]]
            transformed_df.loc[len(transformed_df)] = new_entry
     
    # Filter out extreme values       
    transformed_df = transformed_df[(transformed_df["Percentile"] > 90) | (transformed_df["Percentile"] < 10)].reset_index(drop=True)
    
    return transformed_df

**visualization**

In [4]:
start_date = df['Datetime'].min()
end_date= df['Datetime'].max()

# Generate the date range with the first day of each month
last_day_of_year_range = pd.date_range(f"{start_date}", f"{end_date}", freq='A-DEC')
print(last_day_of_year_range)

for i in last_day_of_year_range:
    #generate annual figures
    #somecode
    pass

first_day_date_range = pd.date_range(start_date, end_date, freq='MS')
last_day_of_month_range = first_day_date_range - pd.Timedelta(days=1)
for i in last_day_of_month_range:
    #generate monthly figures
    #somecode
    pass

date_range = pd.date_range(start_date, end_date, freq='W-SUN')


print(last_day_of_year_range)

print(first_day_date_range)

DatetimeIndex(['1990-12-31', '1991-12-31', '1992-12-31', '1993-12-31',
               '1994-12-31', '1995-12-31', '1996-12-31', '1997-12-31',
               '1998-12-31', '1999-12-31', '2000-12-31', '2001-12-31',
               '2002-12-31', '2003-12-31', '2004-12-31', '2005-12-31',
               '2006-12-31', '2007-12-31', '2008-12-31', '2009-12-31',
               '2010-12-31', '2011-12-31', '2012-12-31', '2013-12-31',
               '2014-12-31', '2015-12-31', '2016-12-31', '2017-12-31',
               '2018-12-31', '2019-12-31', '2020-12-31', '2021-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')
DatetimeIndex(['1990-12-31', '1991-12-31', '1992-12-31', '1993-12-31',
               '1994-12-31', '1995-12-31', '1996-12-31', '1997-12-31',
               '1998-12-31', '1999-12-31', '2000-12-31', '2001-12-31',
               '2002-12-31', '2003-12-31', '2004-12-31', '2005-12-31',
               '2006-12-31', '2007-12-31', '2008-12-31', '2009-12-31',
               '2010-12-

**Report Functions**

In [5]:
def get_data(data, measurement, aws, date = datetime.datetime.today().strftime('%Y-%m-%d') 
                   ,scope = "Relative", window = 'day', output = "Report"):
    """
    Function to return the underlying dataset of specified values given a selected date.
    
    Parameters:
    data (str): The dataset to be used.
    date (str): The date of the observations for which the percentile is calculated.
    aws (str): The automatic weather station.
    measurement (str): The measurement. 
    scope (str): Relative: values are compared to historical values of the day of year. 
                 Absolute: values are compared to historical values.

    output (str): Output as pd.df or as printed report. 
  
    """
    
    # Load Data
    df = _load_data(data)
    
    # Align GC Net & PROMICE Columns  
    station, datetime, dayofcentury, dayofyear = _align_GC_PR()
    
    # Subset Data (date, measurement, station)
    day, df,columns, day_century, unique_files = _subset_df(date,df, station, datetime, measurement, dayofcentury, dayofyear, aws)
    
    # Subset Data (Scope)
    df = _subset_scope(scope, window, df, date, day)
    
    df = df.reset_index(drop=True).drop('target_date', axis=1)

    ##### Output #######
    if output == "Report":
        from tabulate import tabulate
        #### Report OUTPUT ###########
        print(
          f"  Selected Date: {date} \n" ,
          f"Selected Station: {aws} \n" , 
          f"Selected Measurement: {measurement} \n"  , 
          f"Selected Data: {data} \n" ,
          f"Selected Scope: {scope} \n" ,
          "----------------------------------------------------------------------------------------------------------------------\n",
          f"                   Climatology Report\n" 
          )        
        print(tabulate(df, headers='keys', tablefmt='psql'))
        #return df

    elif output == "Data":
        return df
    else: 
        raise ValueError("The input for the variable 'output' was not recognizable. Please use one of the following options: 'Report', 'Data'" )


    


In [6]:
def daily_report(data, date = datetime.datetime.today().strftime('%Y-%m-%d'), 
                 aws = "All", measurement = "All", scope = "Relative", window = 'day', output = "Report", aggregated = False):
    """
    Function to return the percentile of specified values given a selected date.
    
    Parameters:
    data (str): The dataset to be used.
    date (str): The date of the observations for which the percentile is calculated.
    aws (str): The automatic weather station.
    measurement (str): The measurement. 
    scope (str): Relative: values are compared to historical values of the day of year. 
                 Absolute: values are compared to historical values.
    aggregated (bool): False: output is not aggregated.
                       True: averages of stations are calculated.

    output (str): Output as pd.df or as printed report. 
  
    """
    # Load Data
    df = _load_data(data)
    
    # Align GC Net & PROMICE Columns  
    station, datetime, dayofcentury, dayofyear = _align_GC_PR()
 
    #Account for Aggregate variable
    if aggregated == False:
        df = df
    elif aggregated == True:
        # Group df by station 
        df = df ##### TBD #######
    else: 
        raise ValueError(f"The input for the variable 'aggregated' was not recognizable. Please use one of the following options: True, False")

    # Subset Data (date, measurement, station)
    #day, df = _subset_df(date,df, station, datetime, measurement, dayofcentury, dayofyear, aws)
    day, df,columns, day_century, unique_files = _subset_df(date, df, station, datetime, measurement, dayofcentury, dayofyear, aws)
    
    # Subset Data (Scope)
    df = _subset_scope(scope, window, df, date, day)
    
    # Calculate Percentiles
    percentiles_df = _percentiles(df, unique_files, station, columns)
  
    # Transform Output
    transformed_df = _transform_percentiles(percentiles_df)
    
    print("Finished")
    print("----------------------------------------------------------------------------------------------------------------------\n")

   # print(tabulate(percentiles_df, headers='keys', tablefmt='psql'))
   # return percentiles_df
    if output == "Report":
        from tabulate import tabulate
        #### Report OUTPUT ###########
        print(
          f"  Selected Date: {date} \n" ,
          f"Selected Station: {aws} \n" , 
          f"Selected Measurement: {measurement} \n"  , 
          f"Selected Data: {data} \n" ,
          f"Selected Scope: {scope} \n" ,
          "----------------------------------------------------------------------------------------------------------------------\n",
          f"                   Climatology Report\n" 
          )        
        print(tabulate(transformed_df, headers='keys', tablefmt='psql'))
       
        input_ = input("Do you want to include graphics? (Y/N)")
        if input_ == "Y":
          ########## TBD: Replace with boxplot function #################
          print("Functionality is in development")
   
    elif output == "Data":
        print()
        return transformed_df
    else: 
        raise ValueError("The input for the variable 'output' was not recognizable. Please use one of the following options: 'Report', 'Data'" )

In [7]:
daily_report(data = "GC Net", date = "2019-01-13", measurement= "All", scope='Relative', aws="All", window = 'week', output = "Data")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['group_number'] = (df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = grouped['Datetime'].transform(lambda x: x.max().year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target_date'] = grouped['Datetime'].transform(lambda x: (x ==date).max())


Calculating Percentiles .... 
Finished Calculating Percentiles
Transforming Output...
Finished
----------------------------------------------------------------------------------------------------------------------




Unnamed: 0,Station,Measurement,Percentile,Number of Comparison Values,Original Value,Median
0,Saddle,air_pressure,92.857143,21.0,732.732857,722.094286
1,Saddle,relative_humidity_1,92.857143,21.0,93.227143,75.432857
2,Saddle,relative_humidity_1_cor,92.857143,21.0,118.825714,99.514286
3,Saddle,snow_depth_2,91.666667,18.0,19.721429,11.287857
4,South Dome,air_temperature_2,92.5,20.0,-22.34,-28.014286
5,South Dome,relative_humidity_1_cor,97.5,20.0,117.535714,101.531429
6,South Dome,shortwave_incoming_radiation,91.666667,18.0,5.965714,4.651905
7,South Dome,specific_humidity_1,93.75,8.0,0.875714,0.493571
8,South Dome,relative_humidity_2,91.666667,18.0,85.607143,77.379286
9,South Dome,relative_humidity_2_cor,97.222222,18.0,106.274286,97.458571


In [8]:
x = get_data(data = "GC Net", date = "2019-01-13", measurement= "relative_humidity_1", scope='Relative', output = "Data", aws="Swiss Camp 10m", window='day')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['group_number'] = (df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = grouped['Datetime'].transform(lambda x: x.max().year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target_date'] = grouped['Datetime'].transform(lambda x: (x ==date).max())


<h2>Extreme Weather Events Detection</h2>
<p>Meterological outliers for 2019-01-13, measured as mean of week compared to similar period previous years</p>

<hr />

<h3>Outliers by percentiler (filtered top/bottom 10%)</h3>

|    | Station        | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |        Median |
|---:|:---------------|:-----------------------------|-------------:|------------------------------:|-----------------:|--------------:|
|  0 | DYE2           | relative_humidity_1_cor      |      7.5     |                            20 |       91.2029    |  95.4807      |
|  1 | DYE2           | snow_depth_1                 |     97.5     |                            20 |       13.38      |   8.96643     |
|  2 | DYE2           | snow_depth_2                 |     97.5     |                            20 |       15.2043    |  10.4579      |
|  3 | NASA-U         | shortwave_incoming_radiation |     90.625   |                            16 |        0.0214286 |   0.000714286 |
|  4 | NASA-U         | shortwave_outgoing_radiation |     97.0588  |                            17 |        0.0128571 |   0           |
|  5 | NASA-U         | snow_depth_1                 |     96.6667  |                            15 |       18.3814    |   9.90571     |
|  6 | NASA-U         | snow_depth_2                 |     96.6667  |                            15 |       18.0643    |   9.30429     |
|  7 | South Dome     | air_temperature_2            |     91.6667  |                            18 |      -22.34      | -28.0143      |
|  8 | South Dome     | relative_humidity_1          |     91.6667  |                            18 |       93.7871    |  76.4064      |
|  9 | South Dome     | relative_humidity_1_cor      |     97.2222  |                            18 |      117.536     | 101.948       |
| 10 | South Dome     | shortwave_incoming_radiation |     90.625   |                            16 |        5.96571   |   4.63048     |
| 11 | South Dome     | specific_humidity_1          |     92.8571  |                             7 |        0.875714  |   0.495714    |
| 12 | South Dome     | relative_humidity_2          |     90.625   |                            16 |       85.6071    |  77.3793      |
| 13 | South Dome     | relative_humidity_2_cor      |     96.875   |                            16 |      106.274     |  99.8864      |
| 14 | South Dome     | specific_humidity_2          |     91.6667  |                             6 |        0.85      |   0.581429    |
| 15 | Saddle         | air_pressure                 |     92.1053  |                            19 |      732.733     | 722.094       |
| 16 | Saddle         | air_temperature_2            |     92.5     |                            20 |      -24.9329    | -30.3336      |
| 17 | Saddle         | relative_humidity_1          |     97.3684  |                            19 |       93.2271    |  75.4329      |
| 18 | Saddle         | relative_humidity_1_cor      |     97.3684  |                            19 |      118.826     |  97.5843      |
| 19 | Saddle         | snow_depth_1                 |     91.1765  |                            17 |       18.3371    |  10.1514      |
| 20 | Saddle         | snow_depth_2                 |     97.0588  |                            17 |       19.7214    |  10.6814      |
| 21 | NASA-SE        | shortwave_incoming_radiation |     90.625   |                            16 |        2.86143   |   1.70429     |
| 22 | NASA-SE        | snow_depth_1                 |     96.4286  |                            14 |       33.33      |  10.0693      |
| 23 | NASA-SE        | snow_depth_2                 |     96.4286  |                            14 |       31.76      |  11.1343      |
| 24 | NASA-SE        | relative_humidity_2_cor      |     96.6667  |                            15 |      112.553     | 100.831       |
| 25 | Swiss Camp 10m | relative_humidity_2_cor      |     90.625   |                            16 |       88.5957    |  81.6657      |
| 26 | Tunu-N         | shortwave_incoming_radiation |     96.1538  |                            13 |        0.127143  |   0.0485714   |
| 27 | Tunu-N         | snow_depth_1                 |     96.4286  |                            14 |        7.81286   |   3.25571     |
| 28 | Tunu-N         | snow_depth_2                 |     96.4286  |                            14 |        7.54      |   3.41571     |
| 29 | Swiss Camp     | relative_humidity_1_cor      |     91.1765  |                            17 |      109.02      | 100.726       |
| 30 | Swiss Camp     | relative_humidity_2_cor      |     96.875   |                            16 |      115.697     | 103.655       |
| 31 | Swiss Camp     | snow_temperature_10          |     91.1765  |                            17 |       -7.90857   |  -9.63714     |
| 32 | Swiss Camp     | snow_temperature_4           |     96.875   |                            16 |       -4.66857   |  -8.19357     |
| 33 | Swiss Camp     | snow_temperature_6           |     91.1765  |                            17 |       -5.19      |  -8.11714     |
| 34 | Swiss Camp     | snow_temperature_7           |     90.625   |                            16 |       -5.68429   |  -8.19286     |
| 35 | Swiss Camp     | snow_temperature_8           |     96.6667  |                            15 |       -4.57143   |  -8.52857     |
| 36 | Swiss Camp     | snow_temperature_9           |     90.625   |                            16 |       -7.11857   |  -8.59        |
| 37 | NEEM           | air_temperature_1            |      5       |                            10 |      -45.6529    | -36.1943      |
| 38 | NEEM           | air_temperature_2            |      5       |                            10 |      -45.0371    | -35.9807      |
| 39 | NEEM           | relative_humidity_2          |      5.55556 |                             9 |       58.2257    |  68.7543      |
| 40 | NEEM           | relative_humidity_2_cor      |      5.55556 |                             9 |       88.9786    |  97.0271      |

<details> <summary>Station: DYE2, Measurement: relative_humidity_1_cor</summary> 
 <br/> <pre>|    | Station   | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
|  0 | DYE2      | relative_humidity_1_cor |          7.5 |                            20 |          91.2029 |  95.4807 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/DYE2_relative_humidity_1_cor.png" /></p> 
 </details>

<details> <summary>Station: DYE2, Measurement: snow_depth_1</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
|  1 | DYE2      | snow_depth_1  |         97.5 |                            20 |            13.38 |  8.96643 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/DYE2_snow_depth_1.png" /></p> 
 </details>

<details> <summary>Station: DYE2, Measurement: snow_depth_2</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
|  2 | DYE2      | snow_depth_2  |         97.5 |                            20 |          15.2043 |  10.4579 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/DYE2_snow_depth_2.png" /></p> 
 </details>

<details> <summary>Station: NASA-SE, Measurement: relative_humidity_2_cor</summary> 
 <br/> <pre>|    | Station   | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 24 | NASA-SE   | relative_humidity_2_cor |      96.6667 |                            15 |          112.553 |  100.831 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-SE_relative_humidity_2_cor.png" /></p> 
 </details>

<details> <summary>Station: NASA-SE, Measurement: shortwave_incoming_radiation</summary> 
 <br/> <pre>|    | Station   | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:-----------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 21 | NASA-SE   | shortwave_incoming_radiation |       90.625 |                            16 |          2.86143 |  1.70429 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-SE_shortwave_incoming_radiation.png" /></p> 
 </details>

<details> <summary>Station: NASA-SE, Measurement: snow_depth_1</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 22 | NASA-SE   | snow_depth_1  |      96.4286 |                            14 |            33.33 |  10.0693 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-SE_snow_depth_1.png" /></p> 
 </details>

<details> <summary>Station: NASA-SE, Measurement: snow_depth_2</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 23 | NASA-SE   | snow_depth_2  |      96.4286 |                            14 |            31.76 |  11.1343 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-SE_snow_depth_2.png" /></p> 
 </details>

<details> <summary>Station: NASA-U, Measurement: shortwave_incoming_radiation</summary> 
 <br/> <pre>|    | Station   | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |      Median |
|---:|:----------|:-----------------------------|-------------:|------------------------------:|-----------------:|------------:|
|  3 | NASA-U    | shortwave_incoming_radiation |       90.625 |                            16 |        0.0214286 | 0.000714286 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-U_shortwave_incoming_radiation.png" /></p> 
 </details>

<details> <summary>Station: NASA-U, Measurement: shortwave_outgoing_radiation</summary> 
 <br/> <pre>|    | Station   | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:-----------------------------|-------------:|------------------------------:|-----------------:|---------:|
|  4 | NASA-U    | shortwave_outgoing_radiation |      97.0588 |                            17 |        0.0128571 |        0 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-U_shortwave_outgoing_radiation.png" /></p> 
 </details>

<details> <summary>Station: NASA-U, Measurement: snow_depth_1</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
|  5 | NASA-U    | snow_depth_1  |      96.6667 |                            15 |          18.3814 |  9.90571 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-U_snow_depth_1.png" /></p> 
 </details>

<details> <summary>Station: NASA-U, Measurement: snow_depth_2</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
|  6 | NASA-U    | snow_depth_2  |      96.6667 |                            15 |          18.0643 |  9.30429 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NASA-U_snow_depth_2.png" /></p> 
 </details>

<details> <summary>Station: NEEM, Measurement: air_temperature_1</summary> 
 <br/> <pre>|    | Station   | Measurement       |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------|-------------:|------------------------------:|-----------------:|---------:|
| 37 | NEEM      | air_temperature_1 |            5 |                            10 |         -45.6529 | -36.1943 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NEEM_air_temperature_1.png" /></p> 
 </details>

<details> <summary>Station: NEEM, Measurement: air_temperature_2</summary> 
 <br/> <pre>|    | Station   | Measurement       |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------|-------------:|------------------------------:|-----------------:|---------:|
| 38 | NEEM      | air_temperature_2 |            5 |                            10 |         -45.0371 | -35.9807 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NEEM_air_temperature_2.png" /></p> 
 </details>

<details> <summary>Station: NEEM, Measurement: relative_humidity_2</summary> 
 <br/> <pre>|    | Station   | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 39 | NEEM      | relative_humidity_2 |      5.55556 |                             9 |          58.2257 |  68.7543 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NEEM_relative_humidity_2.png" /></p> 
 </details>

<details> <summary>Station: NEEM, Measurement: relative_humidity_2_cor</summary> 
 <br/> <pre>|    | Station   | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 40 | NEEM      | relative_humidity_2_cor |      5.55556 |                             9 |          88.9786 |  97.0271 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/NEEM_relative_humidity_2_cor.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: air_pressure</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 15 | Saddle    | air_pressure  |      92.1053 |                            19 |          732.733 |  722.094 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_air_pressure.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: air_temperature_2</summary> 
 <br/> <pre>|    | Station   | Measurement       |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------|-------------:|------------------------------:|-----------------:|---------:|
| 16 | Saddle    | air_temperature_2 |         92.5 |                            20 |         -24.9329 | -30.3336 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_air_temperature_2.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: relative_humidity_1</summary> 
 <br/> <pre>|    | Station   | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 17 | Saddle    | relative_humidity_1 |      97.3684 |                            19 |          93.2271 |  75.4329 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_relative_humidity_1.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: relative_humidity_1_cor</summary> 
 <br/> <pre>|    | Station   | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 18 | Saddle    | relative_humidity_1_cor |      97.3684 |                            19 |          118.826 |  97.5843 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_relative_humidity_1_cor.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: snow_depth_1</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 19 | Saddle    | snow_depth_1  |      91.1765 |                            17 |          18.3371 |  10.1514 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_snow_depth_1.png" /></p> 
 </details>

<details> <summary>Station: Saddle, Measurement: snow_depth_2</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 20 | Saddle    | snow_depth_2  |      97.0588 |                            17 |          19.7214 |  10.6814 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Saddle_snow_depth_2.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: air_temperature_2</summary> 
 <br/> <pre>|    | Station    | Measurement       |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:------------------|-------------:|------------------------------:|-----------------:|---------:|
|  7 | South Dome | air_temperature_2 |      91.6667 |                            18 |           -22.34 | -28.0143 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_air_temperature_2.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: relative_humidity_1</summary> 
 <br/> <pre>|    | Station    | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
|  8 | South Dome | relative_humidity_1 |      91.6667 |                            18 |          93.7871 |  76.4064 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_relative_humidity_1.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: relative_humidity_1_cor</summary> 
 <br/> <pre>|    | Station    | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
|  9 | South Dome | relative_humidity_1_cor |      97.2222 |                            18 |          117.536 |  101.948 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_relative_humidity_1_cor.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: relative_humidity_2</summary> 
 <br/> <pre>|    | Station    | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 12 | South Dome | relative_humidity_2 |       90.625 |                            16 |          85.6071 |  77.3793 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_relative_humidity_2.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: relative_humidity_2_cor</summary> 
 <br/> <pre>|    | Station    | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 13 | South Dome | relative_humidity_2_cor |       96.875 |                            16 |          106.274 |  99.8864 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_relative_humidity_2_cor.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: shortwave_incoming_radiation</summary> 
 <br/> <pre>|    | Station    | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-----------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 10 | South Dome | shortwave_incoming_radiation |       90.625 |                            16 |          5.96571 |  4.63048 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_shortwave_incoming_radiation.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: specific_humidity_1</summary> 
 <br/> <pre>|    | Station    | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 11 | South Dome | specific_humidity_1 |      92.8571 |                             7 |         0.875714 | 0.495714 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_specific_humidity_1.png" /></p> 
 </details>

<details> <summary>Station: South Dome, Measurement: specific_humidity_2</summary> 
 <br/> <pre>|    | Station    | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 14 | South Dome | specific_humidity_2 |      91.6667 |                             6 |             0.85 | 0.581429 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/South Dome_specific_humidity_2.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: relative_humidity_1_cor</summary> 
 <br/> <pre>|    | Station    | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 29 | Swiss Camp | relative_humidity_1_cor |      91.1765 |                            17 |           109.02 |  100.726 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_relative_humidity_1_cor.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: relative_humidity_2_cor</summary> 
 <br/> <pre>|    | Station    | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 30 | Swiss Camp | relative_humidity_2_cor |       96.875 |                            16 |          115.697 |  103.655 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_relative_humidity_2_cor.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_10</summary> 
 <br/> <pre>|    | Station    | Measurement         |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:--------------------|-------------:|------------------------------:|-----------------:|---------:|
| 31 | Swiss Camp | snow_temperature_10 |      91.1765 |                            17 |         -7.90857 | -9.63714 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_10.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_4</summary> 
 <br/> <pre>|    | Station    | Measurement        |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-------------------|-------------:|------------------------------:|-----------------:|---------:|
| 32 | Swiss Camp | snow_temperature_4 |       96.875 |                            16 |         -4.66857 | -8.19357 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_4.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_6</summary> 
 <br/> <pre>|    | Station    | Measurement        |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-------------------|-------------:|------------------------------:|-----------------:|---------:|
| 33 | Swiss Camp | snow_temperature_6 |      91.1765 |                            17 |            -5.19 | -8.11714 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_6.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_7</summary> 
 <br/> <pre>|    | Station    | Measurement        |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-------------------|-------------:|------------------------------:|-----------------:|---------:|
| 34 | Swiss Camp | snow_temperature_7 |       90.625 |                            16 |         -5.68429 | -8.19286 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_7.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_8</summary> 
 <br/> <pre>|    | Station    | Measurement        |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-------------------|-------------:|------------------------------:|-----------------:|---------:|
| 35 | Swiss Camp | snow_temperature_8 |      96.6667 |                            15 |         -4.57143 | -8.52857 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_8.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp, Measurement: snow_temperature_9</summary> 
 <br/> <pre>|    | Station    | Measurement        |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:-----------|:-------------------|-------------:|------------------------------:|-----------------:|---------:|
| 36 | Swiss Camp | snow_temperature_9 |       90.625 |                            16 |         -7.11857 |    -8.59 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp_snow_temperature_9.png" /></p> 
 </details>

<details> <summary>Station: Swiss Camp 10m, Measurement: relative_humidity_2_cor</summary> 
 <br/> <pre>|    | Station        | Measurement             |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:---------------|:------------------------|-------------:|------------------------------:|-----------------:|---------:|
| 25 | Swiss Camp 10m | relative_humidity_2_cor |       90.625 |                            16 |          88.5957 |  81.6657 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Swiss Camp 10m_relative_humidity_2_cor.png" /></p> 
 </details>

<details> <summary>Station: Tunu-N, Measurement: shortwave_incoming_radiation</summary> 
 <br/> <pre>|    | Station   | Measurement                  |   Percentile |   Number of Comparison Values |   Original Value |    Median |
|---:|:----------|:-----------------------------|-------------:|------------------------------:|-----------------:|----------:|
| 26 | Tunu-N    | shortwave_incoming_radiation |      96.1538 |                            13 |         0.127143 | 0.0485714 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Tunu-N_shortwave_incoming_radiation.png" /></p> 
 </details>

<details> <summary>Station: Tunu-N, Measurement: snow_depth_1</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 27 | Tunu-N    | snow_depth_1  |      96.4286 |                            14 |          7.81286 |  3.25571 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Tunu-N_snow_depth_1.png" /></p> 
 </details>

<details> <summary>Station: Tunu-N, Measurement: snow_depth_2</summary> 
 <br/> <pre>|    | Station   | Measurement   |   Percentile |   Number of Comparison Values |   Original Value |   Median |
|---:|:----------|:--------------|-------------:|------------------------------:|-----------------:|---------:|
| 28 | Tunu-N    | snow_depth_2  |      96.4286 |                            14 |             7.54 |  3.41571 |</pre> 
 <p><img alt="Boxplot" src="../figures/to_markdown/Tunu-N_snow_depth_2.png" /></p> 
 </details>



In [9]:
def filter_reports_by_year_month(directory, year, month):
    month_files = []
    for file in os.listdir(directory):
        if file.endswith(".md") and re.search(f"{year}_{month:02d}", file):  # Add the underscore and zero-padding for the month
            month_files.append(file)
    return month_files

def generate_year_month_collapsible(year, report_directory):
    year_collapsible = f"<details> <summary> {year} </summary> \n"
    year_report = f"{year}.md"
    if os.path.isfile(os.path.join(report_directory, year_report)):
        year_collapsible += md(f"[{year_report}](./{year_report})") + "\n"

    for month in range(1, 13):
        month_files = filter_reports_by_year_month(report_directory, year, month)
        if month_files:
            file_links = "\n".join([md(f"[{file}](./{file})") for file in month_files])
            month_collapsible = md(f"<details> <summary> {month} </summary> \n <br/> {file_links} \n </details>")
            year_collapsible += month_collapsible

    year_collapsible += "\n </details>"
    return md(year_collapsible)


In [10]:
from markdown import markdown as md
from tabulate import tabulate


#percentiles = daily_report(data = "GC Net", date = "2019-01-13", measurement= "All", scope='Relative', aws="All", window = 'week', output = "Data")
print(os.getcwd())
def generate_markdown(date='2019-01-13', window = 'month', filename='../markdown_reports/output_report_draft2.md', df=None):
    output = []
    def add_element(element):
        nonlocal output
        if isinstance(element, list):
            for i in element:
                output.append(i)
        else:
            output.append(element)
        #percentiles['Percentile'] = percentiles['Percentile'].div(100).map('{:.1%}'.format)

    def make_report(elements, filename):
        with open(filename, 'w') as f:
            for element in elements:
                f.write(element)
                f.write('\n\n')
    
    if isinstance(df,pd.DataFrame):
        percentiles = df
    else:
        percentiles = daily_report(data = "GC Net", date = date, measurement= "All", scope='Relative', aws="All", window = window, output = "Data")
    percentiles_out = (percentiles
                       #.assign(Percentile=(percentiles['Percentile']
                       #                    .div(100)
                       #                    .map('{:.1%}'.format)))
                       .to_markdown()
    )

    #percentiles_out = percentiles.to_markdown(tablefmt='grid')
    header = md('## Extreme Weather Events Detection \n Meterological outliers for {d}, measured as mean of {w} compared to similar period previous years'.format(d=date,w=window))
    line = md('-------------------------------------------------------------------------------------------------')
    body1 = md('### Outliers by percentiler (filtered top/bottom 10%)')
    add_element([header, line, body1, percentiles_out])

    picture_folder = ()
    row_out = percentiles.iloc[0,:].to_markdown()
    figure_out = md('![Boxplot](/../GEUS-Master-Thesis/figures/to_markdown/fig1.png)')#.format(date=date,aws='relative',measurement='relative_humidity_2_cor_file'))
    print(figure_out)
    add_element([row_out,figure_out])
    #GEUS-Master-Thesis\figures\to_markdown\box_plot_2019-01-13_relative_humidity_2_cor_file.png
    collapse_element = md('<details> <summary>This collapsible can contain more detailed information </summary> \n <br/> insert text here  \n </details>''')
    add_element(collapse_element)
    

    make_report(output, filename)
generate_markdown()


c:\Users\mabj16ac\Desktop\Thesis\GEUS-Master-Thesis\scripts


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['group_number'] = (df
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year'] = grouped['Datetime'].transform(lambda x: x.max().year)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target_date'] = grouped['Datetime'].transform(lambda x: (x ==date).max())


Calculating Percentiles .... 
Finished Calculating Percentiles
Transforming Output...
Finished
----------------------------------------------------------------------------------------------------------------------


<p><img alt="Boxplot" src="/../GEUS-Master-Thesis/figures/to_markdown/fig1.png" /></p>


In [14]:
# This bit to generate the front page markdown

import os
import re

def filter_reports_by_year_month(directory, year, month):
    month_files = []
    for file in os.listdir(directory):
        if file.endswith(".md") and re.search(f"{year}_{month:02d}", file):  # Add the underscore and zero-padding for the month
            month_files.append(file)
    return month_files

def generate_year_month_collapsible(year, report_directory):
    year_collapsible = f"<details> <summary> {year} </summary> \n"
    year_report = f"{year}.md"
    if os.path.isfile(os.path.join(report_directory, year_report)):
        year_collapsible += md(f"[{year_report}](./{year_report})") + "\n"

    for month in range(1, 13):
        month_files = filter_reports_by_year_month(report_directory, year, month)
        if month_files:
            file_links = "\n".join([md(f"[{file}](./{file})") for file in month_files])
            month_collapsible = md(f"<details> <summary> {month} </summary> \n <br/> {file_links} \n </details>")
            year_collapsible += month_collapsible

    year_collapsible += "\n </details>"
    return md(year_collapsible)

from markdown import markdown as md
from tabulate import tabulate


#percentiles = daily_report(data = "GC Net", date = "2019-01-13", measurement= "All", scope='Relative', aws="All", window = 'week', output = "Data")
print(os.getcwd())
def generate_markdown(date='2019-01-13', window = 'week', filename='../README.md', df=None):
    output = []
    def add_element(element):
        nonlocal output
        if isinstance(element, list):
            for i in element:
                output.append(i)
        else:
            output.append(element)
        #percentiles['Percentile'] = percentiles['Percentile'].div(100).map('{:.1%}'.format)

    def make_report(elements, filename):
        with open(filename, 'w') as f:
            for element in elements:
                f.write(element)
                f.write('\n\n')
    
    if isinstance(df,pd.DataFrame):
        percentiles = df
    else:
        percentiles = daily_report(data = "GC Net", date = date, measurement= "All", scope='Relative', aws="All", window = window, output = "Data")
    percentiles_out = (percentiles
                       #.assign(Percentile=(percentiles['Percentile']
                       #                    .div(100)
                       #                    .map('{:.1%}'.format)))
                       .to_markdown()
    )

    linked_report_path = "output_report_draft2.md"
    linked_report = md(f"[View Output Report Draft 2]({linked_report_path})")

    header = md('## Extreme Weather Events Detection')
    line = md('-------------------------------------------------------------------------------------------------')
    body1 = md('### \n This page displays an overview of the yearly and monthly reports that are generated for climatology reporting. \n \n The climatology values are calculated as the values of the respective time period in comparison to all other values within the same time period. \n \n')
    body2 = md('### You find the yearly and monthly reports under the following links.')    
    
    
    add_element([header])
    add_element([line, body1, body2])

    report_directory = "../markdown_reports"
    years = filter_reports_by_year(report_directory)
    for year in years:
        year_collapsible = generate_year_month_collapsible(year, report_directory)
        if "<summary>" in year_collapsible:  # Check if there is any month with reports for the year
            add_element(year_collapsible)


    
    picture_folder = ()
    row_out = percentiles.iloc[0,:].to_markdown()
    figure_out = md('![Boxplot](../figures/to_markdown/Boxplot.png)')#.format(date=date,aws='relative',measurement='relative_humidity_2_cor_file'))
    #print(figure_out)
    #add_element([row_out,figure_out])
    #GEUS-Master-Thesis\figures\to_markdown\box_plot_2019-01-13_relative_humidity_2_cor_file.png
    body2 = md('### \n You can run a custom climatology report by running the code in the following link: \n')
    linked_code_path = "../scripts/Climatology.ipynb"
    linked_code = md(f"[Custom Climatology Report]({linked_code_path})")
    add_element(body2)
    add_element(linked_code)

    make_report(output, filename)
generate_markdown()


c:\Users\mabj16ac\Desktop\Thesis\GEUS-Master-Thesis\scripts



Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in 

Unnamed: 0.1,file,Unnamed: 0,shortwave_incoming_radiation,shortwave_outgoing_radiation,net_radiation,air_temperature_1,air_temperature_1_max,air_temperature_1_min,air_temperature_cs500_air1,air_temperature_cs500_air2,...,snow_temperature_8,snow_temperature_9,snow_temperature_10,incoming_uv_radiation,incoming_longwave_radiation,surface_temperature_1,surface_temperature_2,net_radiation_maximum,year,target_date
0,Aurora,29520.428571,4.381429,,-8.035714,-21.310000,,,-24.174286,,...,,,,,,,,,2001,False
1,CP2,11060.571429,0.335714,0.262857,-3.654286,-24.995714,,,-24.817143,-24.924286,...,,,,,,,,,1998,False
2,CP2,16520.000000,0.605714,0.416667,-2.337143,-35.782857,,,,,...,,,,,,,,,1999,False
3,CP2,22656.000000,0.617143,0.235714,-0.785714,-23.361429,,,-22.954286,,...,,,,,,,,,2000,False
4,CP2,29522.000000,0.297143,0.235714,-0.427143,-24.524286,,,-23.541429,-23.527143,...,,,,,,,,,2001,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,Tunu-N,141268.714286,0.075714,0.028571,6.641429,-37.485714,-32.235714,-42.037143,,,...,,,,,,,,,2018,False
443,Tunu-N,147455.857143,0.127143,0.028571,3.446667,-42.122857,-38.255714,-45.430000,,,...,,,,,,,,,2019,True
444,Tunu-N,153166.714286,0.180000,0.032857,77.252857,-35.005714,-30.904286,-40.005714,,,...,,,,,,,,,2020,False
445,Tunu-N,158186.142857,0.034286,0.030000,10.154286,-31.845714,-27.317143,-36.311429,-30.735000,-31.322857,...,,,,,,,,,2021,False


Calculating Percentiles .... 
Finished Calculating Percentiles
Transforming Output...
Finished
----------------------------------------------------------------------------------------------------------------------




NameError: name 'filter_reports_by_year' is not defined

In [11]:
def comparison_visualization(station_in = 'Swiss Camp',measurement='relative_humidity_2_cor', window='month', date='2021-12-31', fig_title=f'../figures/Climatology/test_fig.png'):
    df = _load_data('GC_Net')
    station, datetime, dayofcentury, dayofyear = _align_GC_PR()
    day, df,columns, day_century, unique_files = _subset_df(date=date, df=df, station=station, datetime=datetime, measurement=measurement, dayofcentury=dayofcentury, dayofyear=dayofcentury, aws='All')
    df = df.loc[df['file'] == station_in ]
    df = df[[measurement,'Datetime']].set_index('Datetime')
    display(df)

    if window == 'week':
        df_grouped = df.groupby(pd.Grouper( freq='W'))          
        df_mean = df_grouped.mean()
        df = df_mean.dropna()

    elif window == 'sliding_avg_7':
        df['Datetime'] = df.index
        df['date_diff'] = (df['Datetime'].max() - df['Datetime']).dt.days
        df['week_group'] = (df['date_diff'] / 7).apply(lambda x: int(x) if x.is_integer() else int(x) + 1)
        df_grouped = df.groupby('week_group')[measurement]
        weekly_mean = df_grouped.mean()
        weekly_mean = weekly_mean.reset_index()
        weekly_mean['last_date'] = weekly_mean['week_group'].apply(lambda x: df['Datetime'].max() - pd.to_timedelta(x * 7, unit='D'))
        weekly_mean.set_index('last_date', inplace=True)
        weekly_mean.drop(columns=['week_group'], inplace=True)
        df = weekly_mean.dropna()
        
    elif window == 'month':
        df_grouped = df.groupby(pd.Grouper(freq='M'))
        df_mean = df_grouped.mean()
        df = df_mean.dropna()
        

    elif window == 'sliding_avg_30':
        df['Datetime'] = df.index
        df['date_diff'] = (df['Datetime'].max() - df['Datetime']).dt.days
        df['week_group'] = (df['date_diff'] / 30).apply(lambda x: int(x) if x.is_integer() else int(x) + 1)
        df_grouped = df.groupby('week_group')[measurement]
        monthly_mean = df_grouped.mean()
        monthly_mean = monthly_mean.reset_index()
        monthly_mean['last_date'] = monthly_mean['week_group'].apply(lambda x: df['Datetime'].max() - pd.to_timedelta(x * 30, unit='D'))
        monthly_mean.set_index('last_date', inplace=True)
        monthly_mean.drop(columns=['week_group'], inplace=True)
        df = monthly_mean.dropna()

    elif window == 'year':
        df_grouped = df.groupby(pd.Grouper(freq='M'))           
        df_mean = df_grouped.mean()
        df = df_mean.dropna()
    else:
        raise ValueError(f"The input for the variable 'window' was not recognizable.")

    selected_year = date[:4] 
    # Extract unique years from the index
    years = list(df.index.year.unique().sort_values(ascending=False))
    print(years)
    print(selected_year)
    years.remove(int(selected_year))
    
    data = []

    # Calculate the grayscale step based on the number of years
    grayscale_step = 1.0 / (len(years) - 1)

    # Loop through the unique years and plot the development of the value over the year
    for i, year in enumerate(years):
        df_year = df[df.index.year == year]

        # Reset the year part of the datetime index to a constant year (e.g., 2000)
        df_year.index = df_year.index.map(lambda x: x.replace(year=2000))

        color = f'rgba({int(max(0, grayscale_step * (i - 1) * 255))}, {int(max(0, grayscale_step * (i - 1) * 255))}, {int(max(0, grayscale_step * (i - 1) * 255))}, 1)'
        linewidth = 1
        
        data.append(go.Scatter(x=df_year.index, y=df_year[measurement], name=f'{year}', line=dict(color=color, width=linewidth), mode='lines', legendgroup='custom', showlegend=False))

    df_year = df[df.index.year == int(selected_year)]

    # Reset the year part of the datetime index to a constant year (e.g., 2000)
    df_year.index = df_year.index.map(lambda x: x.replace(year=2000))
    color = 'red'
    linewidth = 2.5
    data.append(go.Scatter(x=df_year.index, y=df_year[measurement], name=f'{selected_year}', line=dict(color=color, width=linewidth), mode='lines', legendgroup='custom', showlegend=False))

    # Create a plot
    fig = go.Figure(data=data)

    # Set title, labels
    fig.update_layout(
        title= {
            'text':f'Development of {measurement} by year for {station}',
            'font':{'size':24}
        },
        xaxis_title='Month',
        yaxis_title=measurement,
        xaxis=dict(
            tickmode='array',
            tickvals=pd.date_range('2000-01-01', '2000-12-31', freq='M'),
            ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        ),
        width=1400,
        height=400,
        legend=dict(title='Legend', orientation='v', yanchor='top', xanchor='left', y=1, x=1.06)
        )
    

    # Custom legend
    latest_date = df.index.max().strftime('%d-%m')
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{selected_year}', line=dict(color='red'), legendgroup='custom'))
    if selected_year == df.index.year.max():
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[1]}', line=dict(color='black'), legendgroup='custom'))
    else:
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[0]}', line=dict(color='black'), legendgroup='custom')) 
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name='...', line=dict(color='grey'), legendgroup='custom'))
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[-1]}', line=dict(color='lightgrey'), legendgroup='custom'))
    
    # Add a vertical line indicating the latest date in the chart
    if window != 'year':
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{latest_date}', line=dict(color='black', dash='dash'), legendgroup='custom'))
        fig.add_shape(type='line', x0=date, x1=date, y0=0, y1=1, yref='paper',
                    line=dict(color='black', dash='dash'))
    fig.write_image(fig_title)
    # Display the plot
    fig.show()

comparison_visualization()

def gen_boxplot(date="2019-01-13", measurement="relative_humidity_1",station="Swiss Camp 10m", window='month', fig_title=f'../figures/Climatology/test_fig.png'):
    df = get_data(data = "GC Net", date = date, measurement= measurement, scope='Relative', output = "Data", aws=station, window=window)

    row_index = int(df[df['year'] == int(date[:4])].index[0])
    display(df.iloc[row_index])
    #print(row_index)
    percentile_dict = {}
    #print(col)
    value = df[measurement][row_index]
    print(value)
    #display(df[col].rank(pct=True)[df[col] == value].iloc[0])
    #print(df[col].rank(pct=True))
    percentile = df[measurement].rank(pct=True)[df[measurement] == value].iloc[0]*100 if not math.isnan(value) else math.nan
    percentile_dict[measurement] = percentile

    x = pd.DataFrame(percentile_dict, index=[0])

    # Create a subplot for each column 
    fig = px.box(df[measurement], orientation = "v",boxmode='group')
    # Format the axes
    fig.update_layout(title_text= f"Boxplot for {measurement}", xaxis_title='', yaxis_title='')
    # Highlight the values from x
    fig.add_scatter(x=[measurement], y=[value], name= f"Selected Value \n{measurement}",
    mode = 'markers',
    marker_symbol = 'circle-dot',
    marker_size = 8,
    marker_color = 'red')
    
    fig.write_image(fig_title)
    # Display the plot
    #fig.show()

t = gen_boxplot()



Unnamed: 0_level_0,relative_humidity_2_cor
Datetime,Unnamed: 1_level_1
1995-01-01,
1995-01-02,
1995-01-03,
1995-01-04,
1995-01-05,
...,...
2022-07-30,107.49
2022-07-31,98.40
2022-08-01,88.60
2022-08-02,104.88


[2022, 2021, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995]
2021




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



file                   Swiss Camp 10m
relative_humidity_1         77.179355
year                             2019
Name: 28, dtype: object

77.17935483870967


In [24]:
data = _get_data('promice')
display(df.columns)
columns = sorted(list(df.columns))
for i in range(len(columns)):
               print(columns[i])

Index(['stid', 'date', 'Air pressure (upper boom)',
       'Air temperature (upper boom)', 'Relative humidity (upper boom)',
       'Relative humidity (upper boom) - corrected',
       'Specific humidity (upper boom)', 'Wind speed (upper boom)',
       'Wind from direction (upper boom)', 'Downwelling shortwave radiation',
       'Downwelling shortwave radiation - corrected',
       'Upwelling shortwave radiation',
       'Upwelling shortwave radiation - corrected', 'Albedo',
       'Downwelling longwave radiation', 'Upwelling longwave radiation',
       'Cloud cover', 'Surface temperature', 'Latent heat flux (upper boom)',
       'Sensible heat flux (upper boom)', 'Upper boom height', 'Stake height',
       'Depth of pressure transducer in ice',
       'Depth of pressure transducer in ice - corrected',
       'Precipitation (upper boom) (cumulative solid & liquid)',
       'Precipitation (upper boom) (cumulative solid & liquid) – corrected',
       'Ice temperature at sensor 1', 'Ice t

Air pressure (lower boom)
Air pressure (upper boom)
Air temperature (lower boom)
Air temperature (upper boom)
Albedo
Altitude
Battery voltage
Battery voltage (sample start)
Cloud cover
Datetime
DayOfCentury
DayOfYear
Depth of ice temperature measurement 1
Depth of ice temperature measurement 10
Depth of ice temperature measurement 11
Depth of ice temperature measurement 2
Depth of ice temperature measurement 3
Depth of ice temperature measurement 4
Depth of ice temperature measurement 5
Depth of ice temperature measurement 6
Depth of ice temperature measurement 7
Depth of ice temperature measurement 8
Depth of ice temperature measurement 9
Depth of pressure transducer in ice
Depth of pressure transducer in ice - corrected
Downwelling longwave radiation
Downwelling shortwave radiation
Downwelling shortwave radiation - corrected
Fan current (lower boom)
Fan current (upper boom)
Frequency of vibrating wire in precipitation gauge
GPS horizontal dillution of precision (HDOP)
GPS number of s

In [44]:
def comparison_visualization(station_in = 'SCO_L',measurement='Depth of pressure transducer in ice - corrected', window='year', date='2019-07-1', fig_title=f'../figures/Climatology/test_fig.png'):
    df = _load_data('GC_net')
    df['Albedo'] == df['Albedo'].bfill()
    station, datetime, dayofcentury, dayofyear = _align_GC_PR()
    day, df,columns, day_century, unique_files = _subset_df(date=date, df=df, station=station, datetime=datetime, measurement=measurement, dayofcentury=dayofcentury, dayofyear=dayofcentury, aws='All')
    df = df.loc[df['stid'] == station_in ]
    df = df[[measurement,'Datetime']].set_index('Datetime')

    display(df)

    if window == 'week':
        df_grouped = df.groupby(pd.Grouper( freq='W'))          
        df_mean = df_grouped.mean()
        df = df_mean.dropna()

    elif window == 'sliding_avg_7':
        df['Datetime'] = df.index
        df['date_diff'] = (df['Datetime'].max() - df['Datetime']).dt.days
        df['week_group'] = (df['date_diff'] / 7).apply(lambda x: int(x) if x.is_integer() else int(x) + 1)
        df_grouped = df.groupby('week_group')[measurement]
        weekly_mean = df_grouped.mean()
        weekly_mean = weekly_mean.reset_index()
        weekly_mean['last_date'] = weekly_mean['week_group'].apply(lambda x: df['Datetime'].max() - pd.to_timedelta(x * 7, unit='D'))
        weekly_mean.set_index('last_date', inplace=True)
        weekly_mean.drop(columns=['week_group'], inplace=True)
        df = weekly_mean.dropna()
        
    elif window == 'month':
        df_grouped = df.groupby(pd.Grouper(freq='M'))
        df_mean = df_grouped.mean()
        df = df_mean.dropna()
        

    elif window == 'sliding_avg_30':
        df['Datetime'] = df.index
        df['date_diff'] = (df['Datetime'].max() - df['Datetime']).dt.days
        df['week_group'] = (df['date_diff'] / 30).apply(lambda x: int(x) if x.is_integer() else int(x) + 1)
        df_grouped = df.groupby('week_group')[measurement]
        monthly_mean = df_grouped.mean()
        monthly_mean = monthly_mean.reset_index()
        monthly_mean['last_date'] = monthly_mean['week_group'].apply(lambda x: df['Datetime'].max() - pd.to_timedelta(x * 30, unit='D'))
        monthly_mean.set_index('last_date', inplace=True)
        monthly_mean.drop(columns=['week_group'], inplace=True)
        df = monthly_mean.dropna()

    elif window == 'year':
        df = df.loc[df.index <= date]
        df_grouped = df.groupby(pd.Grouper(freq='M'))           
        df_mean = df_grouped.mean()
        df = df_mean.dropna()
    else:
        raise ValueError(f"The input for the variable 'window' was not recognizable.")

    selected_year = date[:4] 
    # Extract unique years from the index
    years = list(df.index.year.unique().sort_values(ascending=False))
    print(years)
    print(selected_year)
    years.remove(int(selected_year))
    
    data = []

    # Calculate the grayscale step based on the number of years
    grayscale_step = 1.0 / (len(years) - 1)

    # Loop through the unique years and plot the development of the value over the year
    for i, year in enumerate(years):
        df_year = df[df.index.year == year]

        # Reset the year part of the datetime index to a constant year (e.g., 2000)
        df_year.index = df_year.index.map(lambda x: x.replace(year=2000))

        color = f'rgba({int(max(0, grayscale_step * (i - 1) * 255))}, {int(max(0, grayscale_step * (i - 1) * 255))}, {int(max(0, grayscale_step * (i - 1) * 255))}, 1)'
        linewidth = 1
        
        data.append(go.Scatter(x=df_year.index, y=df_year[measurement], name=f'{year}', line=dict(color=color, width=linewidth), mode='lines', legendgroup='custom', showlegend=False))

    df_year = df[df.index.year == int(selected_year)]

    # Reset the year part of the datetime index to a constant year (e.g., 2000)
    df_year.index = df_year.index.map(lambda x: x.replace(year=2000))
    color = 'red'
    linewidth = 2.5
    data.append(go.Scatter(x=df_year.index, y=df_year[measurement], name=f'{selected_year}', line=dict(color=color, width=linewidth), mode='lines', legendgroup='custom', showlegend=False))

    # Create a plot
    fig = go.Figure(data=data)

    # Set title, labels
    fig.update_layout(
        title= {
            'text':f'Development of {measurement} by year for {station}',
            'font':{'size':24}
        },
        xaxis_title='Month',
        yaxis_title=measurement,
        xaxis=dict(
            tickmode='array',
            tickvals=pd.date_range('2000-01-01', '2000-12-31', freq='M'),
            ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        ),
        width=1400,
        height=400,
        legend=dict(title='Legend', orientation='v', yanchor='top', xanchor='left', y=1, x=1.06)
        )
    

    # Custom legend
    latest_date = df.index.max().strftime('%d-%m')
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{selected_year}', line=dict(color='red'), legendgroup='custom'))
    if selected_year == df.index.year.max():
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[1]}', line=dict(color='black'), legendgroup='custom'))
    else:
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[0]}', line=dict(color='black'), legendgroup='custom')) 
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name='...', line=dict(color='grey'), legendgroup='custom'))
    fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{years[-1]}', line=dict(color='lightgrey'), legendgroup='custom'))
    
    # Add a vertical line indicating the latest date in the chart
    if window != 'year':
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='lines', name=f'{latest_date}', line=dict(color='black', dash='dash'), legendgroup='custom'))
        fig.add_shape(type='line', x0=date, x1=date, y0=0, y1=1, yref='paper',
                    line=dict(color='black', dash='dash'))
    fig.write_image(fig_title)
    # Display the plot
    fig.show()

comparison_visualization()

Unnamed: 0_level_0,Depth of pressure transducer in ice - corrected
Datetime,Unnamed: 1_level_1
2008-07-22,18.504409
2008-07-23,18.458625
2008-07-24,18.417792
2008-07-25,18.375333
2008-07-26,18.330542
...,...
2023-01-29,-26.481000
2023-01-30,-26.386000
2023-01-31,-26.441000
2023-02-01,-26.455000


[2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008]
2019


***Test Suite***

In [396]:
# Excluding NAN's for the calculation
col_list =  x.iloc[:,2].dropna().values.tolist()

x['Percentile'] = x.apply(lambda row : stats.percentileofscore(col_list, row[2], kind = "mean"), axis=1)


In [397]:
x

Unnamed: 0,file,relative_humidity_1,year,Percentile
0,Swiss Camp 10m,,1991,1.724138
1,Swiss Camp 10m,,1992,5.172414
2,Swiss Camp 10m,,1993,8.62069
3,Swiss Camp 10m,,1994,12.068966
4,Swiss Camp 10m,,1995,15.517241
5,Swiss Camp 10m,,1996,18.965517
6,Swiss Camp 10m,,1997,22.413793
7,Swiss Camp 10m,,1998,25.862069
8,Swiss Camp 10m,66.55,1999,29.310345
9,Swiss Camp 10m,89.52,2000,32.758621


In [398]:
# Using plotly.express
import plotly.express as px

#fig = px.line(x, x='Datetime', y="relative_humidity_1")
#fig.show()
fig = go.Figure([go.Scatter(x=x['year'], y=x['relative_humidity_1'])])
fig.show()

In [406]:
def report(data, date, station, variable, scope):
    """
    Function to return the percentile of specified values given a selected date.
    
    Parameters:
    df (pd.DataFrame): The dataframe to be used.
    date (str): The date of the observations for which the percentile is calculated.
    
    Returns:
    df (pd.DataFrame): A dataframe with all numerical columns and the percentiles of the values of the selected date.
    """
  
    # Load Data
    df = _load_data(data)
    
    # Split date input into year, month, day
    year = int(date[0:4])
    month = int(date[5:7])
    day = int(date[8:10])
  
    # subset df with date and find day of year and day variable
    date_df = df.loc[(df['Datetime'].dt.year == year) & (df['Datetime'].dt.month == month) & (df['Datetime'].dt.day == day)]
      # select day of year
    day =  date_df["DayOfYear"].mean()
    datetime = date_df["Datetime"].max()
  
    # select the specific day of century and the related values
    day_century = date_df["DayOfCentury"].mean() 
    day_century_value = pd.DataFrame(pd.DataFrame.mean(date_df)).reset_index()
    day_century_value = pd.pivot_table(day_century_value, index=None, columns=['index'], aggfunc=max)
  
    if scope == "relative":
        # group by calender day
        df = df.loc[df['DayOfYear'] == day]
    elif scope == "absolute":
        df = df
    else: 
        raise ValueError("Only 'relative' & 'absolute' are accepted input values")
  
    # Find the index with the specified date
    row_index = int(df[df['Datetime'] == date].index[0])
    
    # Remove columns that do not contain numerical values & Subset df based on measure selection
    if variable == "All":
        df = df.select_dtypes(include=['int', 'float']).copy()
    else:
        df = pd.DataFrame({variable: df[variable]})
        df = df.select_dtypes(include=['int', 'float']).copy()
    
    # Create an empty dictionary for the output
    percentile_dict = {}
    
    # Iterate through the columns
    for col in df.columns:
        # Find the percentile of the value in the specified row and date
        value = df[col][row_index]
        percentile = df[col].rank(pct=True)[df[col] == value].iloc[0]*100 if not math.isnan(value) else math.nan
    
        # Add the percentile to the dictionary
        percentile_dict[col] = percentile
    print('displaying percentile dict')
    print(percentile_dict)
    # Create a dataframe with the output
    x = pd.DataFrame(percentile_dict, index=[0])
    print('displaying percentile df')
    import prettytable as pt
  
    table = pt.PrettyTable()
    table.field_names = ["Measurement", "Percentile"]
  
    for col in x.columns:
        if x[col].values > 90 or x[col].values < 10:
            table.add_row([f"\033[1m{col}\033[0m", f"\033[1m{int(x[col].values) if not np.isnan(x[col].values) else 'NaN'}%\033[0m"])
        else:
            table.add_row([col, f"{int(x[col].values) if not np.isnan(x[col].values) else 'NaN'}%"])
  
    print(
    f" Date: {date} \n" ,
    f"Station: {station} \n" , 
    f"Measurement: {variable} \n"  , 
    f"Data: {data} \n" ,
    "----------------------------------------------------------------------------------------------------------------------\n",
    f"                   Climatology Report\n" 
    )
    print(table)
    print("----------------------------------------------------------------------------------------------------------------------\n")
    
    import plotly.express as px
    # Create a list of columns to be plotted
    x = day_century_value
    columns_to_plot = [col for col in x.columns.values if col in df.columns.values]
  
    # Create a list of values from x to be highlighted
    values_to_highlight = x[columns_to_plot].values.flatten().tolist()
  
    # Create a list of subplots
    figs = []
    # Loop through list of columns
    for col, v in zip(columns_to_plot, values_to_highlight) : 
        # Create a subplot for each column 
        fig = px.box(df[col], orientation = "v",boxmode='group')
        # Format the axes
        fig.update_layout(title_text= f"Boxplot for {col}", xaxis_title='', yaxis_title='')
        # Highlight the values from x
        fig.add_scatter(x=[col], y=[v], name= f"Selected Value \n{col}",
    mode = 'markers',
    marker_symbol = 'circle-dot',
    marker_size = 8,
    marker_color = 'red')
        
        # Add figure to list of subplots 
        figs.append(fig)
    
    # Output
    for fig in figs:
        fig.show()

In [405]:
report("Promice", "2022-01-12", "THU_L_day_v03", variable = "AirPressure(hPa)", scope = "relative")

displaying percentile dict
{'AirPressure(hPa)': 61.940298507462686}
displaying percentile df
 Date: 2022-01-12 
 Station: THU_L_day_v03 
 Measurement: AirPressure(hPa) 
 Data: Promice 
 ----------------------------------------------------------------------------------------------------------------------
                    Climatology Report

+------------------+------------+
|   Measurement    | Percentile |
+------------------+------------+
| AirPressure(hPa) |    61%     |
+------------------+------------+
----------------------------------------------------------------------------------------------------------------------



GC Net Data

In [None]:
gc = pd.read_parquet('data\df_daily.gzip', engine='pyarrow')

In [None]:
gc.columns

*Test on one station*

In [None]:
gc = gc[gc['station_name'] == "Humboldt"] 

Promice Data

In [None]:
pc = pd.read_parquet('data\promice_hourly.gzip', engine='pyarrow')

In [None]:
pc

*Test Suite*

In [None]:
# define variables

# *Mandatory: Data
data = "Promice"

# *Mandatory: Date
date = "22-07-2008"

# Optional: Station
#station = "SCO_L_hour_v03"
station = "THU_L_day_v03"

# Optional: Measure
y = "All"

In [None]:
# subset dataframe 
pc = pc[pc['file'] == station] 

In [None]:
# select dato 
year= 2022
month = 1
day = 12

# subset df with date and find day of year and day variable
date_df = pc.loc[(pc['Datetime'].dt.year == year) & (pc['Datetime'].dt.month == month) & (pc['Datetime'].dt.day == day)]
  # select day of year
day =  date_df["DayOfYear"].mean()
datetime = date_df["Datetime"].max()

# select the specific day of century and the related values
day_century = date_df["DayOfCentury"].mean() 
day_century_value = pd.DataFrame(pd.DataFrame.mean(date_df)).reset_index()
day_century_value = pd.pivot_table(day_century_value, index=None, columns=['index'], aggfunc=max)

# group by calender day
pc_group = pc.loc[pc['DayOfYear'] == day]

In [None]:
def get_percentile(df, date):
  """
  Function to return the percentile of specified values given a selected date.
  
  Parameters:
  df (pd.DataFrame): The dataframe to be used.
  date (str): The date of the observations for which the percentile is calculated.
  
  Returns:
  df (pd.DataFrame): A dataframe with all numerical columns and the percentiles of the values of the selected date.
  """

  # Find the index with the specified date
  row_index = int(df[df['Datetime'] == date].index[0])
  
  # Remove columns that do not contain numerical values & Subset df based on measure selection
  if y == "All":
    df = df.select_dtypes(include=['int', 'float']).copy()
  else:
    df = pd.DataFrame(df[y])
    df = df.select_dtypes(include=['int', 'float']).copy()
  
  # Create an empty dictionary for the output
  percentile_dict = {}
  
  # Iterate through the columns
  for col in df.columns:
    # Find the percentile of the value in the specified row and date
    value = df[col][row_index]
    if(math.isnan(value)): 
      percentile = math.nan
    else: 
      # calculate the percentile with df.rank()
      percentile = df[col].rank(pct=True)[df[col] == value].iloc[0]*100
    
    # Add the percentile to the dictionary
    #percentile_dict[col + "_percentile"] = percentile
    percentile_dict[col] = percentile
  
  
  # Create a dataframe with the output
  df_percentiles = pd.DataFrame(percentile_dict, index=[0])
  return df_percentiles

In [None]:
x = get_percentile(pc, "2022-01-12")

In [None]:
#x = x.append(day_century_value.iloc[0], ignore_index=True)

In [None]:
x = get_percentile(pc_group, "2022-01-12")

In [399]:
import prettytable as pt

table = pt.PrettyTable()
table.field_names = ["Measurement", "Percentile"]

for col in x.columns:
    if x[col].values > 90 or x[col].values < 10:
        table.add_row([f"\033[1m{col}\033[0m", f"\033[1m{int(x[col].values) if not np.isnan(x[col].values) else 'NaN'}% \033[0m"])
    else:
        table.add_row([col, f"{int(x[col].values) if not np.isnan(x[col].values) else 'NaN'}%"])


print(
f" Date: {date} \n" ,
f"Station: {station} \n" , 
f"Measurment: {y} \n"  , 
f"Data: {data} \n" ,
"-----------------------------------------------------------\n",
f"                   Climatology \n" 
)

print(table)

ModuleNotFoundError: No module named 'prettytable'

*Visual test suite*

In [None]:
############################ Boxplots #########################################
ff = pc 
x = ff.loc[ff['Datetime'] == "2022-01-12"]

import plotly.express as px

# Create a list of columns to be plotted
columns_to_plot = [col for col in x.columns.values if col in ff.columns.values]

# Create a list of values from x to be highlighted
values_to_highlight = x[columns_to_plot].values.flatten().tolist()

# Create a list of subplots
figs = []
# Loop through list of columns
for col, v in zip(columns_to_plot,values_to_highlight) : 
  # Create a subplot for each column 
  fig = px.box(ff[col], orientation = "v",boxmode='group')
  # Format the axes
  fig.update_layout(title_text= f"Boxplot for {col}", xaxis_title='', yaxis_title='')
  # Highlight the values from x
  fig.add_scatter(x=[col], y=[v], name= f"Selected Value \n{col}",
                        mode = 'markers',
                        marker_symbol = 'circle-dot',
                        marker_size = 8,
                        marker_color = 'red')
  
  # Add figure to list of subplots 
  figs.append(fig)

# Show the plots
for fig in figs:
  fig.show()

In [None]:
# A function that calculates the percentiles of every column and their values

def percentile_df(df):
    for col in (df.columns):
        df[f'{col}_pcta'] = df[col].rank(pct=True)
        #df[f'{col}_pcta'] = df[col].rank(pct=True)[df[col] == value] *100

    return df

In [None]:
gg = percentile_df(pc)
gg = gg[["Datetime", "AirTemperature(C)", "AirTemperature(C)_pcta"]]
y = "AirTemperature(C)"
y_pcta = "AirTemperature(C)_pcta"

In [None]:
fig = go.Figure([
    go.Scatter(
        name='Air Pressure (hPa)',
        x=gg['Datetime'],
        y=gg[y],
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    ),
    go.Scatter(
        name='Upper Bound (20-80)',
        x=gg['Datetime'],
        y=(gg[y] * (gg[y_pcta])),
        mode='lines',
        marker=dict(color="#00BB00"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound (20-80)',
        x=gg['Datetime'],
        y=(gg[y] * (gg[y_pcta])),
        marker=dict(color="#00BB00"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(0, 187, 0, 0.3)',
        fill='tonexty',
        showlegend=False
    ),
    go.Scatter(
        name='Upper Bound (0-20 & 80-100)',
        x=gg['Datetime'],
        y=(gg[y] * (gg[y_pcta]  )),
        mode='lines',
        marker=dict(color="#BB0000"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound (0-20 & 80-100)',
        x=gg['Datetime'],
        y=(gg[y] * (gg[y_pcta] )),
        marker=dict(color="#BB0000"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(187, 0, 0, 0.3)',
        fill='tonexty',
        showlegend=False
    )
])
fig.update_layout(
    yaxis_title='Air Pressure (hPa)',
    title='Continuous, variable value error bars',
    hovermode="x"
)
fig.show()

In [None]:
gg.loc[(gg['Datetime'].dt.month == 12) & (gg['Datetime'].dt.day == 31)]