Import relevant Libraries

In [2]:
# Base libraries
import pandas as pd
import numpy as np
import os
import math as math
import datetime
from scipy import stats

# Visualizations
import plotly.express as px
import plotly.graph_objects as go

Mount Google Drive

In [3]:
# Mount Google Drive at /content/drive; the parquet files read by the cells
# below are addressed through paths under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Date to analyse; the slicing below expects the layout YYYY-MM-DD
date = "2022-01-12"

# Split date input into year, month, day
year = int(date[0:4])
month = int(date[5:7])
day = int(date[8:10])

# Read the observations stored in the promice_daily parquet file
df = pd.read_parquet('/content/drive/MyDrive/Master_Thesis/data/promice_daily.gzip', engine='pyarrow')

# Rows (of every station) recorded on the selected date
date_df = df.loc[(df['Datetime'].dt.year == year) & (df['Datetime'].dt.month == month) & (df['Datetime'].dt.day == day)]

# From here on `day` holds the day of year of the selected date; keep only the
# rows sharing it, so each value is ranked against the same calendar day
day = date_df["DayOfYear"].mean()
df = df.loc[df['DayOfYear'] == day]

# Day of century that identifies the selected date among the remaining rows
day_century = date_df["DayOfCentury"].mean()

# Stations (one per source file) and the numerical columns to rank
unique_files = list(df['file'].unique())
columns = df.select_dtypes(include=[np.number]).columns

# One dictionary per station: the value of every column on the selected date
# plus its percentile (`<column>_pcte`) within the station's history
percentiles = []
for file in unique_files:
    df_file = df[df['file'] == file]

    # Select the rows of the target date once instead of scanning every row
    # again for every column
    selected_rows = df_file[df_file['DayOfCentury'] == day_century]

    percentile_dict = {'file': file}
    for col in columns:
        observed = selected_rows[col].dropna()
        if observed.empty:
            # No usable value on the selected date: the column is left out
            continue
        # If several rows match, the last one wins (as in a row-by-row scan)
        value = observed.iloc[-1]
        # NaNs are excluded from the comparison values
        col_list = df_file[col].dropna().values.tolist()
        percentile_dict[col] = value
        percentile_dict[f"{col}_pcte"] = stats.percentileofscore(col_list, value)

    percentiles.append(percentile_dict)

# Create a dataframe from the list of dictionaries
percentiles_df = pd.DataFrame(percentiles)



**Report Function**

In [45]:
def get_data(data, measurement, aws, date=None, scope="Relative", output="Report"):
    """
    Return (or print) the historical observations of one measurement at one station.

    Parameters:
    data (str): The dataset to be used, either 'GC Net' or 'Promice'.
    measurement (str): Name of the measurement column to extract.
    aws (str): The automatic weather station (value of the 'file' column).
    date (str): The selected date, laid out as 'YYYY-MM-DD'. Defaults to the
                current date, resolved every time the function is called.
    scope (str): Relative: only observations sharing the day of year of `date`.
                 Absolute: all observations of the station.
    output (str): 'Data' returns the subset as pd.DataFrame,
                  'Report' prints it as a table and returns None.

    Returns:
    pd.DataFrame or None: The station, datetime, measurement, day-of-century
    and day-of-year columns of the selected rows (only for output='Data').

    Raises:
    ValueError: If `data`, `scope` or `output` is not an accepted option
                (checked before any file is read) or `date` cannot be parsed.
    """
    import datetime as _dt
    import warnings

    # A default written in the signature would be computed once, when the
    # function is defined, and go stale in a long-running session.
    if date is None:
        date = _dt.datetime.today().strftime('%Y-%m-%d')

    sources = {
        "GC Net": '/content/drive/MyDrive/Master_Thesis/data/df_daily.gzip',
        "Promice": '/content/drive/MyDrive/Master_Thesis/data/promice_daily.gzip',
    }

    # Validate the cheap options first so a typo fails before the data is loaded
    if data not in sources:
        raise ValueError("Only 'GC Net' & 'Promice' are accepted input values")
    if scope not in ("Relative", "Absolute"):
        raise ValueError("The input for the variable 'scope' was not recognizable. Please use one of the following options: 'Relative' , 'Absolute'")
    if output not in ("Report", "Data"):
        raise ValueError("The input for the variable 'output' was not recognizable. Please use one of the following options: 'Report', 'Data'" )

    # Align GC Net & PROMICE Columns
    ############### TBD ##########
    station_col = "file"
    datetime_col = "Datetime"
    dayofcentury_col = "DayOfCentury"
    dayofyear_col = "DayOfYear"

    # Split date input into year, month, day
    year = int(date[0:4])
    month = int(date[5:7])
    day_of_month = int(date[8:10])

    # Silence warnings only while this function works instead of changing the
    # warning filters of the whole session
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        df = pd.read_parquet(sources[data], engine='pyarrow')

        # Rows (of every station) recorded on the selected date
        stamps = df[datetime_col].dt
        date_df = df.loc[(stamps.year == year) & (stamps.month == month) & (stamps.day == day_of_month)]

        if date_df.empty:
            # The date itself is not in the dataset (always true for "today").
            # Fall back to the calendar day of year so the history of that day
            # is still returned.
            # NOTE(review): assumes the DayOfYear column is the calendar day of
            # year, as the sample rows in this notebook suggest - confirm.
            day_of_year = _dt.date(year, month, day_of_month).timetuple().tm_yday
        else:
            day_of_year = date_df[dayofyear_col].mean()

        # subset by measurement, then by station
        df = df[[station_col, datetime_col, measurement, dayofcentury_col, dayofyear_col]]
        df = df.loc[df[station_col] == aws]

        if scope == "Relative":
            # filter by calendar day
            df = df.loc[df[dayofyear_col] == day_of_year]

    ##### Output #######
    if output == "Data":
        return df

    from tabulate import tabulate
    #### Report OUTPUT ###########
    print(
      f"  Selected Date: {date} \n" ,
      f"Selected Station: {aws} \n" ,
      f"Selected Measurement: {measurement} \n"  ,
      f"Selected Data: {data} \n" ,
      f"Selected Scope: {scope} \n" ,
      "----------------------------------------------------------------------------------------------------------------------\n",
      f"                   Climatology Report\n"
      )
    print(tabulate(df, headers='keys', tablefmt='psql'))


    


In [82]:
def _station_percentiles(df_station, columns, century_col, day_century):
    """Return {column: (percentile, n_comparison_values, value)} for one station on the target day."""
    # Rows of the target day, selected once instead of once per column
    target_rows = df_station[df_station[century_col] == day_century]

    result = {}
    for col in columns:
        observed = target_rows[col].dropna()
        if observed.empty:
            # No usable value on the target day: nothing to rank
            continue
        # If several rows match, the last one wins (as in a row-by-row scan)
        value = observed.iloc[-1]
        # NaNs are excluded from the comparison values
        history = df_station[col].dropna().values.tolist()
        result[col] = (stats.percentileofscore(history, value), len(history), value)
    return result


def daily_report(data, date=None, aws="All", measurement="All", scope="Relative",
                 output="Report", aggregated=False):
    """
    Function to return the extreme percentiles (above 90 or below 10) of the
    values observed on a selected date.

    Parameters:
    data (str): The dataset to be used, either 'GC Net' or 'Promice'.
    date (str): The date of the observations for which the percentile is
                calculated, laid out as 'YYYY-MM-DD'. Defaults to the current
                date, resolved every time the function is called.
    aws (str): The automatic weather station, or 'All'.
    measurement (str): The measurement, or 'All'.
    scope (str): Relative: values are compared to historical values of the day of year.
                 Absolute: values are compared to historical values.
    output (str): 'Data' returns the table as pd.DataFrame,
                  'Report' prints it (and asks about graphics) and returns None.
    aggregated (bool): Accepted for future use; True and False currently give
                       the same, non-aggregated result.

    Returns:
    pd.DataFrame or None: For output='Data', one row per station and variable
    with the columns 'index', 'Station', 'Variable', 'Percentile',
    'Number of Comparison Values' and 'Original Values'.

    Raises:
    ValueError: If an argument is not one of the accepted options. `data`,
                `aggregated`, `scope` and `output` are checked before any file
                is read; `measurement` and `aws` need the loaded data.
    """
    import datetime as _dt
    import warnings

    # A default written in the signature would be computed once, when the
    # function is defined, and go stale in a long-running session.
    if date is None:
        date = _dt.datetime.today().strftime('%Y-%m-%d')

    sources = {
        "GC Net": '/content/drive/MyDrive/Master_Thesis/data/df_daily.gzip',
        "Promice": '/content/drive/MyDrive/Master_Thesis/data/promice_daily.gzip',
    }

    # Validate the cheap options first so a typo fails before the data is
    # loaded and the percentiles are computed
    if data not in sources:
        raise ValueError("Only 'GC Net' & 'Promice' are accepted input values")
    if aggregated not in (True, False):
        raise ValueError(f"The input for the variable 'aggregated' was not recognizable. Please use one of the following options: True, False")
    if scope not in ("Relative", "Absolute"):
        raise ValueError("The input for the variable 'scope' was not recognizable. Please use one of the following options: 'Relative' , 'Absolute'")
    if output not in ("Report", "Data"):
        raise ValueError("The input for the variable 'output' was not recognizable. Please use one of the following options: 'Report', 'Data'" )

    # Align GC Net & PROMICE Columns
    ############### TBD ##########
    station_col = "file"
    datetime_col = "Datetime"
    century_col = "DayOfCentury"
    dayofyear_col = "DayOfYear"
    # List of columns to exclude from percentile calculation
    exclude = ['Year', 'MonthOfYear', 'DayOfMonth', 'HourOfDay(UTC)',
               'DayOfYear', 'LongitudeGPS(degW)','HeightStakes(m)',
               'DayOfCentury', 'WindDirection(d)', 'TiltToEast(d)',
               'TiltToNorth(d)', 'TimeGPS(hhmmssUTC)', 'LatitudeGPS(degN)',
               'ElevationGPS(m)', 'HorDilOfPrecGPS', 'LoggerTemperature(C)',
               'FanCurrent(mA)', 'BatteryVoltage(V)', 'Month', 'Day', 'Hour',

               'air_temperature_1_max', 'air_temperature_1_min',
               'wind_speed_u1_max','wind_speed_u2_max',
               'wind_from_direction_1', 'wind_from_direction_2',
               'height_wind_sensor_1', 'height_wind_sensor_2', 'battery_voltage',
               'shortwave_incoming_radiation_max',
               'shortwave_incoming_radiation_stdev', 'net_radiation_stdev',
               'air_temperature_2_max', 'air_temperature_2_min',
               'wind_speed_u2_stdev', 'ref_temperature',   'wind_speed_u1_stdev',
               'net_radiation_maximum', 'season', 'year', 'month', 'DayOfYear',
               'DayOfCentury']

    # Split date input into year, month, day
    year = int(date[0:4])
    month = int(date[5:7])
    day_of_month = int(date[8:10])

    # Silence warnings only while this function works instead of changing the
    # warning filters of the whole session
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        df = pd.read_parquet(sources[data], engine='pyarrow')

        # Rows (of every station) recorded on the selected date, and the day
        # of year / day of century they carry
        stamps = df[datetime_col].dt
        date_df = df.loc[(stamps.year == year) & (stamps.month == month) & (stamps.day == day_of_month)]
        day_of_year = date_df[dayofyear_col].mean()
        day_century = date_df[century_col].mean()

        if scope == "Relative":
            # filter by calendar day
            df = df.loc[df[dayofyear_col] == day_of_year]

        # Numerical columns that take part in the percentile calculation
        columns = df.select_dtypes(include=[np.number]).columns.difference(exclude)

        if measurement != "All":
            if measurement not in columns:
                raise ValueError(f"The input for the variable 'measurement' was not recognizable. Please use one of the following options: {columns}")
            # subset by measurement and update the list of relevant columns
            df = df[[station_col, datetime_col, measurement, century_col]]
            columns = df.select_dtypes(include=[np.number]).columns.difference(exclude)

        # Unique files (stations) of the dataset
        unique_files = list(df[station_col].unique())

        if aws != "All":
            if aws not in unique_files:
                raise ValueError(f"The input for the variable 'aws' was not recognizable. Please use one of the following options: {unique_files}")
            # filter by aws and update the list of stations
            df = df.loc[df[station_col] == aws]
            unique_files = list(df[station_col].unique())

        print("Calculating Percentiles .... ")
        per_station = {}
        # Variables in order of first appearance across stations; a dict is
        # used as an ordered set
        variables = {}
        for file in unique_files:
            station_stats = _station_percentiles(df[df[station_col] == file], columns, century_col, day_century)
            per_station[file] = station_stats
            for col in station_stats:
                variables.setdefault(col, None)
        print("Finished Calculating Percentiles")

        print("Transforming Output...")
        # Long format: one row per station and variable; a variable that is
        # missing for a station gets NaNs and is removed by the filter below.
        # Variables are matched by exact name, so a column whose name merely
        # contains '_n' or '_pcte' cannot be mistaken for a helper column.
        missing = (np.nan, np.nan, np.nan)
        rows = [
            [file, variable, *per_station[file].get(variable, missing)]
            for file in unique_files
            for variable in variables
        ]
        transformed_df = pd.DataFrame(
            rows,
            columns=['Station', 'Variable', 'Percentile', 'Number of Comparison Values', 'Original Values'],
        )

        # Keep the extreme values only; reset_index() keeps the position in
        # the unfiltered table as the 'index' column
        extreme = (transformed_df["Percentile"] > 90) | (transformed_df["Percentile"] < 10)
        transformed_df = transformed_df[extreme].reset_index()

    print("Finished")
    print("----------------------------------------------------------------------------------------------------------------------\n")

    if output == "Data":
        return transformed_df

    from tabulate import tabulate
    #### Report OUTPUT ###########
    print(
      f"  Selected Date: {date} \n" ,
      f"Selected Station: {aws} \n" ,
      f"Selected Measurement: {measurement} \n"  ,
      f"Selected Data: {data} \n" ,
      f"Selected Scope: {scope} \n" ,
      "----------------------------------------------------------------------------------------------------------------------\n",
      f"                   Climatology Report\n"
      )
    print(tabulate(transformed_df, headers='keys', tablefmt='psql'))

    input_ = input("Do you want to include graphics? (Y/N)")
    if input_ == "Y":
        ########## TBD: Replace with boxplot function #################
        print("Functionality is in development")

  


In [83]:
# Example: table of extreme percentiles for the GC Net dataset on 2019-01-12,
# compared with the same day of year (scope='Relative'), returned as DataFrame.
daily_report("GC Net", "2019-01-12", scope='Relative', output = "Data")

Calculating Percentiles .... 
Finished Calculating Percentiles
Transforming Output...
Finished
----------------------------------------------------------------------------------------------------------------------



Unnamed: 0,index,Station,Variable,Percentile,Number of Comparison Values,Original Values
0,9,Swiss Camp 10m,relative_humidity_2,6.25,16.0,56.76
1,40,Swiss Camp,net_radiation,95.0,20.0,31.31
2,60,Swiss Camp,snow_temperature_4,94.117647,17.0,-4.75
3,64,Swiss Camp,snow_temperature_8,93.75,16.0,-4.65
4,65,Swiss Camp,snow_temperature_9,93.75,16.0,-7.21
5,67,Swiss Camp,surface_temperature_2,9.090909,11.0,-31.1
6,273,Tunu-N,air_temperature_1,9.090909,22.0,-51.1
7,274,Tunu-N,air_temperature_2,9.090909,22.0,-51.05
8,287,Tunu-N,snow_depth_1,100.0,15.0,7.82
9,288,Tunu-N,snow_depth_2,100.0,15.0,7.54


In [86]:
# Example: print the air_temperature_1 history of station NEEM (GC Net) for
# the day of year of 2022-01-12.
get_data(data = "GC Net", date = "2022-01-12", measurement= "air_temperature_1", scope='Relative', output = "Report", aws="NEEM")

  Selected Date: 2022-01-12 
 Selected Station: NEEM 
 Selected Measurement: air_temperature_1 
 Selected Data: GC Net 
 Selected Scope: Relative 
 ----------------------------------------------------------------------------------------------------------------------
                    Climatology Report

+--------+--------+---------------------+---------------------+----------------+-------------+
|        | file   | Datetime            |   air_temperature_1 |   DayOfCentury |   DayOfYear |
|--------+--------+---------------------+---------------------+----------------+-------------|
|  68752 | NEEM   | 2007-01-12 00:00:00 |              nan    |         732202 |          12 |
|  75322 | NEEM   | 2008-01-12 00:00:00 |              nan    |         732567 |          12 |
|  81958 | NEEM   | 2009-01-12 00:00:00 |              nan    |         732932 |          12 |
|  89404 | NEEM   | 2010-01-12 00:00:00 |              -24.08 |         733297 |          12 |
|  96823 | NEEM   | 2011-01-

In [None]:
def report(data, date, station, variable, scope):
    """
    Print a percentile report for one station on a selected date and show one
    boxplot per measurement with the selected day's value highlighted.

    Parameters:
    data (str): The dataset to be used, either 'GC Net' or 'Promice'.
    date (str): The date of the observations for which the percentile is
                calculated, laid out as 'YYYY-MM-DD'.
    station (str): The station (value of the 'file' column) to report on.
    variable (str): A single measurement column, or 'All' for every numerical column.
    scope (str): 'relative': values are compared to historical values of the day of year.
                 'absolute': values are compared to all historical values.
                 Matched case-insensitively.

    Returns:
    None. The table is printed and the figures are shown.

    Raises:
    ValueError: If `data` or `scope` is not an accepted option (checked before
                any file is read), if `station` does not occur in the dataset,
                or if the station has no observation for `date`.
    """
    import warnings

    sources = {
        "GC Net": '/content/drive/MyDrive/Master_Thesis/data/df_daily.gzip',
        "Promice": '/content/drive/MyDrive/Master_Thesis/data/promice_daily.gzip',
    }

    # Validate the cheap options first so a typo fails before the data is loaded
    if data not in sources:
        raise ValueError("Only 'GC Net' & 'Promice' are accepted input values")
    # Accept 'Relative'/'Absolute' as well, the spelling the sibling functions use
    scope_key = scope.lower() if isinstance(scope, str) else scope
    if scope_key not in ("relative", "absolute"):
        raise ValueError("Only 'relative' & 'absolute' are accepted input values")

    # Split date input into year, month, day
    year = int(date[0:4])
    month = int(date[5:7])
    day_of_month = int(date[8:10])

    # Silence warnings only while the data is prepared instead of changing the
    # warning filters of the whole session
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        df = pd.read_parquet(sources[data], engine='pyarrow')

        # Restrict to the requested station; otherwise the report header names
        # a station while the numbers come from whichever station happens to
        # hold the first row of the selected date.
        df = df.loc[df['file'] == station]
        if df.empty:
            raise ValueError(f"The station '{station}' was not found in the '{data}' dataset")

        # Rows of the selected date, their day of year, and the mean of every
        # numerical column on that date (the values highlighted in the plots).
        # Columns whose mean is NaN are dropped and the rest sorted by name.
        stamps = df['Datetime'].dt
        date_df = df.loc[(stamps.year == year) & (stamps.month == month) & (stamps.day == day_of_month)]
        day_of_year = date_df["DayOfYear"].mean()
        selected_means = date_df.mean(numeric_only=True).dropna().sort_index()

        if scope_key == "relative":
            # keep the selected calendar day only
            df = df.loc[df['DayOfYear'] == day_of_year]

        # Index label of the row holding the selected date
        matches = df[df['Datetime'] == date]
        if matches.empty:
            raise ValueError(f"No observation found for station '{station}' on {date}")
        row_index = matches.index[0]

        # Subset by measure selection and keep numerical columns only
        if variable != "All":
            df = pd.DataFrame({variable: df[variable]})
        df = df.select_dtypes(include=['int', 'float']).copy()

        # Percentile (0-100) of the selected day's value within each column
        percentile_dict = {}
        for col in df.columns:
            value = df.loc[row_index, col]
            if math.isnan(value):
                percentile_dict[col] = math.nan
            else:
                percentile_dict[col] = df[col].rank(pct=True)[df[col] == value].iloc[0] * 100

    import prettytable as pt

    table = pt.PrettyTable()
    table.field_names = ["Measurement", "Percentile"]

    for col, pct in percentile_dict.items():
        shown = 'NaN' if math.isnan(pct) else int(pct)
        if pct > 90 or pct < 10:
            # Extreme values are printed in bold (ANSI escape codes)
            table.add_row([f"\033[1m{col}\033[0m", f"\033[1m{shown}%\033[0m"])
        else:
            table.add_row([col, f"{shown}%"])

    print(
    f" Date: {date} \n" ,
    f"Station: {station} \n" ,
    f"Measurement: {variable} \n"  ,
    f"Data: {data} \n" ,
    "----------------------------------------------------------------------------------------------------------------------\n",
    f"                   Climatology Report\n"
    )
    print(table)
    print("----------------------------------------------------------------------------------------------------------------------\n")

    import plotly.express as px

    # Plot only the measurements that are part of the report
    columns_to_plot = [col for col in selected_means.index if col in df.columns]

    figs = []
    for col in columns_to_plot:
        # Distribution of the column ...
        fig = px.box(df[col], orientation = "v",boxmode='group')
        fig.update_layout(title_text= f"Boxplot for {col}", xaxis_title='', yaxis_title='')
        # ... with the selected day's value marked in red
        fig.add_scatter(x=[col], y=[selected_means[col]], name= f"Selected Value \n{col}",
                        mode = 'markers',
                        marker_symbol = 'circle-dot',
                        marker_size = 8,
                        marker_color = 'red')
        figs.append(fig)

    # Output
    for fig in figs:
        fig.show()

In [None]:
# Example: percentile report on the Promice dataset for 2022-01-12, compared
# with the same day of year (scope "relative"), for all numerical columns.
report("Promice", "2022-01-12", "THU_L_day_v03", variable = "All", scope = "relative")

GC Net Data

In [79]:
# Load the df_daily parquet file (the one the report functions read for the
# 'GC Net' dataset) into `gc`.
gc = pd.read_parquet('/content/drive/MyDrive/Master_Thesis/data/df_daily.gzip', engine='pyarrow')

In [81]:
# Display the column names of the GC Net frame.
gc.columns

Index(['Datetime', 'file', 'shortwave_incoming_radiation',
       'shortwave_outgoing_radiation', 'net_radiation', 'air_temperature_1',
       'air_temperature_1_max', 'air_temperature_1_min',
       'air_temperature_cs500_air1', 'air_temperature_cs500_air2',
       'relative_humidity_1', 'relative_humidity_2', 'wind_speed_1',
       'wind_speed_u1_max', 'wind_speed_2', 'wind_speed_u2_max',
       'wind_from_direction_1', 'wind_from_direction_2', 'air_pressure',
       'height_wind_sensor_1', 'height_wind_sensor_2', 'battery_voltage',
       'air_temperature_2', 'snow_depth_1', 'snow_depth_2',
       'sensible_heat_flux', 'latent_heat_flux', 'air_temperature_2m',
       'relative_humidity_2m', 'wind_speed_10m', 'solar_zenith_angle',
       'solar_azimuth_angle', 'surface_albedo', 'relative_humidity_1_cor',
       'relative_humidity_2_cor', 'specific_humidity_1', 'specific_humidity_2',
       'shortwave_incoming_radiation_max',
       'shortwave_incoming_radiation_stdev', 'net_radiation

*Test on one station*

In [None]:
# Keep only the rows of the station "Humboldt".
# NOTE(review): the rest of this notebook identifies stations through the
# 'file' column; confirm that 'station_name' exists in this frame.
gc = gc[gc['station_name'] == "Humboldt"] 

Promice Data

In [68]:
# Load the promice_hourly parquet file into `pc`.
pc = pd.read_parquet('/content/drive/MyDrive/Master_Thesis/data/promice_hourly.gzip', engine='pyarrow')

In [69]:
# Display the frame loaded from promice_hourly.
pc

Unnamed: 0,Year,MonthOfYear,DayOfMonth,HourOfDay(UTC),DayOfYear,DayOfCentury,AirPressure(hPa),AirTemperature(C),AirTemperatureHygroClip(C),RelativeHumidity(%),...,ElevationGPS(m),HorDilOfPrecGPS,LoggerTemperature(C),FanCurrent(mA),BatteryVoltage(V),file,Month,Day,Hour,Datetime
0,2008,7,22,0,204,3126,,,,,...,,,,,,SCO_L_hour_v03,7,22,0,2008-07-22 00:00:00
1,2008,7,22,1,204,3126,,,,,...,,,,,,SCO_L_hour_v03,7,22,1,2008-07-22 01:00:00
2,2008,7,22,2,204,3126,946.18,3.03,2.61,78.5,...,,,3.45,98.7,12.75,SCO_L_hour_v03,7,22,2,2008-07-22 02:00:00
3,2008,7,22,3,204,3126,946.44,2.97,2.43,81.5,...,443.1,1.16,2.42,120.9,12.80,SCO_L_hour_v03,7,22,3,2008-07-22 03:00:00
4,2008,7,22,4,204,3126,946.90,2.79,2.34,82.4,...,484.0,0.99,2.47,122.6,12.79,SCO_L_hour_v03,7,22,4,2008-07-22 04:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2489011,2022,1,12,19,12,8048,,,,,...,,,,,,KAN_B_hour_v03,1,12,19,2022-01-12 19:00:00
2489012,2022,1,12,20,12,8048,,,,,...,,,,,,KAN_B_hour_v03,1,12,20,2022-01-12 20:00:00
2489013,2022,1,12,21,12,8048,,,,,...,,,,,,KAN_B_hour_v03,1,12,21,2022-01-12 21:00:00
2489014,2022,1,12,22,12,8048,,,,,...,,,,,,KAN_B_hour_v03,1,12,22,2022-01-12 22:00:00


*Test Suite*

In [None]:
# define variables
# These module-level names are read by the cells below: `station` by the
# subsetting cell, `y` by get_percentile, and all four by the printed report.

# *Mandatory: Data
data = "Promice"

# *Mandatory: Date
# NOTE(review): written as DD-MM-YYYY, while the rest of the notebook uses
# YYYY-MM-DD, and the cells below compute percentiles for "2022-01-12";
# this value is only printed in the report header - confirm it is intended.
date = "22-07-2008"

# Optional: Station
#station = "SCO_L_hour_v03"
station = "THU_L_day_v03"

# Optional: Measure ("All" selects every numerical column)
y = "All"

In [None]:
# subset dataframe to the selected station
# NOTE(review): `pc` is loaded from promice_hourly, whose preview shows station
# ids such as "SCO_L_hour_v03", while `station` is "THU_L_day_v03"; this filter
# may leave `pc` empty - confirm the station id matches the loaded file.
pc = pc[pc['file'] == station] 

In [None]:
# select date
year = 2022
month = 1
day = 12

# subset df with the date
date_df = pc.loc[(pc['Datetime'].dt.year == year) & (pc['Datetime'].dt.month == month) & (pc['Datetime'].dt.day == day)]

# from here on `day` holds the day of year of the selected date
day = date_df["DayOfYear"].mean()

# Latest timestamp of the selected date. Deliberately not stored in a variable
# called `datetime`: that would replace the imported `datetime` module for the
# rest of the session and break `datetime.datetime.today()`.
latest_datetime = date_df["Datetime"].max()

# select the specific day of century and the related values
day_century = date_df["DayOfCentury"].mean()

# One-row frame with the mean of every numerical column on the selected date.
# numeric_only=True selects the numerical columns explicitly instead of relying
# on non-numerical ones being dropped with a warning. Columns whose mean is NaN
# are dropped and the rest sorted by name.
day_century_value = date_df.mean(numeric_only=True).dropna().sort_index().to_frame().T

# group by calendar day
pc_group = pc.loc[pc['DayOfYear'] == day]

  day_century_value = pd.DataFrame(pd.DataFrame.mean(date_df)).reset_index()
  day_century_value = pd.DataFrame(pd.DataFrame.mean(date_df)).reset_index()


In [None]:
def get_percentile(df, date, measure=None):
    """
    Function to return the percentile of specified values given a selected date.

    Parameters:
    df (pd.DataFrame): The dataframe to be used; needs a 'Datetime' column.
    date (str): The date of the observations for which the percentile is calculated.
    measure (str): Column to evaluate, or "All" for every numerical column.
                   When omitted, the notebook-level variable `y` is used if it
                   is defined, otherwise "All".

    Returns:
    df (pd.DataFrame): A one-row dataframe (index 0) with one column per
    numerical column, holding the percentile (0-100) of the value observed on
    the selected date, or NaN where that value is missing.

    Raises:
    ValueError: If no row of `df` has `date` in its 'Datetime' column.
    """
    if measure is None:
        # Backward compatibility: the selection used to be read from the
        # global `y` only; an explicit argument now takes precedence.
        measure = globals().get("y", "All")

    # Find the index label of the (first) row with the specified date
    matches = df[df['Datetime'] == date]
    if matches.empty:
        raise ValueError(f"No observation found for date '{date}'")
    row_index = matches.index[0]

    # Subset df based on measure selection & remove columns that do not
    # contain numerical values
    if measure != "All":
        df = pd.DataFrame(df[measure])
    df = df.select_dtypes(include=['int', 'float']).copy()

    # Percentile of the selected row's value within each column
    percentile_dict = {}
    for col in df.columns:
        value = df.loc[row_index, col]
        if math.isnan(value):
            percentile_dict[col] = math.nan
        else:
            # rank(pct=True) yields the fractional rank; scale it to 0-100
            percentile_dict[col] = df[col].rank(pct=True)[df[col] == value].iloc[0] * 100

    # Create a dataframe with the output
    return pd.DataFrame(percentile_dict, index=[0])

In [None]:
# Percentiles of the 2022-01-12 values within all rows of `pc`.
x = get_percentile(pc, "2022-01-12")

In [None]:
#x = x.append(day_century_value.iloc[0], ignore_index=True)

In [None]:
# Percentiles of the 2022-01-12 values within `pc_group`, the rows sharing
# the selected day of year.
x = get_percentile(pc_group, "2022-01-12")

In [None]:
import prettytable as pt

# Table with one row per measurement and its percentile
table = pt.PrettyTable()
table.field_names = ["Measurement", "Percentile"]

for col in x.columns:
    # Take the scalar of the one-row frame: converting a 1-element array with
    # int() is deprecated in recent NumPy and array truthiness is fragile
    pct = x[col].iloc[0]
    shown = 'NaN' if np.isnan(pct) else int(pct)
    if pct > 90 or pct < 10:
        # Extreme values are printed in bold (ANSI escape codes)
        table.add_row([f"\033[1m{col}\033[0m", f"\033[1m{shown}% \033[0m"])
    else:
        table.add_row([col, f"{shown}%"])


# Report header built from the notebook-level selections
print(
f" Date: {date} \n" ,
f"Station: {station} \n" , 
f"Measurment: {y} \n"  , 
f"Data: {data} \n" ,
"-----------------------------------------------------------\n",
f"                   Climatology \n" 
)

print(table)

*Visual test suite*

In [None]:
############################ Boxplots #########################################
import plotly.express as px

# Full frame and the rows stamped exactly "2022-01-12"
ff = pc
x = ff.loc[ff['Datetime'] == "2022-01-12"]

# Columns to plot: every column of the selection that also exists in the frame
columns_to_plot = list(filter(lambda name: name in ff.columns.values, x.columns.values))

# Values to highlight, flattened row by row; pairing them with the column
# names below therefore uses the values of the first selected row
values_to_highlight = x[columns_to_plot].to_numpy().ravel().tolist()

# Appearance of the highlighted point, shared by all figures
highlight_style = {
    'mode': 'markers',
    'marker_symbol': 'circle-dot',
    'marker_size': 8,
    'marker_color': 'red',
}

# One boxplot per column with the selected value drawn on top
figs = []
for col, v in zip(columns_to_plot, values_to_highlight):
    fig = px.box(ff[col], orientation="v", boxmode='group')
    fig.update_layout(title_text=f"Boxplot for {col}", xaxis_title='', yaxis_title='')
    fig.add_scatter(x=[col], y=[v], name=f"Selected Value \n{col}", **highlight_style)
    figs.append(fig)

# Show the plots
for fig in figs:
    fig.show()

In [None]:
# A function that calculates the percentiles of every column and their values

def percentile_df(df):
    """
    Add a fractional-rank column for every column of ``df``.

    For each column present when the function is called, a new column named
    '<column>_pcta' with ``rank(pct=True)`` of that column is written into
    ``df`` itself; the same (modified) dataframe is returned.
    """
    for name in df.columns.tolist():
        ranked = df[name].rank(pct=True)
        df[f'{name}_pcta'] = ranked

    return df

In [None]:
# percentile_df writes the '<column>_pcta' columns into `pc` itself and returns
# that same frame, so `pc` is widened by this call as well.
gg = percentile_df(pc)
# Keep the timestamp, the air temperature and its fractional rank
gg = gg[["Datetime", "AirTemperature(C)", "AirTemperature(C)_pcta"]]
# Column names used by the figure below.
# NOTE: `y` is the same global that get_percentile reads as measure selection.
y = "AirTemperature(C)"
y_pcta = "AirTemperature(C)_pcta"

In [None]:
# Series used for all four band edges: the value scaled by its fractional rank.
# NOTE(review): upper and lower bound of each band use this same series, so the
# filled areas have zero height; presumably placeholders - confirm the bounds.
band = gg[y] * gg[y_pcta]

fig = go.Figure([
    # The measurement itself, labelled with the plotted column (`y`) so the
    # legend cannot drift out of sync with the data
    go.Scatter(
        name=y,
        x=gg['Datetime'],
        y=gg[y],
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    ),
    # Green band (20-80): invisible upper edge, then lower edge filled up to it
    go.Scatter(
        name='Upper Bound (20-80)',
        x=gg['Datetime'],
        y=band,
        mode='lines',
        marker=dict(color="#00BB00"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound (20-80)',
        x=gg['Datetime'],
        y=band,
        marker=dict(color="#00BB00"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(0, 187, 0, 0.3)',
        fill='tonexty',
        showlegend=False
    ),
    # Red band (0-20 & 80-100), built the same way
    go.Scatter(
        name='Upper Bound (0-20 & 80-100)',
        x=gg['Datetime'],
        y=band,
        mode='lines',
        marker=dict(color="#BB0000"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound (0-20 & 80-100)',
        x=gg['Datetime'],
        y=band,
        marker=dict(color="#BB0000"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(187, 0, 0, 0.3)',
        fill='tonexty',
        showlegend=False
    )
])
fig.update_layout(
    yaxis_title=y,
    title='Continuous, variable value error bars',
    hovermode="x"
)
fig.show()

In [None]:
# Display the rows of `gg` recorded on 31 December (of any year).
gg.loc[(gg['Datetime'].dt.month == 12) & (gg['Datetime'].dt.day == 31)]