In [None]:
# Welcome to the Lahmas Lab Lascar Data Processor!
# Please answer the questions below and run your code directly in a Jupyter notebook.
# Every value in this cell is read as a module-level variable by the cells further down.

# Where is the folder you would like to process?
folder_path = r'0_Llanwx_merged' # Use 'copy path' within Jupyter Lab

# What are the labels for each variable in the .csv files?
# NOTE ! : If the label has a special character in it you may have to open the file in 'Editor' to copy and paste the symbol.
# NOTE(review): the '�' in the labels below looks like a mis-encoded degree sign; presumably the
# label has to match the .csv header exactly as pandas reads it - confirm against a real file.
time_name = 'Time' # Label of time variable
temp_name = 'Celsius(�C)' # Label of temperature variable
rh_name = 'Humidity(%rh)' # Label of relative humidity variable
td_name = 'Dew Point(�C)' # Label of dew point temperature variable

# What is the date format of the time column in the .csv files? (eg. %Y-%m-%d %H:%M:%S)
# This is the format string handed to pd.to_datetime when the raw time column is parsed.
date_format = '%Y-%m-%d %H:%M:%S'

# What will the name be of your new files folder? (the folder cell prefixes it with '1_')
new_files_name = 'lascar_files'
# What will the name be of your new plots folder? (the folder cell prefixes it with '2_')
new_plots_name = 'lascar_plots'

In [None]:
# Modules

# Dataframe Processing
import numpy as np
import pandas as pd
import os
import io
from datetime import datetime

# Math
from scipy.stats import linregress
from statsmodels.tsa.seasonal import seasonal_decompose

# Plotting
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.lines import Line2D

In [None]:
# Global Variable Editor - DO NOT CHANGE UNLESS YOU ARE SURE
# This block holds all standardized global variables and formatting styles for the output files.
# The functions defined in the later cells read these names as module-level globals.

# Standard Date Format (every timestamp is rewritten to this format by file_initialize)
std_date_format = '%Y-%m-%d %H:%M:%S'

# Dataframe Label Names (the user-supplied column labels are renamed to these)
std_time_name = 'Datetime (YYYY-MM-DD HH:mm:ss)'
std_temp_name = 'Temperature (deg C)'
std_rh_name = 'RH (%)'
std_td_name = 'Dew Point (deg C)'
# Labels of the two extra columns holding the saturation-corrected values
rh_cor_name = 'RH Corrected (%)'
td_cor_name = 'Dew Point Corrected (deg C)'

# Plot labels (axis labels; temperature and RH reuse the standard column labels)
plot_time_label = 'Time (YYYY/MM)'
plot_temp_label = std_temp_name
plot_rh_label = std_rh_name

# Time Index Name (label of the datetime column that is also used as the dataframe index)
time_index = 'Time Index'

# The thresholds below are compared against the number of hourly rows that are
# present in each period after the data has been laid out on a full hourly index.

# Daily minimum points for validity (100% = 24; 75% = 18; 50% = 12; ...)
daily_min_points = 18 # 75%

# Weekly minimum points for validity (100% = 168; 75% = 126; 50% = 84; ...)
weekly_min_points = 84 # 50%

# Monthly minimum points for validity (for 30 day month --> 100% = 720; 75% = 540; 50% = 360; ...)
monthly_min_points = 360 # 50%

# Yearly minimum points for validity (for 365 day year --> 100% = 8760; 75% = 6570; 50% = 4380; ...)
year_min_points = 4380 # 50%

# Night-time time range as integers (presumably hour-of-day values, 23:00 through 05:00 - confirm where it is used)
night_range=[23, 0, 1, 2, 3, 4, 5]

In [None]:
# Output folder for the processed .csv files: the user-chosen name prefixed with '1_'.
# The folder is created only when it is missing; an existing folder is left untouched.

files_name = '1_' + new_files_name

if not os.path.exists(files_name):
    os.makedirs(files_name)
    print('\nYour new folder is:\n"', files_name, '"\n\nRun next block\n')

else:
    print('\nThis folder already exists!\n\n'
          'If you wish to continue with this folder anyway, run the next block.\n'
          'Otherwise, rewrite the folder path/name in block 1.\n')

In [None]:
# Output folder for the generated figures: the user-chosen name prefixed with '2_'.
# The folder is created only when it is missing; an existing folder is left untouched.

plots_name = '2_' + new_plots_name

if not os.path.exists(plots_name):
    os.makedirs(plots_name)
    print('\nYour new folder is:\n"', plots_name, '"\n\nRun next block\n')

else:
    print('\nThis folder already exists!\n\n'
          'If you wish to continue with this folder anyway, run the next block.\n'
          'Otherwise, rewrite the folder path/name in block 1.\n')

In [None]:
# File initialization function

# Multiplier used to turn a regression slope expressed per day into a slope per year.
_DAYS_PER_YEAR = 365


def _flag_valid_periods(frame, column, freq, min_points):
    """Tag every row with its calendar period and split the periods by coverage.

    Adds ``column`` to ``frame`` in place, holding the period (at pandas period
    frequency ``freq``) of each index entry, then counts the rows flagged
    ``present`` in every period.

    Returns:
        ``(valid, bad)``: the periods with at least ``min_points`` present rows
        and the periods with fewer, each as an index of periods.
    """
    frame[column] = frame.index.to_period(freq)
    present_counts = frame.groupby(column)['present'].sum()
    has_enough = present_counts >= min_points
    return present_counts[has_enough].index, present_counts[~has_enough].index


def _summarise_with_trends(frame, value_column, period_column, valid_periods, rule, period_freq):
    """Aggregate one variable per period and fit linear trends to the result.

    Values that fall in a period not listed in ``valid_periods`` are blanked
    before aggregating, so those periods come out as NaN rows in the summary.

    Args:
        frame: hourly dataframe carrying ``std_time_name``, ``value_column`` and
            ``period_column``.
        value_column: label of the variable to summarise.
        period_column: label of the period column ('day', 'week' or 'month');
            it is also the name of the period-start column added to the summary.
        valid_periods: periods that have enough data to be summarised.
        rule: resampling rule ('D', 'W' or 'ME').
        period_freq: period frequency matching ``rule`` ('D', 'W' or 'M').

    Returns:
        A list of ten items: the summary dataframe (mean/min/max per period),
        the three trend arrays (mean, min, max) evaluated on every summary row,
        then for mean, min and max in turn the slope per year followed by the
        ``linregress`` result.
    """
    # The time column was rewritten to the standard format during initialization,
    # so it must be parsed with std_date_format and not with the user's raw format.
    timestamps = pd.to_datetime(frame[std_time_name], format=std_date_format)
    kept_values = frame[value_column].where(frame[period_column].isin(valid_periods))

    summary = (
        pd.DataFrame({std_time_name: timestamps, value_column: kept_values})
        .resample(rule, on=std_time_name)[value_column]
        .agg(['mean', 'min', 'max'])
        .reset_index()
    )
    # Timestamp of the start of each period (used as the x axis by the plots)
    summary[period_column] = summary[std_time_name].dt.to_period(period_freq).dt.to_timestamp()

    stats = ('mean', 'min', 'max')

    # The regressions only use rows where all three statistics are available ...
    complete = summary[list(stats)].notna().all(axis=1)
    x_fit = mdates.date2num(summary.loc[complete, std_time_name])
    # ... while the trend lines are evaluated on the full time axis.
    x_all = mdates.date2num(summary[std_time_name])

    fits = [linregress(x_fit, summary.loc[complete, stat]) for stat in stats]
    trends = [fit.slope * x_all + fit.intercept for fit in fits]

    outputs = [summary, *trends]
    for fit in fits:
        outputs.extend((fit.slope * _DAYS_PER_YEAR, fit))
    return outputs


def file_initialize(file_path, time_name, rh_name, td_name):
    """Load one logger .csv file, standardize it and compute summaries with trends.

    Steps:
      1. Read the file (see the encoding note in the body) and skip the first 5 rows.
      2. Rewrite the time column to ``std_date_format``.
      3. Add the corrected RH / dew point columns (RH capped at 100 %).
      4. Rename the columns to the standard labels.
      5. Lay the data out on a gap-free hourly index and classify every day,
         week, month and year as valid or bad from the number of rows present.
      6. Build daily, weekly and monthly mean/min/max summaries of temperature
         and RH, restricted to valid periods, each with linear trends.

    Besides its arguments the function reads the module-level names
    ``temp_name``, ``date_format`` and the standardized globals.

    Args:
        file_path: path of the .csv file. NOTE: the file may be rewritten in place.
        time_name: label of the time column in the file.
        rh_name: label of the relative humidity column in the file.
        td_name: label of the dew point column in the file.

    Returns:
        A 66-element tuple, in this order:
          * ``data`` - the standardized rows as read (no gap filling),
          * ``dataframe`` - the same data on the full hourly index with the
            'present', 'day', 'week', 'month' and 'year' columns,
          * six groups of ten items (daily, weekly, monthly temperature, then
            daily, weekly, monthly RH): summary dataframe, trend of mean, trend
            of min, trend of max, then slope per year and ``linregress`` result
            for mean, for min and for max,
          * ``bad_days``, ``bad_weeks``, ``bad_months``, ``valid_years``.

    Raises:
        KeyError: if one of the configured column labels is not in the file.
        NOTE(review): ``linregress`` is called without a size check; presumably
        it raises when too few valid periods remain - confirm for short records.
    """

    ### INITIALIZATION

    # Prints selected file name
    print("Initializing file:", os.path.basename(file_path), '\n')

    # Converting .csv in ANSI encoding to UTF-8 encoding.
    # This overwrites the source file. When the ANSI route fails for any reason the
    # file is read as it is, which keeps the original best-effort behaviour, but
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    # NOTE(review): presumably a file already converted to UTF-8 is decoded as ANSI
    # again on the next run, which could corrupt non-ASCII labels - confirm.
    try:
        ansi_frame = pd.read_csv(file_path, encoding='ANSI')
        ansi_frame.to_csv(file_path, encoding='utf-8', index=False)
        data = pd.read_csv(file_path)

    except Exception:
        data = pd.read_csv(file_path)

    # Skips first values as they may have been taken & tainted during installation
    data = data.iloc[5:].reset_index(drop=True)

    # Parses the raw time strings and rewrites them in the standard date format
    parsed_times = pd.to_datetime(data[time_name], format=date_format)
    data[time_index] = parsed_times.dt.strftime(std_date_format)
    data[time_name] = data[time_index]


    ## RH AND TD CORRECTION

    # RH above 100 % is set to 100 % as the air must be saturated, and the dew
    # point then equals the temperature. Every other row stays the same.
    saturated = data[rh_name] > 100
    data[rh_cor_name] = data[rh_name].mask(saturated, 100)
    data[td_cor_name] = data[td_name].mask(saturated, data[temp_name])


    ## RENAMING LABELS TO STANDARD
    data = data.rename(columns={time_name: std_time_name,
                                temp_name: std_temp_name,
                                rh_name: std_rh_name,
                                td_name: std_td_name})


    ## SETTING UP INDEXING FOR SUMMARY CALCULATIONS

    # Datetime version of the standardized time column, used as the index
    data[time_index] = pd.to_datetime(data[std_time_name], format=std_date_format)
    dataframe = data.set_index(time_index, drop=False)

    # Only the first row of any repeated timestamp is kept
    dataframe = dataframe[~dataframe.index.duplicated(keep='first')]

    # Full time index range at hourly resolution; hours without a record become empty rows
    full_index = pd.date_range(start=dataframe.index.min(),
                               end=dataframe.index.max(),
                               freq='h')
    dataframe = dataframe.reindex(full_index)
    dataframe['present'] = dataframe[time_index].notna()


    ## VALID/BAD DAY, WEEK, MONTH AND YEAR CALCULATIONS

    # A period is valid when it holds at least the configured number of hourly rows
    valid_days, bad_days = _flag_valid_periods(dataframe, 'day', 'D', daily_min_points)
    valid_weeks, bad_weeks = _flag_valid_periods(dataframe, 'week', 'W', weekly_min_points)
    valid_months, bad_months = _flag_valid_periods(dataframe, 'month', 'M', monthly_min_points)
    valid_years, _ = _flag_valid_periods(dataframe, 'year', 'Y', year_min_points)


    ### CREATING DATAFRAMES FOR PLOTTING AND CALCULATING RELEVANT STATISTICS

    # (period column, valid periods, resampling rule, period frequency)
    period_specs = (('day', valid_days, 'D', 'D'),
                    ('week', valid_weeks, 'W', 'W'),
                    ('month', valid_months, 'ME', 'M'))

    outputs = [data, dataframe]

    # Temperature first, then relative humidity; daily, weekly, monthly within each.
    # NOTE(review): the RH summaries use the raw RH column, not the corrected one,
    # exactly as before - confirm that this is intended.
    for value_column in (std_temp_name, std_rh_name):
        for period_column, valid_periods, rule, period_freq in period_specs:
            outputs.extend(_summarise_with_trends(dataframe, value_column, period_column,
                                                  valid_periods, rule, period_freq))

    outputs.extend((bad_days, bad_weeks, bad_months, valid_years))

    return tuple(outputs)

In [None]:
# Function writing a new .csv in your new folder

def write_csv(basename, dataframe):
    """Write ``dataframe`` into the processed-files folder as a UTF-8 .csv file.

    The output file is ``PROCESSED_<name of the source file>`` inside the folder
    ``files_name``. A ``.txt`` extension (any letter case) is turned into
    ``.csv``; the rest of the name is left alone. The previous implementation
    replaced every "txt" found anywhere in the path, which also altered folder
    and file names that merely contained those letters.

    Args:
        basename: path or name of the source file; only its final component is used.
        dataframe: the dataframe to write. NOTE: it is modified in place - the
            columns are renamed to the standard labels and the index is dropped.
    """

    # Creating new file path and name (only the extension may change)
    stem, extension = os.path.splitext(os.path.basename(basename))
    if extension.lower() == '.txt':
        extension = '.csv'
    new_file_path = os.path.join(files_name, 'PROCESSED_' + stem + extension)

    # Prints selected file name
    print('Writing new file:', os.path.basename(new_file_path), '\n')

    # Renaming labels to standard (a no-op for labels that are already standard)
    dataframe.rename(columns={time_name: std_time_name,
                              temp_name: std_temp_name,
                              rh_name: std_rh_name,
                              td_name: std_td_name}, inplace=True)

    # Removing custom index
    dataframe.reset_index(drop=True, inplace=True)

    # Writing the new dataframe to your computer ('utf-8-sig' adds a byte order mark)
    dataframe.to_csv(new_file_path, index=False, encoding='utf-8-sig')

In [None]:
# Window selector function

def find_valid_windows(dataframe, column_subset=None, min_window_size=1):
    """Print every uninterrupted hourly stretch of data found in ``dataframe``.

    Rows are grouped into runs in which each timestamp is exactly one hour after
    the previous one; the first and last timestamp of every run are printed as a
    numbered window. Runs that start on a missing timestamp are not printed.

    Args:
        dataframe: dataframe holding the ``time_index`` column; it is not modified.
        column_subset: not used by the function.
        min_window_size: not used by the function.

    Returns:
        None. The windows are only printed.
    """

    print('Calculating Data Window(s)\n')

    print('The following time windows contain data, please choose a start and end date for visualization (must be on the hour).\n',
          "You can input 'all' to visualize everything or eg. '2003/10/17 21:00:00'.\n")

    # Index a copy by time, passing the timestamps through the standard text format
    indexed = dataframe.set_index(time_index, drop=True)
    indexed.index = pd.to_datetime(indexed.index).strftime(std_date_format)
    indexed.index = pd.DatetimeIndex(indexed.index)

    # A new run starts wherever the step from the previous row is not exactly one hour
    step = indexed.index.to_series().diff()
    run_id = (step != pd.Timedelta(hours=1)).cumsum()

    # First and last timestamp of every run, as a list of (start, end) tuples
    bounds = indexed.groupby(run_id).apply(lambda run: (run.index[0], run.index[-1])).tolist()

    # Number and print the runs that start on a real timestamp
    window_number = 0
    for first, last in bounds:
        if not pd.isnull(first):
            window_number += 1
            print('Window', window_number, ': ', first, 'until', last, '\t')

    return

In [None]:
# Hourly plotting fuctions

def plot_hourly_temp(filename, data, window_start, window_end):
    """Plot the hourly temperature for a time window, show it and save it as a .png.

    Args:
        filename: name of the processed file; used for the title and the file name
            of the figure.
        data: dataframe with the standard time and temperature columns. NOTE: the
            time column is converted to datetime in place.
        window_start: start of the x axis as a date string.
        window_end: end of the x axis as a date string.
    """

    print('\nPlotting: Hourly temperature.\n')

    data[std_time_name] = pd.to_datetime(data[std_time_name])

    figure = plt.figure(figsize=(12, 3))
    axes = figure.gca()

    # Hourly curve
    axes.plot(data[std_time_name], data[std_temp_name],
              label='Temp', color='black', linewidth=2.5)

    # Title: variable ~ window ~ file name without its extension
    source_name = os.path.basename(filename.replace('.csv', ''))
    axes.set_title(' ~ '.join(['Hourly ' + std_temp_name,
                               window_start + ' - ' + window_end,
                               source_name]))
    axes.set_xlabel(plot_time_label)
    axes.set_ylabel(plot_temp_label)

    # Tick labels as year-month, x axis limited to the requested window
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    axes.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    figure.autofmt_xdate()

    axes.legend()
    plt.show()

    # The saved copy is taller than the figure shown in the notebook
    figure.set_size_inches(12, 6)

    figure.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hour_temp.png'))

def plot_hourly_rh(filename, data, window_start, window_end):
    """Plot the hourly corrected relative humidity for a time window and save it.

    Fixes three copy-paste errors of the previous version: the curve was labelled
    'Temp', the y axis carried the temperature label, and the figure was saved as
    '..._hour_temp.png', overwriting the hourly temperature figure. The figure is
    now saved as '..._hour_rh.png'.

    Args:
        filename: name of the processed file; used for the title and the file name
            of the figure.
        data: dataframe with the standard time column and the corrected RH column.
            NOTE: the time column is converted to datetime in place.
        window_start: start of the x axis as a date string.
        window_end: end of the x axis as a date string.
    """

    print('\nPlotting: Hourly relative humidity.\n')

    data[std_time_name] = pd.to_datetime(data[std_time_name])

    figure = plt.figure(figsize=(12, 3))
    axes = figure.gca()

    # Hourly curve of the corrected RH
    axes.plot(
        data[std_time_name],
        data[rh_cor_name],
        label='RH',
        color='black',
        linewidth=2.5
    )

    axes.set_title('Hourly ' + std_rh_name + ' ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    axes.set_xlabel(plot_time_label)
    axes.set_ylabel(plot_rh_label)

    # Format tick labels as dates
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Limit the x axis to the requested window
    axes.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    figure.autofmt_xdate()

    axes.legend()
    plt.show()

    # The saved copy is taller than the figure shown in the notebook
    figure.set_size_inches(12, 6)

    figure.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hour_rh.png'))

In [None]:
# Daily plotting functions

### DAILY TEMPERATURE PLOT

def plot_daily_temp(filename,
                    daily_temp,
                    daily_temp_trend_avg, daily_temp_trend_min, daily_temp_trend_max,
                    daily_temp_slope_avg_per_year, daily_temp_result_avg,
                    daily_temp_slope_min_per_year, daily_temp_result_min,
                    daily_temp_slope_max_per_year, daily_temp_result_max,
                    bad_days,
                    window_start, window_end):
    """Draw the daily mean/min/max temperature curves with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    daily_temp : pandas.DataFrame
        Needs the columns 'day', 'mean', 'min', 'max' and `std_time_name`.
    daily_temp_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `daily_temp[std_time_name]`.
    daily_temp_slope_*_per_year : float
        Slopes shown in the legend and in the printed summary.
    daily_temp_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_days : iterable
        Periods shaded in gray; each item must support `to_timestamp()` and
        `+ 1` (e.g. `pandas.Period`).
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_day_temp.png`.
    """

    print('\nPlotting: Daily average, minimum, and maximum temperatures.\n')

    plt.figure(figsize=(12, 3))

    # Observed daily statistics: (column, label, colour, line width)
    for column, text, colour, width in (
        ('mean', 'Average Temperature', 'black', 2),
        ('min', 'Minimum Temperature', 'blue', 0.5),
        ('max', 'Maximum Temperature', 'red', 0.5),
    ):
        plt.plot(daily_temp['day'], daily_temp[column],
                 label=text, color=colour, linewidth=width)

    # Fitted trends: (tag, values, slope per year, regression result, colour)
    trends = (
        ('Avg', daily_temp_trend_avg, daily_temp_slope_avg_per_year, daily_temp_result_avg, 'black'),
        ('Min', daily_temp_trend_min, daily_temp_slope_min_per_year, daily_temp_result_min, 'blue'),
        ('Max', daily_temp_trend_max, daily_temp_slope_max_per_year, daily_temp_result_max, 'red'),
    )
    for tag, fitted, slope, result, colour in trends:
        plt.plot(daily_temp[std_time_name],
                 fitted,
                 label=f'Daily Trend {tag} ({slope:.2f}°C/yr, R²={result.rvalue**2:.2f})',
                 color=colour,
                 linestyle='dashed')

    station = os.path.basename(filename.replace('.csv', ''))
    plt.title('Daily ' + std_temp_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + station)
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_temp_label)

    ax = plt.gca()

    # Year-month tick labels on the time axis
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    plt.xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    plt.gcf().autofmt_xdate()

    # Gray bands over the periods flagged as missing data
    for period in bad_days:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.2)

    fig1 = plt.gcf()

    # Hand-built legend: one solid entry and one dashed trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} Temp'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle='dashed', label=f'Trend {tag} ({slope:.2f} °C/yr)'))

    plt.legend(handles=legend_elements,
               loc='upper center',
               bbox_to_anchor=(0.5, -0.2),
               ncol=3,
               frameon=False)

    # Leave room under the axes for the legend
    plt.subplots_adjust(bottom=0.35)

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig1.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average Temp:', 'Min Temp:', 'Max Temp:'), trends):
        print(f"{heading:<13} slope={slope:.6f} °C/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'day_temp.png'))


### DAILY RELATIVE HUMIDITY PLOT

def plot_daily_rh(filename, 
                  daily_rh,
                  daily_rh_trend_avg, daily_rh_trend_min, daily_rh_trend_max, 
                  daily_rh_slope_avg_per_year, daily_rh_result_avg,
                  daily_rh_slope_min_per_year, daily_rh_result_min,
                  daily_rh_slope_max_per_year, daily_rh_result_max,
                  bad_days, 
                  window_start, window_end
                 ):
    """Plot daily mean/min/max relative humidity with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    daily_rh : pandas.DataFrame
        Needs the columns 'day', 'mean', 'min', 'max' and `std_time_name`.
    daily_rh_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `daily_rh[std_time_name]`.
    daily_rh_slope_*_per_year : float
        Slopes (in %/yr) shown in the legend and in the printed summary.
    daily_rh_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_days : iterable
        Periods shaded in gray; each item must support `to_timestamp()` and
        `+ 1` (e.g. `pandas.Period`).
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_day_rh.png`.
    """

    print('\nPlotting: Daily average, minimum, and maximum relative humidity.\n')

    plt.figure(figsize=(12, 3))

    # Plot daily average, min, and max: (column, label, colour, line width)
    for column, series_label, colour, width in (
        ('mean', 'Average RH', 'black', 2),
        ('min', 'Minimum RH', 'blue', 0.5),
        ('max', 'Maximum RH', 'red', 0.5),
    ):
        plt.plot(daily_rh['day'],
                 daily_rh[column],
                 label=series_label,
                 color=colour,
                 linewidth=width
                )

    # Each statistic is paired with ITS OWN slope and regression result, so
    # the legend and printed summary cannot report the average slope for the
    # min/max rows (which the previous copy-pasted code did).
    trends = (
        ('Avg', daily_rh_trend_avg, daily_rh_slope_avg_per_year, daily_rh_result_avg, 'black'),
        ('Min', daily_rh_trend_min, daily_rh_slope_min_per_year, daily_rh_result_min, 'blue'),
        ('Max', daily_rh_trend_max, daily_rh_slope_max_per_year, daily_rh_result_max, 'red'),
    )

    # Plot daily trend avg, min, and max
    for tag, fitted, slope, result, colour in trends:
        plt.plot(daily_rh[std_time_name],
                 fitted,
                 label=f'Daily Trend {tag} ({slope:.2f} %/yr, R²={result.rvalue**2:.2f})',
                 color=colour,
                 linestyle='dashed'
                )

    plt.title('Daily ' + std_rh_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_rh_label)

    ax = plt.gca()

    window_start = pd.to_datetime(window_start)
    window_end = pd.to_datetime(window_end)
    plt.xlim(window_start, window_end)
    plt.gcf().autofmt_xdate()

    # Shade missing data regions
    for period in bad_days:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.2)

    # Keep a handle on the figure before show() so it can still be saved afterwards
    fig2 = plt.gcf()

    # Hand-built legend: one solid entry and one dashed trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} RH'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle='dashed', label=f'Trend {tag} ({slope:.2f} %/yr)'))

    plt.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        ncol=3,
        frameon=False
    )

    plt.subplots_adjust(bottom=0.35)  # more space for two lines

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig2.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average RH:', 'Min RH:', 'Max RH:'), trends):
        print(f"{heading:<11} slope={slope:.6f} %/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig2.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'day_rh.png'))

In [None]:
# Weekly plotting functions

### WEEKLY TEMPERATURE PLOT

def plot_weekly_temp(filename, 
                     weekly_temp, 
                     weekly_temp_trend_avg, weekly_temp_trend_min, weekly_temp_trend_max, 
                     weekly_temp_slope_avg_per_year, weekly_temp_result_avg,
                     weekly_temp_slope_min_per_year, weekly_temp_result_min,
                     weekly_temp_slope_max_per_year, weekly_temp_result_max,
                     bad_weeks, 
                     window_start, window_end
                    ):
    """Plot weekly mean/min/max temperature with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    weekly_temp : pandas.DataFrame
        Needs the columns 'week', 'mean', 'min', 'max' and `std_time_name`.
    weekly_temp_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `weekly_temp[std_time_name]`.
    weekly_temp_slope_*_per_year : float
        Slopes shown in the legend and in the printed summary.
    weekly_temp_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_weeks : iterable
        Periods shaded in gray; each item must support `to_timestamp()` and
        `+ 1` (e.g. `pandas.Period`).
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_week_temp.png`.
    """

    print('\nPlotting: Weekly average, minimum, and maximum temperatures.\n')

    plt.figure(figsize=(12, 3))

    # Plot weekly average, min, and max: (column, label, colour, line width)
    for column, series_label, colour, width in (
        ('mean', 'Average Temperature', 'black', 2),
        ('min', 'Minimum Temperature', 'blue', 0.5),
        ('max', 'Maximum Temperature', 'red', 0.5),
    ):
        plt.plot(weekly_temp['week'],
                 weekly_temp[column],
                 label=series_label,
                 color=colour,
                 linewidth=width
                )

    # One line style shared by the plotted trend lines and their legend
    # proxies; previously the lines were dotted but the legend showed dashes.
    trend_style = 'dotted'

    trends = (
        ('Avg', weekly_temp_trend_avg, weekly_temp_slope_avg_per_year, weekly_temp_result_avg, 'black'),
        ('Min', weekly_temp_trend_min, weekly_temp_slope_min_per_year, weekly_temp_result_min, 'blue'),
        ('Max', weekly_temp_trend_max, weekly_temp_slope_max_per_year, weekly_temp_result_max, 'red'),
    )

    # Plot weekly trend average, min, and max
    for tag, fitted, slope, result, colour in trends:
        plt.plot(weekly_temp[std_time_name],
                 fitted,
                 label=f'Weekly Trend {tag} ({slope:.2f}°C/yr)',
                 color=colour,
                 linestyle=trend_style
                )

    plt.title('Weekly ' + std_temp_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_temp_label)

    ax = plt.gca()

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    window_start = pd.to_datetime(window_start)
    window_end = pd.to_datetime(window_end)
    plt.xlim(window_start, window_end)
    plt.gcf().autofmt_xdate()

    # Shade missing data regions
    for period in bad_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.3)

    # Keep a handle on the figure before show() so it can still be saved afterwards
    fig1 = plt.gcf()

    # Hand-built legend: one solid entry and one trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} Temp'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle=trend_style, label=f'Trend {tag} ({slope:.2f} °C/yr)'))

    plt.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        ncol=3,
        frameon=False
    )

    plt.subplots_adjust(bottom=0.35)

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig1.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average Temp:', 'Min Temp:', 'Max Temp:'), trends):
        print(f"{heading:<13} slope={slope:.6f} °C/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'week_temp.png'))


### WEEKLY RELATIVE HUMIDITY PLOT

def plot_weekly_rh(filename, 
                   weekly_rh, 
                   weekly_rh_trend_avg, weekly_rh_trend_min, weekly_rh_trend_max, 
                   weekly_rh_slope_avg_per_year, weekly_rh_result_avg,
                   weekly_rh_slope_min_per_year, weekly_rh_result_min,
                   weekly_rh_slope_max_per_year, weekly_rh_result_max,
                   bad_weeks, 
                   window_start, window_end
                   ):
    """Plot weekly mean/min/max relative humidity with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    weekly_rh : pandas.DataFrame
        Needs the columns 'week', 'mean', 'min', 'max' and `std_time_name`.
    weekly_rh_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `weekly_rh[std_time_name]`.
    weekly_rh_slope_*_per_year : float
        Slopes (in %/yr) shown in the legend and in the printed summary.
    weekly_rh_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_weeks : iterable
        Periods shaded in gray; each item must support `to_timestamp()` and
        `+ 1` (e.g. `pandas.Period`).
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_week_rh.png`.
    """

    print('\nPlotting: Weekly average, minimum, and maximum relative humidity.\n')

    plt.figure(figsize=(12, 3))

    # Plot weekly average, min, and max RH: (column, label, colour, line width)
    for column, series_label, colour, width in (
        ('mean', 'Average RH', 'black', 2),
        ('min', 'Minimum RH', 'blue', 0.5),
        ('max', 'Maximum RH', 'red', 0.5),
    ):
        plt.plot(weekly_rh['week'],
                 weekly_rh[column],
                 label=series_label,
                 color=colour,
                 linewidth=width
                )

    # One line style shared by the plotted trend lines and their legend
    # proxies; previously the lines were dotted but the legend showed dashes.
    trend_style = 'dotted'

    trends = (
        ('Avg', weekly_rh_trend_avg, weekly_rh_slope_avg_per_year, weekly_rh_result_avg, 'black'),
        ('Min', weekly_rh_trend_min, weekly_rh_slope_min_per_year, weekly_rh_result_min, 'blue'),
        ('Max', weekly_rh_trend_max, weekly_rh_slope_max_per_year, weekly_rh_result_max, 'red'),
    )

    # Plot weekly trend average, min, and max.
    # RH slopes are in %/yr; the labels previously said °C/yr.
    for tag, fitted, slope, result, colour in trends:
        plt.plot(weekly_rh[std_time_name],
                 fitted,
                 label=f'Weekly Trend {tag} ({slope:.2f} %/yr)',
                 color=colour,
                 linestyle=trend_style
                )

    plt.title('Weekly ' + std_rh_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_rh_label)

    ax = plt.gca()

    window_start = pd.to_datetime(window_start)
    window_end = pd.to_datetime(window_end)
    plt.xlim(window_start, window_end)
    plt.gcf().autofmt_xdate()

    # Shade missing data regions
    for period in bad_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.3)

    # Keep a handle on the figure before show() so it can still be saved afterwards
    fig2 = plt.gcf()

    # Hand-built legend: one solid entry and one trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} RH'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle=trend_style, label=f'Trend {tag} ({slope:.2f} %/yr)'))

    plt.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        ncol=3,
        frameon=False
    )

    plt.subplots_adjust(bottom=0.35)

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig2.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average RH:', 'Min RH:', 'Max RH:'), trends):
        print(f"{heading:<11} slope={slope:.6f} %/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig2.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'week_rh.png'))

In [None]:
# Monthly plotting functions

### MONTHLY TEMPERATURE PLOT
def plot_monthly_temp(filename, 
                      monthly_temp, 
                      monthly_temp_trend_avg, monthly_temp_trend_min, monthly_temp_trend_max, 
                      monthly_temp_slope_avg_per_year, monthly_temp_result_avg,
                      monthly_temp_slope_min_per_year, monthly_temp_result_min,
                      monthly_temp_slope_max_per_year, monthly_temp_result_max,
                      bad_weeks, 
                      window_start, window_end
                     ):
    """Plot monthly mean/min/max temperature with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    monthly_temp : pandas.DataFrame
        Needs the columns 'month', 'mean', 'min', 'max' and `std_time_name`.
    monthly_temp_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `monthly_temp[std_time_name]`.
    monthly_temp_slope_*_per_year : float
        Slopes shown in the legend and in the printed summary.
    monthly_temp_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_weeks : iterable
        Periods shaded in gray as missing data; each item must support
        `to_timestamp()` and `+ 1` (e.g. `pandas.Period`). The name is kept
        for backward compatibility.
        NOTE(review): monthly periods are expected here - confirm that the
        caller passes the invalid months.
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_month_temp.png`.
    """

    print('\nPlotting: Monthly average, minimum, and maximum temperatures.\n')

    plt.figure(figsize=(12, 3))

    # Plot monthly average, min, and max: (column, label, colour, line width)
    for column, series_label, colour, width in (
        ('mean', 'Avg Temp', 'black', 2),
        ('min', 'Min Temp', 'blue', 0.5),
        ('max', 'Max Temp', 'red', 0.5),
    ):
        plt.plot(monthly_temp['month'],
                 monthly_temp[column],
                 label=series_label,
                 color=colour,
                 linewidth=width
                )

    trends = (
        ('Avg', monthly_temp_trend_avg, monthly_temp_slope_avg_per_year, monthly_temp_result_avg, 'black'),
        ('Min', monthly_temp_trend_min, monthly_temp_slope_min_per_year, monthly_temp_result_min, 'blue'),
        ('Max', monthly_temp_trend_max, monthly_temp_slope_max_per_year, monthly_temp_result_max, 'red'),
    )

    # Plot monthly trend average, min, and max
    for tag, fitted, slope, result, colour in trends:
        plt.plot(monthly_temp[std_time_name],
                 fitted, '--',
                 label=f'Trend {tag} ({slope:.2f}°C/yr)',
                 color=colour
                )

    plt.title('Monthly ' + std_temp_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_temp_label)

    ax = plt.gca()

    window_start = pd.to_datetime(window_start)
    window_end = pd.to_datetime(window_end)
    plt.xlim(window_start, window_end)
    plt.gcf().autofmt_xdate()

    # Shade missing data regions.
    # Use the periods passed by the caller; the body previously ignored this
    # argument and read an undeclared global named `bad_months`.
    for period in bad_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.3)

    # Keep a handle on the figure before show() so it can still be saved afterwards
    fig1 = plt.gcf()

    # Hand-built legend: one solid entry and one dashed trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} Temp'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle='dashed', label=f'Trend {tag} ({slope:.2f} °C/yr)'))

    plt.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        ncol=3,
        frameon=False
    )

    plt.subplots_adjust(bottom=0.35)

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig1.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average Temp:', 'Min Temp:', 'Max Temp:'), trends):
        print(f"{heading:<13} slope={slope:.6f} °C/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'month_temp.png'))


### MONTHLY RELATIVE HUMIDITY PLOT
def plot_monthly_rh(filename, 
                      monthly_rh, 
                      monthly_rh_trend_avg, monthly_rh_trend_min, monthly_rh_trend_max, 
                      monthly_rh_slope_avg_per_year, monthly_rh_result_avg,
                      monthly_rh_slope_min_per_year, monthly_rh_result_min,
                      monthly_rh_slope_max_per_year, monthly_rh_result_max,
                      bad_weeks, 
                      window_start, window_end
                     ):
    """Plot monthly mean/min/max relative humidity with their trend lines.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name goes into the title and output name.
    monthly_rh : pandas.DataFrame
        Needs the columns 'month', 'mean', 'min', 'max' and `std_time_name`.
    monthly_rh_trend_avg / _min / _max : array-like
        Fitted trend values, plotted against `monthly_rh[std_time_name]`.
    monthly_rh_slope_*_per_year : float
        Slopes (in %/yr) shown in the legend and in the printed summary.
    monthly_rh_result_* : object
        Regression results exposing `rvalue`, `pvalue` and `stderr`
        (presumably `scipy.stats.linregress` results - confirm with caller).
    bad_weeks : iterable
        Periods shaded in gray as missing data; each item must support
        `to_timestamp()` and `+ 1` (e.g. `pandas.Period`). The name is kept
        for backward compatibility.
        NOTE(review): monthly periods are expected here - confirm that the
        caller passes the invalid months.
    window_start, window_end : str
        Window limits; used as text in the title and parsed for the x-limits.

    Side effects
    ------------
    Shows the figure, prints the regression summary and writes
    `<plots_name>/<basename>_month_rh.png`.
    """

    print('\nPlotting: Monthly average, minimum, and maximum relative humidity.\n')

    plt.figure(figsize=(12, 3))

    # Plot monthly average, min, and max: (column, label, colour, line width)
    for column, series_label, colour, width in (
        ('mean', 'Avg RH', 'black', 2),
        ('min', 'Min RH', 'blue', 0.5),
        ('max', 'Max RH', 'red', 0.5),
    ):
        plt.plot(monthly_rh['month'],
                 monthly_rh[column],
                 label=series_label,
                 color=colour,
                 linewidth=width
                )

    trends = (
        ('Avg', monthly_rh_trend_avg, monthly_rh_slope_avg_per_year, monthly_rh_result_avg, 'black'),
        ('Min', monthly_rh_trend_min, monthly_rh_slope_min_per_year, monthly_rh_result_min, 'blue'),
        ('Max', monthly_rh_trend_max, monthly_rh_slope_max_per_year, monthly_rh_result_max, 'red'),
    )

    # Plot monthly trend average, min, and max.
    # RH slopes are in %/yr; every label in this function previously said °C/yr.
    for tag, fitted, slope, result, colour in trends:
        plt.plot(monthly_rh[std_time_name],
                 fitted, '--',
                 label=f'Trend {tag} ({slope:.2f} %/yr)',
                 color=colour
                )

    plt.title('Monthly ' + std_rh_name + ' Summary ~ ' + window_start + ' - ' + window_end + ' ~ ' + os.path.basename(filename.replace('.csv', '')))
    plt.xlabel(plot_time_label)
    plt.ylabel(plot_rh_label)

    ax = plt.gca()

    window_start = pd.to_datetime(window_start)
    window_end = pd.to_datetime(window_end)
    plt.xlim(window_start, window_end)
    plt.gcf().autofmt_xdate()

    # Shade missing data regions.
    # Use the periods passed by the caller; the body previously ignored this
    # argument and read an undeclared global named `bad_months`.
    for period in bad_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.3)

    # Keep a handle on the figure before show() so it can still be saved afterwards
    fig1 = plt.gcf()

    # Hand-built legend: one solid entry and one dashed trend entry per statistic
    legend_elements = []
    for (tag, _, slope, _, colour), (name, width) in zip(
        trends, (('Average', 2), ('Minimum', 0.5), ('Maximum', 0.5))
    ):
        legend_elements.append(
            Line2D([0], [0], color=colour, linewidth=width, label=f'{name} RH'))
        legend_elements.append(
            Line2D([0], [0], color=colour, linestyle='dashed', label=f'Trend {tag} ({slope:.2f} %/yr)'))

    plt.legend(
        handles=legend_elements,
        loc='upper center',
        bbox_to_anchor=(0.5, -0.2),
        ncol=3,
        frameon=False
    )

    plt.subplots_adjust(bottom=0.35)

    plt.show()

    # The saved image is taller (12x6) than the one displayed (12x3)
    fig1.set_size_inches(12, 6)

    # Headings are padded to a common width so the columns line up
    for heading, (_, _, slope, result, _) in zip(('Average RH:', 'Min RH:', 'Max RH:'), trends):
        print(f"{heading:<11} slope={slope:.6f} %/yr, R²={result.rvalue**2:.4f}, p={result.pvalue:.4g}, stderr={result.stderr:.4f}")

    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'month_rh.png'))

In [None]:
# Histogram

## TEMPERATURE HISTOGRAM

def plot_hist_temp(filename, data, window_start, window_end):
    """Plot and save a histogram of temperature inside the given window.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name is used for the output file name.
    data : pandas.DataFrame
        Must contain the `std_temp_name` column and be sliceable with
        `data.loc[window_start : window_end]`.
    window_start, window_end
        Bounds of the label slice that selects the rows to histogram.

    Side effects
    ------------
    Writes `<plots_name>/<basename>_hist_temp.png` and shows the figure.
    """

    print('\nPlotting: Temperature Histogram.\n')

    # Keep the handle returned by plt.figure(): calling plt.gcf() after
    # plt.show() can return a brand-new empty figure (the inline backend
    # closes shown figures), which made the saved PNG blank.
    fig1 = plt.figure(figsize=(12,6))

    hist_temp_df = data.loc[window_start : window_end]

    # Bin edges are 1-unit wide and span the range of the whole column (not
    # only the window).
    min_val = int(np.floor(data[std_temp_name].min()))
    max_val = int(np.ceil(data[std_temp_name].max()))
    # Create integer bins
    bins = np.arange(min_val, max_val + 1)

    plt.hist(hist_temp_df[std_temp_name], density=False, bins=bins)

    plt.title('Temperature Histogram')
    plt.xlabel('Temperature')
    plt.ylabel('# occurences')

    plt.xticks(bins)
    plt.grid(True, which='major', axis='x')
    plt.grid(True, which='major', axis='y')

    # Save first, then display
    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hist_temp.png'))

    plt.show()

## RELATIVE HUMIDITY HISTOGRAM

def plot_hist_rh(filename, data, window_start, window_end):
    """Plot and save a histogram of relative humidity inside the given window.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name is used for the output file name.
    data : pandas.DataFrame
        Must contain the `std_rh_name` column and be sliceable with
        `data.loc[window_start : window_end]`.
    window_start, window_end
        Bounds of the label slice that selects the rows to histogram.

    Side effects
    ------------
    Writes `<plots_name>/<basename>_hist_rh.png` and shows the figure.
    """

    print('\nPlotting: Relative Humidity Histogram.\n')

    # Keep the handle returned by plt.figure(): calling plt.gcf() after
    # plt.show() can return a brand-new empty figure (the inline backend
    # closes shown figures), which made the saved PNG blank.
    fig1 = plt.figure(figsize=(12,6))

    hist_rh_df = data.loc[window_start : window_end]

    # Bin edges are 1-unit wide and span the range of the whole relative
    # humidity column (not only the window).
    min_val = int(np.floor(data[std_rh_name].min()))
    max_val = int(np.ceil(data[std_rh_name].max()))
    # Create integer bins
    bins = np.arange(min_val, max_val + 1)

    plt.hist(hist_rh_df[std_rh_name], density=False, bins=bins)

    plt.title('Relative Humidity Histogram')
    plt.xlabel('Relative Humidity')  # was 'Temperature' (copy-paste)
    plt.ylabel('# occurences')

    plt.xticks(bins)
    plt.grid(True, which='major', axis='x')
    plt.grid(True, which='major', axis='y')

    # Save first, then display
    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hist_rh.png'))

    plt.show()

In [None]:
# Histogram per year

## TEMPERATURE HISTOGRAM PER YEAR

def plot_temp_histogram_per_year(filename, dataframe, valid_years):
    """Overlay one temperature frequency curve per year and save the figure.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name is used for the output file name.
    dataframe : pandas.DataFrame
        Must contain the `std_temp_name` column and a 'year' column.
    valid_years : iterable
        Year values to draw; each is matched against `dataframe['year']`.

    Side effects
    ------------
    Writes `<plots_name>/<basename>_hist_perday_temp.png` and shows the
    figure. NOTE(review): the file-name suffix says 'perday' although the
    curves are per year; it is left unchanged so existing output paths stay
    valid.
    """

    print('\nPlotting: Temperature Histogram per Year.\n')

    # Keep the handle returned by plt.figure(): calling plt.gcf() after
    # plt.show() can return a brand-new empty figure (the inline backend
    # closes shown figures), which made the saved PNG blank.
    fig1 = plt.figure(figsize=(12, 6))

    # 1-unit-wide bin edges shared by all years so the curves are comparable
    min_val = int(np.floor(dataframe[std_temp_name].min()))
    max_val = int(np.ceil(dataframe[std_temp_name].max()))
    bins = np.arange(min_val, max_val + 1)

    for year in valid_years:
        year_df = dataframe[dataframe['year'] == year]
        hist, _ = np.histogram(year_df[std_temp_name], bins=bins)
        # Counts are drawn as a line at the left edge of each bin
        plt.plot(bins[:-1], hist, label=str(year), alpha=0.7)

    plt.title('Temperature Distribution by Year (Valid Years Only)')
    plt.xlabel('Temperature (°C)')
    plt.ylabel('Occurrences')
    plt.xticks(bins)
    plt.grid(True, which='major', axis='both')
    plt.legend(loc='upper right', frameon=False)
    plt.tight_layout()

    # Save first, then display
    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hist_perday_temp.png'))

    plt.show()


## RELATIVE HUMIDITY HISTOGRAM PER YEAR

def plot_rh_histogram_per_year(filename, dataframe, valid_years):
    """Overlay one relative-humidity frequency curve per year and save it.

    Parameters
    ----------
    filename : str
        Source .csv path; its base name is used for the output file name.
    dataframe : pandas.DataFrame
        Must contain the `std_rh_name` column and a 'year' column.
    valid_years : iterable
        Year values to draw; each is matched against `dataframe['year']`.

    Side effects
    ------------
    Writes `<plots_name>/<basename>_hist_perday_rh.png` and shows the
    figure. NOTE(review): the file-name suffix says 'perday' although the
    curves are per year; it is left unchanged so existing output paths stay
    valid.
    """

    print('\nPlotting: Relative Humidity Histogram per Year.\n')

    # Keep the handle returned by plt.figure(): calling plt.gcf() after
    # plt.show() can return a brand-new empty figure (the inline backend
    # closes shown figures), which made the saved PNG blank.
    fig1 = plt.figure(figsize=(12, 6))

    # 1-unit-wide bin edges shared by all years so the curves are comparable
    min_val = int(np.floor(dataframe[std_rh_name].min()))
    max_val = int(np.ceil(dataframe[std_rh_name].max()))
    bins = np.arange(min_val, max_val + 1)

    for year in valid_years:
        year_df = dataframe[dataframe['year'] == year]
        hist, _ = np.histogram(year_df[std_rh_name], bins=bins)
        # Counts are drawn as a line at the left edge of each bin
        plt.plot(bins[:-1], hist, label=str(year), alpha=0.7)

    # Title and x-label previously said "Temperature" (copy-paste)
    plt.title('Relative Humidity Distribution by Year (Valid Years Only)')
    plt.xlabel('Relative Humidity (%)')
    plt.ylabel('Occurrences')
    plt.xticks(bins)
    plt.grid(True, which='major', axis='both')
    plt.legend(loc='upper right', frameon=False)
    plt.tight_layout()

    # Save first, then display
    fig1.savefig(plots_name + '/' + os.path.basename(filename.replace('.csv', '_') + 'hist_perday_rh.png'))

    plt.show()

In [None]:
# Daily, Weekly, and Monthly Average Day and Night (combined figure)

def plot_avg_day_night_temp_all():
    """Draw daily, weekly and monthly day-vs-night mean temperature panels.

    Takes no arguments: it reads the notebook globals `dataframe`,
    `start_date_window`, `end_date_window`, `temp_name`, `night_range`,
    `bad_days`, `bad_weeks` and `bad_months`. `dataframe` is indexed with
    `.index.hour` / `.index.to_period`, so it needs a datetime-like index.

    NOTE(review): this reads `temp_name` (the raw CSV column label) while the
    other plotting helpers use `std_temp_name` - confirm which column name
    `dataframe` carries at this point.
    """
    frame = dataframe.copy().loc[start_date_window:end_date_window]
    frame['hour'] = frame.index.hour
    frame['date'] = frame.index.date

    # Blank out temperatures that fall in invalid months, weeks and days
    for freq, invalid in (('M', bad_months), ('W', bad_weeks), ('D', bad_days)):
        frame.loc[frame.index.to_period(freq).isin(invalid), temp_name] = np.nan

    # Hours listed in night_range count as night, all others as day
    at_night = frame['hour'].isin(night_range)
    frame['time_of_day'] = np.where(at_night, 'night', 'day')

    # Mean temperature per (bucket, time_of_day), one column per time_of_day
    daily = frame.groupby(['date', 'time_of_day'])[temp_name].mean().unstack()

    frame['week'] = frame.index.to_period('W').start_time
    weekly = frame.groupby(['week', 'time_of_day'])[temp_name].mean().unstack()

    frame['month'] = frame.index.to_period('M').start_time
    monthly = frame.groupby(['month', 'time_of_day'])[temp_name].mean().unstack()

    # One panel per aggregation level, sharing the temperature axis
    fig, axes = plt.subplots(3, 1, figsize=(14, 14), sharey=True)

    panels = (
        (daily, 'Daily Average Day vs Night Temperature', bad_days),
        (weekly, 'Weekly Average Day vs Night Temperature', bad_weeks),
        (monthly, 'Monthly Average Day vs Night Temperature', bad_months),
    )
    for axis, (table, heading, invalid) in zip(axes, panels):
        axis.plot(table.index, table['day'], label='Day Avg', color='orange')
        axis.plot(table.index, table['night'], label='Night Avg', color='blue')
        axis.set_title(heading)
        axis.legend()
        axis.grid(True)
        # Gray bands over the periods flagged as invalid
        for period in invalid:
            axis.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.2)

In [None]:
# Daily Average Day and Night

def plot_avg_day_night_temp_daily():
    """Draw the per-date mean day and mean night temperature on one figure.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_days`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Readings that fall on a day listed in `bad_days` are blanked
    before averaging, and each of those days is shaded gray on the plot.
    """
    frame = dataframe.copy()
    frame = frame.loc[start_date_window:end_date_window]

    # Blank the temperature on flagged days so they do not contribute to the means.
    on_bad_day = frame.index.to_period('D').isin(bad_days)
    frame.loc[on_bad_day, temp_name] = np.nan

    # Helper columns used as grouping keys.
    frame['date'] = frame.index.date
    frame['hour'] = frame.index.hour
    frame['time_of_day'] = np.where(frame['hour'].isin(night_range), 'night', 'day')

    # Rows: dates; columns: 'day' / 'night' mean temperature.
    daily = frame.groupby(['date', 'time_of_day'])[temp_name].mean().unstack()

    fig, ax = plt.subplots(figsize=(14, 6))
    line_specs = (('day', 'Day Avg', 'orange'), ('night', 'Night Avg', 'blue'))
    for column, legend_text, line_color in line_specs:
        ax.plot(daily.index, daily[column], label=legend_text, color=line_color)
    ax.set_title('Daily Average Day vs Night Temperature')
    ax.set_xlabel('Date')
    ax.set_ylabel('Temperature (°C)')
    ax.legend()

    # Shade every flagged day from its start to the start of the next day.
    for bad_period in bad_days:
        ax.axvspan(bad_period.to_timestamp(), (bad_period + 1).to_timestamp(),
                   color='gray', alpha=0.2)

    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Weekly Average Day and Night

def plot_avg_day_night_temp_weekly():
    """Draw the per-week mean day and mean night temperature on one figure.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_weeks`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Readings that fall in a week listed in `bad_weeks` are blanked
    before averaging, and each of those weeks is shaded gray on the plot.
    """
    frame = dataframe.copy()
    frame = frame.loc[start_date_window:end_date_window]

    # Weekly period of every reading; used both for grouping and for masking.
    week_of_reading = frame.index.to_period('W')
    frame['week'] = week_of_reading.start_time
    frame.loc[week_of_reading.isin(bad_weeks), temp_name] = np.nan

    frame['hour'] = frame.index.hour
    frame['time_of_day'] = np.where(frame['hour'].isin(night_range), 'night', 'day')

    # Rows: week start timestamps; columns: 'day' / 'night' mean temperature.
    weekly = frame.groupby(['week', 'time_of_day'])[temp_name].mean().unstack()

    fig, ax = plt.subplots(figsize=(14, 6))
    line_specs = (('day', 'Day Avg', 'orange'), ('night', 'Night Avg', 'blue'))
    for column, legend_text, line_color in line_specs:
        ax.plot(weekly.index, weekly[column], label=legend_text, color=line_color)
    ax.set_title('Weekly Average Day vs Night Temperature')
    ax.set_xlabel('Week')
    ax.set_ylabel('Temperature (°C)')
    ax.legend()

    # Shade every flagged week from its start to the start of the next week.
    for bad_period in bad_weeks:
        ax.axvspan(bad_period.to_timestamp(), (bad_period + 1).to_timestamp(),
                   color='gray', alpha=0.2)

    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Monthly Average Day and Night

def plot_avg_day_night_temp_monthly():
    """Draw the per-month mean day and mean night temperature on one figure.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_months`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Readings that fall in a month listed in `bad_months` are blanked
    before averaging, and each of those months is shaded gray on the plot.
    """
    frame = dataframe.copy()
    frame = frame.loc[start_date_window:end_date_window]

    # Monthly period of every reading; used both for grouping and for masking.
    month_of_reading = frame.index.to_period('M')
    frame['month'] = month_of_reading.start_time
    frame.loc[month_of_reading.isin(bad_months), temp_name] = np.nan

    frame['hour'] = frame.index.hour
    frame['time_of_day'] = np.where(frame['hour'].isin(night_range), 'night', 'day')

    # Rows: month start timestamps; columns: 'day' / 'night' mean temperature.
    monthly = frame.groupby(['month', 'time_of_day'])[temp_name].mean().unstack()

    fig, ax = plt.subplots(figsize=(14, 6))
    line_specs = (('day', 'Day Avg', 'orange'), ('night', 'Night Avg', 'blue'))
    for column, legend_text, line_color in line_specs:
        ax.plot(monthly.index, monthly[column], label=legend_text, color=line_color)
    ax.set_title('Monthly Average Day vs Night Temperature')
    ax.set_xlabel('Month')
    ax.set_ylabel('Temperature (°C)')
    ax.legend()

    # Shade every flagged month from its start to the start of the next month.
    for bad_period in bad_months:
        ax.axvspan(bad_period.to_timestamp(), (bad_period + 1).to_timestamp(),
                   color='gray', alpha=0.2)

    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Daily Day and Night Difference

def plot_daily_day_night_temp_difference():
    """Plot, per date, the mean day temperature minus the mean night temperature.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_days`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Days listed in `bad_days` are shaded gray, and their readings
    are blanked before averaging so that no difference is drawn for them.
    """
    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['hour'] = df.index.hour
    df['date'] = df.index.date

    # Blank readings on flagged days before averaging, the same way
    # plot_avg_day_night_temp_daily does. Previously the flagged days were
    # only shaded while their data was still averaged and plotted underneath.
    # Series.mask reads the existing column, so a wrong `temp_name` still
    # raises KeyError instead of silently creating an empty column.
    on_bad_day = df.index.to_period('D').isin(bad_days)
    df[temp_name] = df[temp_name].mask(on_bad_day)

    df['time_of_day'] = np.where(df['hour'].isin(night_range), 'night', 'day')

    # Rows: dates; columns: 'day' / 'night' mean temperature.
    daily = df.groupby(['date', 'time_of_day'])[temp_name].mean().unstack()
    daily_diff = daily['day'] - daily['night']

    plt.figure(figsize=(14,6))
    plt.plot(daily_diff.index, daily_diff, color='purple')
    plt.title('Daily Average Day-Night Temperature Difference')
    plt.xlabel('Date')
    plt.ylabel('Temperature Difference (°C)')
    plt.grid(True)

    # Shade every flagged day from its start to the start of the next day.
    ax = plt.gca()
    for period in bad_days:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.2)

    plt.tight_layout()
    plt.show()

In [None]:
# Weekly Day and Night Difference

def plot_weekly_day_night_temp_difference():
    """Plot, per week, the mean day temperature minus the mean night temperature.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_weeks`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Weeks listed in `bad_weeks` are shaded gray, and their readings
    are blanked before averaging so that no difference is drawn for them.
    """
    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['hour'] = df.index.hour

    week_of_reading = df.index.to_period('W')
    df['week'] = week_of_reading.start_time

    # Blank readings in flagged weeks before averaging, the same way
    # plot_avg_day_night_temp_weekly does. Previously the flagged weeks were
    # only shaded while their data was still averaged and plotted underneath.
    # Series.mask reads the existing column, so a wrong `temp_name` still
    # raises KeyError instead of silently creating an empty column.
    df[temp_name] = df[temp_name].mask(week_of_reading.isin(bad_weeks))

    df['time_of_day'] = np.where(df['hour'].isin(night_range), 'night', 'day')

    # Rows: week start timestamps; columns: 'day' / 'night' mean temperature.
    weekly = df.groupby(['week', 'time_of_day'])[temp_name].mean().unstack()
    weekly_diff = weekly['day'] - weekly['night']

    plt.figure(figsize=(14,6))
    plt.plot(weekly_diff.index, weekly_diff, color='purple')
    plt.title('Weekly Average Day-Night Temperature Difference')
    plt.xlabel('Week')
    plt.ylabel('Temperature Difference (°C)')
    plt.grid(True)

    # Shade every flagged week from its start to the start of the next week.
    ax = plt.gca()
    for period in bad_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.2)

    plt.tight_layout()
    plt.show()

In [None]:
# Monthly Day and Night Difference

def plot_monthly_day_night_temp_difference():
    """Plot, per month, the mean day temperature minus the mean night temperature.

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window`, `temp_name`, `night_range` and `bad_months`.
    A reading counts as 'night' when its hour is in `night_range`, otherwise
    as 'day'. Months listed in `bad_months` are shaded gray, and their
    readings are blanked before averaging so that no difference is drawn for
    them.
    """
    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['hour'] = df.index.hour

    month_of_reading = df.index.to_period('M')
    df['month'] = month_of_reading.start_time

    # Blank readings in flagged months before averaging, the same way
    # plot_avg_day_night_temp_monthly does. Previously the flagged months were
    # only shaded while their data was still averaged and plotted underneath.
    # Series.mask reads the existing column, so a wrong `temp_name` still
    # raises KeyError instead of silently creating an empty column.
    df[temp_name] = df[temp_name].mask(month_of_reading.isin(bad_months))

    df['time_of_day'] = np.where(df['hour'].isin(night_range), 'night', 'day')

    # Rows: month start timestamps; columns: 'day' / 'night' mean temperature.
    monthly = df.groupby(['month', 'time_of_day'])[temp_name].mean().unstack()
    monthly_diff = monthly['day'] - monthly['night']

    plt.figure(figsize=(14,6))
    plt.plot(monthly_diff.index, monthly_diff, color='purple')
    plt.title('Monthly Average Day-Night Temperature Difference')
    plt.xlabel('Month')
    plt.ylabel('Temperature Difference (°C)')
    plt.grid(True)

    # Shade every flagged month from its start to the start of the next month.
    ax = plt.gca()
    for period in bad_months:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        ax.axvspan(start, end, color='gray', alpha=0.2)

    plt.tight_layout()
    plt.show()

In [None]:
# Hourly Deviation from Mean

def plot_temp_deviation_all():
    """Plot daily, weekly and monthly temperature deviation in three stacked panels.

    For each grouping (date, week, month) the panel shows two lines:
    'Max - Avg' (group maximum minus group mean) and 'Avg - Min' (group mean
    minus group minimum).

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window` and `temp_name`, plus the optional globals `bad_days`,
    `bad_weeks` and `bad_months`. Readings inside any flagged period are
    blanked before the statistics are computed; each panel shades the flagged
    periods of its own granularity.
    """
    # Resolve the optional flag lists once. The original guarded only the
    # shading with `'bad_...' in globals()` after already using the names
    # unconditionally for masking, so the guard could never take effect.
    flagged = {
        'D': globals().get('bad_days', []),
        'W': globals().get('bad_weeks', []),
        'M': globals().get('bad_months', []),
    }

    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]

    df['date'] = df.index.date
    df['week'] = df.index.to_period('W').start_time
    df['month'] = df.index.to_period('M').start_time

    # NOTE(review): all three flag lists are applied to the same data, so e.g.
    # the daily panel also loses every day of a flagged month. The standalone
    # plot_temp_deviation_daily/weekly/monthly functions mask only their own
    # granularity - confirm which behavior is intended.
    for freq, periods in flagged.items():
        df.loc[df.index.to_period(freq).isin(periods), temp_name] = np.nan

    # (grouping column, panel title, periods to shade) for each panel, top to bottom.
    panel_specs = (
        ('date', 'Daily Temp Deviation (Max - Avg / Avg - Min)', flagged['D']),
        ('week', 'Weekly Temp Deviation', flagged['W']),
        ('month', 'Monthly Temp Deviation', flagged['M']),
    )

    # Compute every panel's statistics before any figure is created.
    panel_stats = []
    for group_column, _, _ in panel_specs:
        stats = df.groupby(group_column)[temp_name].agg(['mean', 'max', 'min'])
        stats['diff_max'] = stats['max'] - stats['mean']
        stats['diff_min'] = stats['mean'] - stats['min']
        panel_stats.append(stats)

    # ---- PLOTTING ----
    fig, axes = plt.subplots(3, 1, figsize=(14, 14), sharey=False)

    for ax, stats, (_, panel_title, periods) in zip(axes, panel_stats, panel_specs):
        ax.plot(stats.index, stats['diff_max'], label='Max - Avg', color='red')
        ax.plot(stats.index, stats['diff_min'], label='Avg - Min', color='blue')
        ax.set_title(panel_title)
        ax.legend()
        ax.grid(True)
        # Shade each flagged period from its start to the start of the next one.
        for period in periods:
            start = period.to_timestamp()
            end = (period + 1).to_timestamp()
            ax.axvspan(start, end, color='gray', alpha=0.15)

    for ax in axes:
        ax.set_ylabel('Temperature Deviation (°C)')
        ax.set_xlabel('Date')

    plt.tight_layout()
    plt.show()

In [None]:
# Daily Deviation from Mean

def plot_temp_deviation_daily():
    """Plot, per date, how far the daily max and min temperature lie from the daily mean.

    Two lines are drawn: 'Max - Avg' (daily maximum minus daily mean) and
    'Avg - Min' (daily mean minus daily minimum).

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window` and `temp_name`, plus the optional global `bad_days`.
    Readings on flagged days are blanked before the statistics are computed
    and the flagged days are shaded gray.
    """
    # Resolve the optional flag list once. The original guarded only the
    # shading with `'bad_days' in globals()` after already using `bad_days`
    # unconditionally for masking, so the guard could never take effect.
    flagged_days = globals().get('bad_days', [])

    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['date'] = df.index.date
    df.loc[df.index.to_period('D').isin(flagged_days), temp_name] = np.nan

    daily_stats = df.groupby('date')[temp_name].agg(['mean', 'max', 'min'])
    daily_stats['diff_max'] = daily_stats['max'] - daily_stats['mean']
    daily_stats['diff_min'] = daily_stats['mean'] - daily_stats['min']

    plt.figure(figsize=(14, 5))
    plt.plot(daily_stats.index, daily_stats['diff_max'], label='Max - Avg', color='red')
    plt.plot(daily_stats.index, daily_stats['diff_min'], label='Avg - Min', color='blue')
    plt.title('Daily Temp Deviation (Max - Avg / Avg - Min)')
    plt.xlabel('Date')
    plt.ylabel('Temp Deviation (°C)')
    plt.legend()
    plt.grid(True)

    # Shade every flagged day from its start to the start of the next day.
    for period in flagged_days:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        plt.axvspan(start, end, color='gray', alpha=0.15)

    plt.tight_layout()
    plt.show()

In [None]:
# Weekly Deviation from Mean

def plot_temp_deviation_weekly():
    """Plot, per week, how far the weekly max and min temperature lie from the weekly mean.

    Two lines are drawn: 'Max - Avg' (weekly maximum minus weekly mean) and
    'Avg - Min' (weekly mean minus weekly minimum).

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window` and `temp_name`, plus the optional global `bad_weeks`.
    Readings in flagged weeks are blanked before the statistics are computed
    and the flagged weeks are shaded gray.
    """
    # Resolve the optional flag list once. The original guarded only the
    # shading with `'bad_weeks' in globals()` after already using `bad_weeks`
    # unconditionally for masking, so the guard could never take effect.
    flagged_weeks = globals().get('bad_weeks', [])

    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['week'] = df.index.to_period('W').start_time
    df.loc[df.index.to_period('W').isin(flagged_weeks), temp_name] = np.nan

    weekly_stats = df.groupby('week')[temp_name].agg(['mean', 'max', 'min'])
    weekly_stats['diff_max'] = weekly_stats['max'] - weekly_stats['mean']
    weekly_stats['diff_min'] = weekly_stats['mean'] - weekly_stats['min']

    plt.figure(figsize=(14, 5))
    plt.plot(weekly_stats.index, weekly_stats['diff_max'], label='Max - Avg', color='red')
    plt.plot(weekly_stats.index, weekly_stats['diff_min'], label='Avg - Min', color='blue')
    plt.title('Weekly Temp Deviation (Max - Avg / Avg - Min)')
    plt.xlabel('Date')
    plt.ylabel('Temp Deviation (°C)')
    plt.legend()
    plt.grid(True)

    # Shade every flagged week from its start to the start of the next week.
    for period in flagged_weeks:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        plt.axvspan(start, end, color='gray', alpha=0.15)

    plt.tight_layout()
    plt.show()

In [None]:
# Monthly Deviation from Mean

def plot_temp_deviation_monthly():
    """Plot, per month, how far the monthly max and min temperature lie from the monthly mean.

    Two lines are drawn: 'Max - Avg' (monthly maximum minus monthly mean) and
    'Avg - Min' (monthly mean minus monthly minimum).

    Reads the notebook globals `dataframe`, `start_date_window`,
    `end_date_window` and `temp_name`, plus the optional global `bad_months`.
    Readings in flagged months are blanked before the statistics are computed
    and the flagged months are shaded gray.
    """
    # Resolve the optional flag list once. The original guarded only the
    # shading with `'bad_months' in globals()` after already using
    # `bad_months` unconditionally for masking, so the guard could never take
    # effect.
    flagged_months = globals().get('bad_months', [])

    df = dataframe.copy()
    df = df.loc[start_date_window:end_date_window]
    df['month'] = df.index.to_period('M').start_time
    df.loc[df.index.to_period('M').isin(flagged_months), temp_name] = np.nan

    monthly_stats = df.groupby('month')[temp_name].agg(['mean', 'max', 'min'])
    monthly_stats['diff_max'] = monthly_stats['max'] - monthly_stats['mean']
    monthly_stats['diff_min'] = monthly_stats['mean'] - monthly_stats['min']

    plt.figure(figsize=(14, 5))
    plt.plot(monthly_stats.index, monthly_stats['diff_max'], label='Max - Avg', color='red')
    plt.plot(monthly_stats.index, monthly_stats['diff_min'], label='Avg - Min', color='blue')
    plt.title('Monthly Temp Deviation (Max - Avg / Avg - Min)')
    plt.xlabel('Date')
    plt.ylabel('Temp Deviation (°C)')
    plt.legend()
    plt.grid(True)

    # Shade every flagged month from its start to the start of the next month.
    for period in flagged_months:
        start = period.to_timestamp()
        end = (period + 1).to_timestamp()
        plt.axvspan(start, end, color='gray', alpha=0.15)

    plt.tight_layout()
    plt.show()

In [None]:
# Walks through the input folder (sub-folders included) and handles each file in turn:
#   1. file_initialize  -> cleaned data plus daily/weekly/monthly statistics and trends
#   2. write_csv        -> writes the processed file
#   3. asks the user for a time window and for the kind of plots to draw
#   4. draws the requested plots
# All results of file_initialize are kept as notebook-level names because the plotting
# functions defined in earlier cells read several of them (e.g. `dataframe`, `bad_days`).

directory = folder_path
counter = 0
completed = []
# NOTE(review): `skipped` is only filled by the per-file error handling, which is
# currently disabled (see the note at the end of the loop body), so it stays empty.
skipped = []

# Iterate over files in directory
for path, folders, files in os.walk(directory):

    for filename in files:
        counter += 1
        print(counter, '.\n')
        # Build the path from the folder os.walk is currently visiting. The original
        # always prefixed the top-level `directory`, which produced a wrong path for
        # every file inside a sub-folder. For top-level files the result is unchanged.
        filename = path + '/' + filename

        # File initialize function
        (
            data, dataframe,
            daily_temp,
            daily_temp_trend_avg, daily_temp_trend_min, daily_temp_trend_max,
            daily_temp_slope_avg_per_year, daily_temp_result_avg,
            daily_temp_slope_min_per_year, daily_temp_result_min,
            daily_temp_slope_max_per_year, daily_temp_result_max,
            weekly_temp,
            weekly_temp_trend_avg, weekly_temp_trend_min, weekly_temp_trend_max,
            weekly_temp_slope_avg_per_year, weekly_temp_result_avg,
            weekly_temp_slope_min_per_year, weekly_temp_result_min,
            weekly_temp_slope_max_per_year, weekly_temp_result_max,
            monthly_temp,
            monthly_temp_trend_avg, monthly_temp_trend_min, monthly_temp_trend_max,
            monthly_temp_slope_avg_per_year, monthly_temp_result_avg,
            monthly_temp_slope_min_per_year, monthly_temp_result_min,
            monthly_temp_slope_max_per_year, monthly_temp_result_max,
            daily_rh,
            daily_rh_trend_avg, daily_rh_trend_min, daily_rh_trend_max,
            daily_rh_slope_avg_per_year, daily_rh_result_avg,
            daily_rh_slope_min_per_year, daily_rh_result_min,
            daily_rh_slope_max_per_year, daily_rh_result_max,
            weekly_rh,
            weekly_rh_trend_avg, weekly_rh_trend_min, weekly_rh_trend_max,
            weekly_rh_slope_avg_per_year, weekly_rh_result_avg,
            weekly_rh_slope_min_per_year, weekly_rh_result_min,
            weekly_rh_slope_max_per_year, weekly_rh_result_max,
            monthly_rh,
            monthly_rh_trend_avg, monthly_rh_trend_min, monthly_rh_trend_max,
            monthly_rh_slope_avg_per_year, monthly_rh_result_avg,
            monthly_rh_slope_min_per_year, monthly_rh_result_min,
            monthly_rh_slope_max_per_year, monthly_rh_result_max,
            bad_days, bad_weeks, bad_months,
            valid_years,
        ) = file_initialize(filename, time_name, rh_name, td_name)
        print('Initialization complete\n- - -\n')

        # File writing function (complete file)
        write_csv(os.path.basename(filename), data)
        print('Writing complete\n- - -\n')

        # Disabled in the original: selecting windows function
        #     find_valid_windows(data)

        # Asking the user for inputs of the start and end of the window
        window_start = input('\na) What is your start time? --> ')
        if window_start == 'all':
            # 'all' selects the full record: take the time stamps stored at the
            # smallest and largest row label of `data`.
            first_row_label = data[std_time_name].index.min()
            window_start = data.loc[first_row_label, std_time_name]
            print('\nstart:', window_start)
            last_row_label = data[std_time_name].index.max()
            window_end = data.loc[last_row_label, std_time_name]
            print('end:', window_end, '\n')

        else:
            window_end = input('b) What is your end time? -->   ')
            print('\n')

        # Asking the user what type of plots they would like to have.
        # (The original text listed only weekly and monthly although all four are offered.)
        print('You can make hourly, daily, weekly, and monthly plots of your data. Input \'hourly\', \'daily\', \'weekly\', \'monthly\', or \'all\' into the following pop-up.')

        plot_type = input('\na) What type of plot would you like? --> ')

        # Every plot is described once as (function, positional arguments); the
        # requested plot type then just selects which of these jobs to run.
        window = (window_start, window_end)

        hourly_jobs = [
            (plot_hourly_temp, (filename, data, *window)),
            (plot_hourly_rh, (filename, data, *window)),
        ]

        daily_jobs = [
            (plot_daily_temp, (filename,
                               daily_temp,
                               daily_temp_trend_avg, daily_temp_trend_min, daily_temp_trend_max,
                               daily_temp_slope_avg_per_year, daily_temp_result_avg,
                               daily_temp_slope_min_per_year, daily_temp_result_min,
                               daily_temp_slope_max_per_year, daily_temp_result_max,
                               bad_days,
                               *window)),
            (plot_daily_rh, (filename,
                             daily_rh,
                             daily_rh_trend_avg, daily_rh_trend_min, daily_rh_trend_max,
                             daily_rh_slope_avg_per_year, daily_rh_result_avg,
                             daily_rh_slope_min_per_year, daily_rh_result_min,
                             daily_rh_slope_max_per_year, daily_rh_result_max,
                             bad_days,
                             *window)),
        ]

        weekly_jobs = [
            (plot_weekly_temp, (filename,
                                weekly_temp,
                                weekly_temp_trend_avg, weekly_temp_trend_min, weekly_temp_trend_max,
                                weekly_temp_slope_avg_per_year, weekly_temp_result_avg,
                                weekly_temp_slope_min_per_year, weekly_temp_result_min,
                                weekly_temp_slope_max_per_year, weekly_temp_result_max,
                                bad_weeks,
                                *window)),
            (plot_weekly_rh, (filename,
                              weekly_rh,
                              weekly_rh_trend_avg, weekly_rh_trend_min, weekly_rh_trend_max,
                              weekly_rh_slope_avg_per_year, weekly_rh_result_avg,
                              weekly_rh_slope_min_per_year, weekly_rh_result_min,
                              weekly_rh_slope_max_per_year, weekly_rh_result_max,
                              bad_weeks,
                              *window)),
        ]

        # The original passed `bad_weeks` to the monthly plots; the daily and weekly
        # plots each receive the flag list of their own granularity, so the monthly
        # plots now receive `bad_months`.
        # NOTE(review): plot_monthly_temp / plot_monthly_rh are defined outside this
        # cell - confirm their parameter expects the monthly flag list.
        monthly_jobs = [
            (plot_monthly_temp, (filename,
                                 monthly_temp,
                                 monthly_temp_trend_avg, monthly_temp_trend_min, monthly_temp_trend_max,
                                 monthly_temp_slope_avg_per_year, monthly_temp_result_avg,
                                 monthly_temp_slope_min_per_year, monthly_temp_result_min,
                                 monthly_temp_slope_max_per_year, monthly_temp_result_max,
                                 bad_months,
                                 *window)),
            (plot_monthly_rh, (filename,
                               monthly_rh,
                               monthly_rh_trend_avg, monthly_rh_trend_min, monthly_rh_trend_max,
                               monthly_rh_slope_avg_per_year, monthly_rh_result_avg,
                               monthly_rh_slope_min_per_year, monthly_rh_result_min,
                               monthly_rh_slope_max_per_year, monthly_rh_result_max,
                               bad_months,
                               *window)),
        ]

        histogram_jobs = [
            # Histogram
            (plot_hist_temp, (filename, data, *window)),
            (plot_hist_rh, (filename, data, *window)),
            # Histogram per year
            (plot_temp_histogram_per_year, (filename, dataframe, valid_years)),
            (plot_rh_histogram_per_year, (filename, dataframe, valid_years)),
        ]

        # TODO: the day/night average, day/night difference and deviation-from-mean
        # plots were listed as empty placeholders under 'all' in the original and are
        # still not called from here.
        jobs_by_plot_type = {
            'hourly': hourly_jobs,
            'daily': daily_jobs,
            'weekly': weekly_jobs,
            'monthly': monthly_jobs,
            'all': hourly_jobs + daily_jobs + weekly_jobs + monthly_jobs + histogram_jobs,
        }

        if plot_type not in jobs_by_plot_type:
            # Unknown input falls back to exactly the same set as 'all'. (The original
            # fallback announced 'all' but left out the histograms.)
            print('You may have mistyped the input, so we just picked \'all\' for you.')
            plot_type = 'all'

        for plot_function, plot_arguments in jobs_by_plot_type[plot_type]:
            plot_function(*plot_arguments)

        print('Plotting complete\n')

        completed.append(os.path.basename(filename))

        # NOTE(review): the original wrapped the loop body in a per-file
        # `try: ... except:` that printed 'An error occured. File skipped.' and
        # appended the file to `skipped`. It was disabled (turned into string
        # literals) and is left disabled here, so an error in any file stops the run.

print('---------------------------\n\nPROCESSING INFORMATION\n')
print('Files processed:\n', completed, '\n')
print('Files skipped due to error:\n', skipped)