In [None]:
# Welcome to the Lahmas Lab Lascar Data Processor!
# Please answer the questions below and run your code directly in a Jupyter notebook.
# Every value in this cell is read by the cells further down, so run this cell first.

# Where is the folder you would like to process?
folder_path = r'lascar_data' # Use 'copy path' within Jupyter Lab

# What are the labels for each variable in the .csv files?
# NOTE: If a label contains a special character, you may have to open the file in 'Editor'
# and copy and paste the symbol so that the label matches the file exactly.
time_name = 'Time' # Label of time variable
temp_name = 'T' # Label of temperature variable
rh_name = 'RH' # Label of relative humidity variable
td_name = 'Td' # Label of dew point temperature variable

# What is the date format of the time column in the input files? (e.g. %d/%m/%Y %H:%M:%S)
date_format = '%Y-%m-%d %H:%M:%S'

# What will the name of your new files folder be? (a '1_' prefix is added later)
new_files_name = 'lascar_files'

# What will the name of your new plots folder be? (a '2_' prefix is added later)
new_plots_name = 'lascar_plots'

In [None]:
# Importing all necessary modules

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
from datetime import datetime
import io

In [None]:
# Global Variable Editor - DO NOT CHANGE UNLESS YOU ARE SURE
# This block holds all standardized global variables and formatting styles for the output files.

# Standard date format that every timestamp is converted to
std_date_format = '%Y/%m/%d %H:%M:%S'

# Standardized column labels used in the processed files.
# The std_* labels replace the user-supplied labels from the first cell;
# the *_cor_name labels are the columns added for the corrected RH and dew point values.
# NOTE(review): td_cor_name uses '°C' while the other labels use 'deg C' - confirm
# whether this difference is intended before changing it, since it is written to the output.
std_time_name = 'Datetime (YYYY/MM/DD HH:mm:ss)'
std_temp_name = 'Temperature (deg C)'
std_rh_name = 'RH (%)'
std_td_name = 'Dew Point (deg C)'
rh_cor_name = 'RH Corrected (%)'
td_cor_name = 'Dew Point Corrected (°C)'

In [None]:
# Creating the new folder where all the processed files will end up.

# Add the '1_' ordering prefix only once. Without this guard, re-running the
# cell would turn '1_name' into '1_1_name' and silently create a second folder.
# (A name that the user already started with '1_' is therefore left as it is.)
if not new_files_name.startswith('1_'):
    new_files_name = '1_' + new_files_name

if os.path.exists(new_files_name):
    # Nothing is created or deleted here; existing files with the same name
    # will be overwritten when the processing cell writes its output.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

else:
    os.makedirs(new_files_name)
    print('\nYour new folder is:\n"', new_files_name, '"\n\nRun next block\n')

In [None]:
# Creating the new folder where all the plots will end up.

# Add the '2_' ordering prefix only once. Without this guard, re-running the
# cell would turn '2_name' into '2_2_name' and silently create a second folder.
# (A name that the user already started with '2_' is therefore left as it is.)
if not new_plots_name.startswith('2_'):
    new_plots_name = '2_' + new_plots_name

if os.path.exists(new_plots_name):
    # Nothing is created or deleted here; existing plots with the same name
    # will be overwritten when the plotting functions save their figures.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

else:
    os.makedirs(new_plots_name)
    print('\nYour new folder is:\n"', new_plots_name, '"\n\nRun next block\n')

In [None]:
# File initialization function

def file_initialize(file_path, time_name, rh_name, td_name, skip_rows=5):
    """Load one raw logger file and return it as a cleaned DataFrame.

    The input file is only read; it is never modified on disk.

    Besides its arguments, this function reads these notebook-level variables:
    ``date_format``, ``std_date_format``, ``temp_name``, ``rh_cor_name`` and
    ``td_cor_name``.

    Parameters
    ----------
    file_path : str
        Path of the .csv file to load.
    time_name, rh_name, td_name : str
        Labels of the time, relative humidity and dew point columns in the file.
    skip_rows : int, default 5
        Number of leading data rows to drop, as they may have been taken
        (and tainted) during installation of the logger.

    Returns
    -------
    pandas.DataFrame
        The file content with:
        * a 'Time_fixed' column holding the timestamps as strings in
          ``std_date_format``, and the ``time_name`` column set to the same values;
        * a ``rh_cor_name`` column where RH values above 100 are replaced by 100;
        * a ``td_cor_name`` column where, on those same rows, the dew point is
          replaced by the temperature.
    """

    ### INITIALIZATION

    # Prints selected file name
    print("Initializing file:", os.path.basename(file_path), '\n')

    # Try UTF-8 first (with or without BOM), then fall back to the Windows
    # "ANSI" code page. The raw file is deliberately NOT rewritten: re-encoding
    # it in place made a second run decode UTF-8 bytes as ANSI and corrupt any
    # special characters in the labels. 'latin-1' can decode any byte sequence,
    # so the loop always ends with a loaded dataframe; errors that are not
    # about encoding (missing file, malformed csv) are raised as usual.
    for encoding in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            dataframe = pd.read_csv(file_path, encoding=encoding)
            break
        except UnicodeDecodeError:
            continue

    # Skips first values as they may have been taken & tainted during installation
    dataframe = dataframe.iloc[skip_rows:].reset_index(drop=True)

    # Parses the time strings and stores them in the standard date format
    timestamps = pd.to_datetime(dataframe[time_name], format=date_format)
    dataframe['Time_fixed'] = timestamps.dt.strftime(std_date_format)

    # Updating the time column to standard datetime format
    dataframe[time_name] = dataframe['Time_fixed']


    ## RH and Td correction

    # Rows with RH above 100% are treated as saturated: RH becomes 100 and the
    # dew point becomes the temperature. Every other row (including rows with a
    # missing RH value) keeps its original RH and dew point.
    saturated = dataframe[rh_name] > 100
    dataframe[rh_cor_name] = np.where(saturated, 100, dataframe[rh_name])
    dataframe[td_cor_name] = np.where(saturated, dataframe[temp_name], dataframe[td_name])

    return dataframe

In [None]:
# Function writing a new .csv in your new folder

def write_csv(basename, dataframe):
    """Write ``dataframe`` to the processed-files folder as 'PROCESSED_<name>'.

    Reads the notebook-level variables ``new_files_name`` (output folder) and
    the user/standard label pairs (``time_name``/``std_time_name`` etc.).

    Parameters
    ----------
    basename : str
        Name (or path) of the source file; only its final component is used.
        A '.txt' extension is replaced by '.csv'.
    dataframe : pandas.DataFrame
        Data to write. NOTE: it is modified in place - the user labels are
        renamed to the standard labels and the index is reset. The cells that
        run after this function look columns up by their standard labels, so
        this side effect must be kept.

    Returns
    -------
    None
    """

    # Creating new file path and name. Only the extension is changed; the old
    # text replacement also altered any 'txt' inside the file or folder name.
    stem, extension = os.path.splitext(os.path.basename(basename))
    if extension.lower() == '.txt':
        extension = '.csv'
    new_file_path = os.path.join(new_files_name, 'PROCESSED_' + stem + extension)

    # Prints selected file name
    print('Writing new file:', os.path.basename(new_file_path), '\n')

    # Renaming labels to standard (in place on purpose, see docstring)
    dataframe.rename(columns={time_name: std_time_name,
                              temp_name: std_temp_name,
                              rh_name: std_rh_name,
                              td_name: std_td_name}, inplace=True)

    # Removing custom index
    dataframe.reset_index(drop=True, inplace=True)

    # Writing the new dataframe to your computer
    dataframe.to_csv(new_file_path, index=False, encoding='utf-8-sig')

In [None]:
# Window selector function

def find_valid_windows(dataframe, column_subset=None, min_window_size=1):
    """Find the runs of consecutive rows that contain no missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to scan, in row order.
    column_subset : str or list of str, optional
        Column(s) that must be non-null for a row to count as valid.
        When omitted (or empty), every column must be non-null.
    min_window_size : int, default 1
        Runs with fewer rows than this are left out.

    Returns
    -------
    list of (int, int)
        One ``(start, end)`` pair per run, as integer row positions with both
        ends included, in the order the runs appear.
    """

    print('Calculating Data Window(s)\n')

    # Select the columns to check. A single label is wrapped in a list so that
    # the row-wise .all(axis=1) below always operates on a DataFrame.
    if column_subset is None or len(column_subset) == 0:
        checked = dataframe
    elif isinstance(column_subset, str):
        checked = dataframe[[column_subset]]
    else:
        checked = dataframe[list(column_subset)]

    valid = checked.notnull().all(axis=1).to_numpy()
    if valid.size == 0:
        return []

    # Positions where the valid flag flips mark the first row of a new run
    run_starts = np.flatnonzero(np.diff(valid.astype(np.int8))) + 1
    starts = np.concatenate(([0], run_starts))
    ends = np.concatenate((run_starts, [valid.size])) - 1

    windows = []
    for start_idx, end_idx in zip(starts, ends):
        # Keep only the runs of valid rows that are long enough
        if valid[start_idx] and (end_idx - start_idx + 1) >= min_window_size:
            windows.append((int(start_idx), int(end_idx)))

    return windows

In [None]:
# Window framer

def _monthly_summary(frame, column, prefix, valid_months):
    """Return the monthly mean/min/max of ``column``, limited to ``valid_months``.

    ``frame`` must have a datetime index. Values are first summarised per day;
    the monthly mean is the mean of the daily means, the monthly min/max are
    the min/max of the daily min/max. Result columns are ``std_time_name``,
    '<prefix>_avg', '<prefix>_min', '<prefix>_max' and 'MonthPeriod'.
    """
    avg_col, min_col, max_col = prefix + '_avg', prefix + '_min', prefix + '_max'

    # Resample by day and calculate daily mean, min, max
    daily = frame[column].resample('1D').agg(['mean', 'min', 'max']).dropna().reset_index()
    daily.columns = [std_time_name, avg_col, min_col, max_col]
    daily[std_time_name] = pd.to_datetime(daily[std_time_name])

    # Collapse the daily values to one row per month (stamped at month end)
    monthly = daily.resample('ME', on=std_time_name).agg(
        {avg_col: 'mean', min_col: 'min', max_col: 'max'}).dropna().reset_index()
    monthly['MonthPeriod'] = monthly[std_time_name].dt.to_period('M')

    # filter rows to keep valid data
    return monthly[monthly['MonthPeriod'].isin(valid_months)]


def window_framer(data, window_start, window_end, time_name, rh_name, td_name):
    """Build the monthly temperature and RH summaries for one time window.

    Reads the notebook-level variables ``std_date_format``, ``std_time_name``,
    ``std_temp_name`` and ``std_rh_name``; ``data`` must therefore already carry
    the standard labels and a 'Time_fixed' column.

    Parameters
    ----------
    data : pandas.DataFrame
        Processed data of one file.
    window_start, window_end : str
        First and last timestamp of the window, written in ``std_date_format``.
        Both are looked up as row labels, so each must match a timestamp
        present in the data.
    time_name, rh_name, td_name : str
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    (monthly_summary_T, monthly_summary_rh, bad_months)
        The two monthly summaries (valid months only) and a DataFrame with a
        'Missing_Ratio' column, indexed by month, for the months rejected.
    """
    # Index the rows by their timestamp string so the window can be cut by label
    dataframe = data.set_index('Time_fixed', drop=True)
    dataframe.index = pd.to_datetime(dataframe.index).strftime(std_date_format)

    # Look the window bounds up once, then slice (both ends included)
    start_label = dataframe.loc[window_start, std_time_name]
    end_label = dataframe.loc[window_end, std_time_name]
    dataframe = dataframe.loc[start_label:end_label]

    # Full range, start-finish. One reading per hour is what is expected here.
    full_time_index = pd.date_range(start=start_label, end=end_label, freq='h')
    full_df = pd.DataFrame(index=full_time_index)
    full_df['month'] = full_df.index.to_period('M')

    # expected number of data points per month
    expected_counts = full_df.groupby('month').size().sort_index()

    dataframe.index = pd.to_datetime(dataframe.index)
    # actual number of data points per month
    actual_counts = dataframe.index.to_period('M').value_counts().sort_index()

    # ratio of missing data
    missing_ratio = 1 - (actual_counts / expected_counts)
    missing_ratio = missing_ratio.to_frame(name='Missing_Ratio')

    # adjust what you accept as enough data for a month, here I put 20 days (20 days * 24h)
    min_valid_points_per_month = 24 * 20

    # filter data to exclude: months with 50%+ missing with less than 20 days of data and nans
    filtered_missing_ratio = missing_ratio[
        (missing_ratio['Missing_Ratio'] <= 0.5) &
        (~missing_ratio['Missing_Ratio'].isna()) &
        (actual_counts >= min_valid_points_per_month)
        ]

    # final valid and bad months
    valid_months = filtered_missing_ratio.index
    bad_months = missing_ratio[~missing_ratio.index.isin(valid_months)]

    # Order the rows by time ('Time_fixed' is the name of the index here)
    dataframe = dataframe.sort_values('Time_fixed')

    monthly_summary_T = _monthly_summary(dataframe, std_temp_name, 'T', valid_months)

    # NOTE(review): the RH summary uses the uncorrected RH column. An earlier
    # summary built from the corrected column was computed and then overwritten,
    # so it never reached the output; it has been removed. Confirm which column
    # is the intended one.
    monthly_summary_rh = _monthly_summary(dataframe, std_rh_name, 'RH', valid_months)

    return monthly_summary_T, monthly_summary_rh, bad_months

In [None]:
# Creating plots from the processed data

def _plot_monthly_summary(filename, summary, bad_months, prefix, short_label,
                          title, ylabel, file_suffix):
    """Draw, show and save one monthly average/min/max figure.

    Reads the notebook-level variables ``std_time_name`` and ``new_plots_name``.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension starts the
        name of the saved image.
    summary : pandas.DataFrame
        Monthly summary with a ``std_time_name`` column and the columns
        '<prefix>_avg', '<prefix>_min' and '<prefix>_max'.
    bad_months : pandas.DataFrame
        Frame indexed by monthly periods; each one is shaded in gray.
    prefix, short_label, title, ylabel, file_suffix : str
        Column prefix, legend wording, figure title, y-axis label and the
        ending of the image file name.
    """
    fig, ax = plt.subplots(figsize=(12, 3))

    # Thick black line for the average, thin dash-dotted lines for the extremes
    line_styles = (
        ('avg', 'Average ' + short_label, dict(color='black', linewidth=2.5)),
        ('min', 'Min ' + short_label, dict(color='blue', linestyle='dashdot', linewidth=0.5)),
        ('max', 'Max ' + short_label, dict(color='red', linestyle='dashdot', linewidth=0.5)),
    )
    for stat, label, style in line_styles:
        ax.plot(summary[std_time_name], summary[prefix + '_' + stat], label=label, **style)

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel(ylabel)

    # One tick per 1 January. The 2006-2024 span is always covered, and the
    # span grows when the data or the shaded months fall outside of it, so
    # recordings made after 2024 still get ticks.
    observed_years = [int(year) for year in pd.to_datetime(summary[std_time_name]).dt.year]
    observed_years += [int(period.year) for period in bad_months.index]
    first_year = min([2006] + observed_years)
    last_year = max([2024] + [year + 1 for year in observed_years])
    ax.set_xticks([datetime(year, 1, 1) for year in range(first_year, last_year + 1)])

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    # Shade missing data regions
    for period in bad_months.index:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # The figure is shown at 12x3 inches but saved at 12x6 inches
    fig.set_size_inches(12, 6)

    # Build the image name from the file name without its extension, so that
    # inputs other than '.csv' (e.g. '.txt') also get a clean name
    stem = os.path.splitext(os.path.basename(filename))[0]
    fig.savefig(os.path.join(new_plots_name, stem + '_' + file_suffix))

    # Release the figure; many files may be plotted in one run
    plt.close(fig)


# Monthly Temperature Plot
def plot_monthly_temp(filename, monthly_summary_T, bad_months):
    """Plot and save the monthly average, minimum and maximum temperature.

    The image is saved in the plots folder as '<file name>_month_temp.png'.
    ``monthly_summary_T`` needs the columns 'T_avg', 'T_min' and 'T_max';
    the months in ``bad_months`` are shaded in gray.
    """
    print('Plotting: Monthly average, minimum, and maximum temperatures.\n')

    _plot_monthly_summary(filename, monthly_summary_T, bad_months,
                          prefix='T', short_label='Temp',
                          title='Monthly Temperature Summary:',
                          ylabel='Temperature',
                          file_suffix='month_temp.png')


# Monthly Relative Humidity Plot
def plot_monthly_rh(filename, monthly_summary_rh, bad_months):
    """Plot and save the monthly average, minimum and maximum relative humidity.

    The image is saved in the plots folder as '<file name>_month_rh.png'.
    ``monthly_summary_rh`` needs the columns 'RH_avg', 'RH_min' and 'RH_max';
    the months in ``bad_months`` are shaded in gray.
    """
    print('Plotting: Monthly average, minimum, and maximum relative humidity.\n')

    _plot_monthly_summary(filename, monthly_summary_rh, bad_months,
                          prefix='RH', short_label='RH',
                          title='Monthly Relative Humidity Summary:',
                          ylabel='Relative Humidity',
                          file_suffix='month_rh.png')

In [None]:
# Walks through the folder and goes through each file one at a time.
# For every file: initialize -> write processed .csv -> ask for a time window -> plot.
# A file that raises an error is reported and skipped so the remaining files still run.

directory = folder_path
counter = 0
completed = []
skipped = []

# Iterate over files in directory (sub-folders included)
for path, folders, files in os.walk(directory):

    for filename in files:
        counter = counter + 1
        print(counter, '.\n')

        # Build the path from the folder currently being walked, so that files
        # inside sub-folders are found as well
        filename = os.path.join(path, filename)

        try:
            # File initialize function
            data = file_initialize(filename, time_name, rh_name, td_name)
            print('Initialization complete\n- - -\n')

            # File writing function (complete file).
            # This also renames the columns of `data` to the standard labels.
            write_csv(os.path.basename(filename), data)
            print('Writing complete\n- - -\n')

            # Selecting windows function
            windows = find_valid_windows(data)
            print('The following time windows contain data, please choose a start and end date for visualization (must be on the hour).\n',
                  'You can input \'all\' to visualize everything or eg. \'2003/10/17 21:00:00\'.\n')

            for ticker, (first_row, last_row) in enumerate(windows, start=1):
                print('Window', ticker, ': ', data.loc[first_row, std_time_name], 'until', data.loc[last_row, std_time_name], '\t')

            window_start = input('a) What is your start time? --> ')
            if window_start == 'all':
                # Use the timestamps of the first and last row of the file
                window_start = data.loc[data.index.min(), std_time_name]
                print('start:', window_start)
                window_end = data.loc[data.index.max(), std_time_name]
                print('end:', window_end, '\n')

            else:
                window_end = input('b) What is your end time? -->   ')
                print('\n')

            # Window framing
            monthly_summary_T, monthly_summary_rh, bad_months = window_framer(data, window_start, window_end, time_name, rh_name, td_name)

            # Plotting function
            plot_monthly_temp(filename, monthly_summary_T, bad_months)
            plot_monthly_rh(filename, monthly_summary_rh, bad_months)
            print('Plotting complete\n')

            completed.append(os.path.basename(filename))

        except Exception as error:
            # Show what went wrong instead of hiding it, then move on
            print('An error occurred. File skipped.\n')
            print('Reason:', repr(error), '\n')
            skipped.append(os.path.basename(filename))

print('---------------------------\n\nPROCESSING INFORMATION\n')
print('Files processed:\n', completed, '\n')
print('Files skipped due to error:\n', skipped)