In [None]:
# Welcome to the Lahmas Lab Lascar Data Processor!
# Please answer the questions below and run your code directly in a Jupyter notebook.
# Every value in this cell is read by the cells below, so run this cell first.

# Where is the folder you would like to process?
folder_path = r'' # Use 'copy path' within Jupyter Lab

# What are the labels for each variable in the .csv files?
# NOTE: If a label has a special character in it, you may have to open the file in 'Editor' to copy and paste the symbol.
time_name = '' # Label of time variable
temp_name = '' # Label of temperature variable
rh_name = '' # Label of relative humidity variable
td_name = '' # Label of dew point temperature variable

# What is the date format? (eg. %Y-%m-%d %H:%M:%S)
# This string is passed to pd.to_datetime(..., format=...) when the time column is parsed.
date_format = ''

# What will the name be of your new files folder?
# The folder-creation cell adds the prefix '1_' to this name.
new_files_name = ''

# What will the name be of your new plots folder?
# The folder-creation cell adds the prefix '2_' to this name.
new_plots_name = ''

In [None]:
# Importing all necessary modules

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
from datetime import datetime
import io

In [None]:
# Global Variable Editor - DO NOT CHANGE UNLESS YOU ARE SURE
# This block holds all standardized global variables and formatting styles for the output files.
# The functions defined in the cells below read these names as module-level globals.

# Standard Date Format
# Timestamps are re-written in this format after being parsed with the user's `date_format`.
std_date_format = '%Y-%m-%d %H:%M:%S'

# Dataframe Label Names
# write_csv() renames the user's column labels to the four std_* names below.
std_time_name = 'Datetime (YYYY-MM-DD HH:mm:ss)'
std_temp_name = 'Temperature (deg C)'
std_rh_name = 'RH (%)'
std_td_name = 'Dew Point (deg C)'
# Names of the two extra columns that file_initialize() adds.
rh_cor_name = 'RH Corrected (%)'
td_cor_name = 'Dew Point Corrected (deg C)'

# Plot labels
# x-axis label, then the y-axis labels of the temperature and RH plots.
plot_time_label = 'Time (YYYY/MM)'
plot_temp_label = std_temp_name
plot_rh_label = std_rh_name

# Time Index Name
# Name of the helper column holding the standardized timestamps; later cells use it as the index.
time_index = 'Time Index'

In [None]:
# Creating the new folder where all the processed files will end up.

# Add the '1_' prefix only once. Without this guard, re-running the cell would
# turn '1_name' into '1_1_name' and create a second, unintended folder.
# (A name that already starts with '1_' is therefore used as it is.)
if not new_files_name.startswith('1_'):
    new_files_name = '1_' + new_files_name

if os.path.exists(new_files_name):
    # Nothing is created or deleted here; the user decides whether to reuse the folder.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

else:
    os.makedirs(new_files_name)
    print('\nYour new folder is:\n"', new_files_name, '"\n\nRun next block\n')

In [None]:
# Creating the new folder where all the plots will end up.

# Add the '2_' prefix only once. Without this guard, re-running the cell would
# turn '2_name' into '2_2_name' and create a second, unintended folder.
# (A name that already starts with '2_' is therefore used as it is.)
if not new_plots_name.startswith('2_'):
    new_plots_name = '2_' + new_plots_name

if os.path.exists(new_plots_name):
    # Nothing is created or deleted here; the user decides whether to reuse the folder.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

else:
    os.makedirs(new_plots_name)
    print('\nYour new folder is:\n"', new_plots_name, '"\n\nRun next block\n')

In [None]:
# File initialization function

def file_initialize(file_path, time_name, rh_name, td_name):
    """Load one logger file and prepare it for processing.

    Steps:
      1. Read the file as UTF-8. Only if that fails to decode, read it with the
         'ANSI' encoding, rewrite the file on disk as UTF-8 and read it again.
      2. Drop the first 5 rows (possibly tainted during installation).
      3. Parse the time column with the global `date_format` and store it, as
         text in `std_date_format`, both in the time column and in a new
         `time_index` column.
      4. Add the corrected RH and dew point columns (`rh_cor_name`,
         `td_cor_name`): where RH is above 100 the corrected RH is 100 and the
         corrected dew point equals the temperature; elsewhere the original
         values are copied.

    Parameters
    ----------
    file_path : str
        Path of the file to read. The file is rewritten in place only when it
        had to be converted from ANSI to UTF-8.
    time_name, rh_name, td_name : str
        Labels of the time, relative humidity and dew point columns.

    Returns
    -------
    pandas.DataFrame
        The prepared data with a fresh 0..n-1 index.

    Notes
    -----
    Reads the module-level names `temp_name`, `date_format`, `std_date_format`,
    `time_index`, `rh_cor_name` and `td_cor_name`.

    Raises
    ------
    LookupError
        If the file is not valid UTF-8 and this Python installation does not
        know the 'ANSI' encoding name.
    """

    ### INITIALIZATION

    # Prints selected file name
    print("Initializing file:", os.path.basename(file_path), '\n')

    # Try UTF-8 first. Decoding an already-UTF-8 file as ANSI does not fail, it
    # silently garbles every non-ASCII character, and the garbled text would then
    # be written back over the source file (again on every re-run).
    try:
        dataframe = pd.read_csv(file_path, encoding='utf-8')

    except UnicodeDecodeError:
        # Not valid UTF-8: convert the ANSI file to UTF-8 on disk, then load it.
        ansi_frame = pd.read_csv(file_path, encoding='ANSI')
        ansi_frame.to_csv(file_path, encoding='utf-8', index=False)
        dataframe = pd.read_csv(file_path)

    # Skips first values as they may have been taken & tainted during installation
    dataframe = dataframe.iloc[5:].reset_index(drop=True)

    # Parse the time strings once, then store them as text in the standard format
    timestamps = pd.to_datetime(dataframe[time_name], format=date_format)
    dataframe[time_index] = timestamps.dt.strftime(std_date_format)

    # Updating the time column to standard datetime format
    dataframe[time_name] = dataframe[time_index]


    ## RH and Td correction

    # Rows where RH reads above 100 %: the air must be saturated.
    # (A missing RH value compares as False, so it is copied unchanged.)
    saturated = dataframe[rh_name] > 100

    # RH is capped at 100 and Td is set equal to T on saturated rows;
    # every other row keeps its original RH and Td.
    dataframe[rh_cor_name] = np.where(saturated, 100, dataframe[rh_name])
    dataframe[td_cor_name] = np.where(saturated, dataframe[temp_name], dataframe[td_name])

    return dataframe

In [None]:
# Function writing a new .csv in your new folder

def write_csv(basename, dataframe):
    """Write `dataframe` to the processed-files folder with standard column names.

    The output file is `<new_files_name>/PROCESSED_<name>`, where `<name>` is the
    file name part of `basename`; a `.txt` extension is replaced by `.csv`.

    Parameters
    ----------
    basename : str
        Name (or path) of the source file; only its file name part is used.
    dataframe : pandas.DataFrame
        Data to write. It is modified IN PLACE: the user's column labels are
        renamed to the standard ones and the index is reset. The main loop
        relies on this renaming when it later reads the standard column names.

    Notes
    -----
    Reads the module-level names `new_files_name`, `time_name`, `temp_name`,
    `rh_name`, `td_name` and the `std_*_name` constants.
    """

    # Creating new file path and name.
    # Only the extension is changed: a plain str.replace('txt', 'csv') would also
    # rewrite "txt" inside the folder name or in the middle of the file name.
    stem, extension = os.path.splitext(os.path.basename(basename))
    if extension.lower() == '.txt':
        extension = '.csv'
    new_file_path = os.path.join(new_files_name, 'PROCESSED_' + stem + extension)

    # Prints selected file name
    print('Writing new file:', os.path.basename(new_file_path), '\n')

    # Renaming labels to standard (in place, see docstring)
    dataframe.rename(columns={time_name: std_time_name,
                              temp_name: std_temp_name,
                              rh_name: std_rh_name,
                              td_name: std_td_name}, inplace=True)

    # Removing custom index
    dataframe.reset_index(drop=True, inplace=True)

    # Writing the new dataframe to your computer ('utf-8-sig' adds a byte-order mark)
    dataframe.to_csv(new_file_path, index=False, encoding='utf-8-sig')

In [None]:
# Window selector function

def find_valid_windows(dataframe, column_subset=None, min_window_size=1):
    """Print every run of consecutive hourly readings found in `dataframe`.

    A new window starts at each row whose timestamp is not exactly one hour
    after the previous row's timestamp (the first row always starts one).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data holding the `time_index` column; it is not modified.
    column_subset, min_window_size
        Accepted but not used by this function.

    Returns
    -------
    None
        The windows are only printed.
    """

    print('Calculating Data Window(s)\n')

    print('The following time windows contain data, please choose a start and end date for visualization (must be on the hour).\n',
          "You can input 'all' to visualize everything or eg. '2003/10/17 21:00:00'.\n")

    # Index the rows by timestamp; the text round-trip through the standard
    # format is kept so the index is built exactly as before.
    by_time = dataframe.set_index(time_index, drop=True)
    by_time.index = pd.DatetimeIndex(pd.to_datetime(by_time.index).strftime(std_date_format))

    # True wherever the step from the previous row is not exactly one hour
    starts_new_window = by_time.index.to_series().diff() != pd.Timedelta(hours=1)

    # Running count of window starts = window id of every row
    window_ids = starts_new_window.cumsum()

    # (first timestamp, last timestamp) of every window, as a plain list
    windows = by_time.groupby(window_ids).apply(
        lambda chunk: (chunk.index[0], chunk.index[-1])
    ).tolist()

    # Number and print the windows, ignoring those without a valid start time
    shown = 0
    for first, last in windows:
        if not pd.isnull(first):
            shown += 1
            print('Window', shown, ': ', first, 'until', last, '\t')

    return

In [None]:
# Window framer

def _coverage_by_period(observed_index, expected_index, freq, min_points):
    """Split the calendar periods of a window into well covered and poorly covered.

    Parameters
    ----------
    observed_index : pandas.DatetimeIndex
        Timestamps that are actually present in the data.
    expected_index : pandas.DatetimeIndex
        Every hourly timestamp the window should contain.
    freq : str
        Period frequency passed to `to_period` ('M' or 'W').
    min_points : int
        Minimum number of observed timestamps a period needs to be valid.

    Returns
    -------
    (pandas.PeriodIndex, pandas.DataFrame)
        The valid periods, and a frame (column 'Missing_Ratio') holding every
        other period.
    """
    expected = expected_index.to_period(freq).value_counts()
    actual = observed_index.to_period(freq).value_counts()

    # Put both counts on the same, sorted set of periods. A period without any
    # observation counts as 0 points instead of silently becoming NaN.
    periods = expected.index.union(actual.index)
    expected = expected.reindex(periods)
    actual = actual.reindex(periods, fill_value=0)

    missing_ratio = (1 - actual / expected).to_frame(name='Missing_Ratio')

    # Valid = at most 50 % missing AND at least `min_points` observations
    keep = ((missing_ratio['Missing_Ratio'] <= 0.5) & (actual >= min_points)).to_numpy()

    return missing_ratio.index[keep], missing_ratio[~keep]


def _summarise_variable(frame, column, prefix, valid_months, valid_weeks):
    """Build the monthly and weekly avg/min/max tables of one column.

    Daily mean/min/max are computed first; the monthly and weekly tables are
    the mean of the daily means, the minimum of the daily minima and the
    maximum of the daily maxima. Only periods listed in `valid_months` /
    `valid_weeks` are kept. Column names are `<prefix>_avg`, `<prefix>_min`
    and `<prefix>_max`.
    """
    avg, low, high = prefix + '_avg', prefix + '_min', prefix + '_max'
    rollup = {avg: 'mean', low: 'min', high: 'max'}

    # Resample by day and calculate daily mean, min and max
    daily = frame[column].resample('D').agg(['mean', 'min', 'max']).dropna().reset_index()
    daily.columns = [std_time_name, avg, low, high]
    daily[std_time_name] = pd.to_datetime(daily[std_time_name])

    monthly = daily.resample('ME', on=std_time_name).agg(rollup).dropna().reset_index()
    monthly['MonthPeriod'] = monthly[std_time_name].dt.to_period('M')
    monthly = monthly[monthly['MonthPeriod'].isin(valid_months)]

    weekly = daily.resample('W', on=std_time_name).agg(rollup).dropna().reset_index()
    weekly['WeekPeriod'] = weekly[std_time_name].dt.to_period('W')
    weekly = weekly[weekly['WeekPeriod'].isin(valid_weeks)]

    return monthly, weekly


def window_framer(data, window_start, window_end):
    """Summarise temperature and RH per month and per week inside a time window.

    Parameters
    ----------
    data : pandas.DataFrame
        Processed data holding the `time_index`, `std_temp_name` and
        `std_rh_name` columns; it is not modified.
    window_start, window_end : str or datetime-like
        Inclusive bounds of the window. Any format `pd.to_datetime` can parse
        is accepted (e.g. '2003-10-17 21:00:00' or '2003/10/17 21:00:00'), and
        the bounds no longer have to match a timestamp present in the data.

    Returns
    -------
    tuple
        (monthly_summary_T, monthly_summary_rh, weekly_summary_T,
        weekly_summary_rh, bad_months, bad_weeks). The summaries only contain
        periods with at most 50 % of their hourly values missing and at least
        20 days (months) or 4 days (weeks) of hourly points. `bad_months` and
        `bad_weeks` are frames indexed by the rejected periods.

    Raises
    ------
    ValueError
        If a bound is empty / not a time, or the start is after the end.

    Notes
    -----
    Reads the module-level names `time_index`, `std_time_name`,
    `std_temp_name` and `std_rh_name`.
    NOTE(review): the RH summaries use the uncorrected `std_rh_name` column,
    not `rh_cor_name` - confirm this is intended.
    """
    # Parse the bounds as timestamps instead of using them as exact text labels;
    # the label lookup raised KeyError for any other spelling of the same time.
    start = pd.to_datetime(window_start)
    end = pd.to_datetime(window_end)
    if pd.isna(start) or pd.isna(end):
        raise ValueError('window_start and window_end must both be valid times.')
    if start > end:
        raise ValueError('window_start must not be later than window_end.')

    # Index the rows by timestamp and keep those inside the window (inclusive)
    frame = data.set_index(time_index, drop=True)
    frame.index = pd.to_datetime(frame.index)
    frame = frame[(frame.index >= start) & (frame.index <= end)]

    # Every hourly timestamp the window would hold if no value was missing
    expected_hours = pd.date_range(start=start, end=end, freq='h')

    min_valid_points_per_month = 24 * 20  # 20 days
    min_valid_points_per_week = 96  # 4 days

    valid_months, bad_months = _coverage_by_period(
        frame.index, expected_hours, 'M', min_valid_points_per_month)
    valid_weeks, bad_weeks = _coverage_by_period(
        frame.index, expected_hours, 'W', min_valid_points_per_week)

    monthly_summary_T, weekly_summary_T = _summarise_variable(
        frame, std_temp_name, 'T', valid_months, valid_weeks)
    monthly_summary_rh, weekly_summary_rh = _summarise_variable(
        frame, std_rh_name, 'RH', valid_months, valid_weeks)

    return monthly_summary_T, monthly_summary_rh, weekly_summary_T, weekly_summary_rh, bad_months, bad_weeks

In [None]:
# Weekly plotting functions

# Weekly Temperature Plot
def plot_weekly_temp(filename, weekly_summary_T, bad_weeks):
    """Plot weekly average, minimum and maximum temperature and save it as PNG.

    Parameters
    ----------
    filename : str
        Path or name of the source file; its name without extension is used in
        the title and in the output file name.
    weekly_summary_T : pandas.DataFrame
        Table with the `std_time_name`, 'T_avg', 'T_min' and 'T_max' columns.
    bad_weeks : pandas.DataFrame
        Frame indexed by the weekly periods to shade in gray.

    Notes
    -----
    The figure is shown at 12 x 3 inches and saved at 12 x 6 inches as
    `<new_plots_name>/<name>_week_temp.png`. Reads the module-level names
    `window_start`, `window_end`, `new_plots_name`, `std_time_name`,
    `std_temp_name`, `plot_time_label` and `plot_temp_label`.
    """

    print('Plotting: Weekly average, minimum, and maximum temperatures.\n')

    # File name without folder or extension, whatever the extension is
    # (replacing '.csv' only left '.txt' inputs with names like 'x.txtweek_temp.png').
    stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot weekly average, min, and max
    times = weekly_summary_T[std_time_name]
    ax.plot(times, weekly_summary_T['T_avg'], label='Average Temp',
            color='black', linewidth=2.5)
    ax.plot(times, weekly_summary_T['T_min'], label='Min Temp',
            color='blue', linestyle='dashdot', linewidth=0.5)
    ax.plot(times, weekly_summary_T['T_max'], label='Max Temp',
            color='red', linestyle='dashdot', linewidth=0.5)

    # This is the weekly plot, so the title says 'Weekly' (it used to say 'Monthly')
    ax.set_title('Weekly ' + std_temp_name + ' Summary ~ ' + str(window_start)
                 + ' - ' + str(window_end) + ' ~ ' + stem)
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    fig.autofmt_xdate()

    # Shade missing data regions (from the start of the week to the start of the next)
    for period in bad_weeks.index:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # The saved image is taller than the one shown in the notebook
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(new_plots_name, stem + '_week_temp.png'))

    # Release the figure; otherwise every processed file leaves figures in memory
    plt.close(fig)

# Weekly Relative Humidity Plot
def plot_weekly_rh(filename, weekly_summary_rh, bad_weeks):
    """Plot weekly average, minimum and maximum relative humidity and save it as PNG.

    Parameters
    ----------
    filename : str
        Path or name of the source file; its name without extension is used in
        the title and in the output file name.
    weekly_summary_rh : pandas.DataFrame
        Table with the `std_time_name`, 'RH_avg', 'RH_min' and 'RH_max' columns.
    bad_weeks : pandas.DataFrame
        Frame indexed by the weekly periods to shade in gray.

    Notes
    -----
    The figure is shown at 12 x 3 inches and saved at 12 x 6 inches as
    `<new_plots_name>/<name>_week_rh.png`. Reads the module-level names
    `window_start`, `window_end`, `new_plots_name`, `std_time_name`,
    `std_rh_name`, `plot_time_label` and `plot_rh_label`.
    """

    print('Plotting: Weekly average, minimum, and maximum relative humidity.\n')

    # File name without folder or extension, whatever the extension is
    # (replacing '.csv' only left '.txt' inputs with names like 'x.txtweek_rh.png').
    stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot weekly average, min, and max RH
    times = weekly_summary_rh[std_time_name]
    ax.plot(times, weekly_summary_rh['RH_avg'], label='Average RH',
            color='black', linewidth=2.5)
    ax.plot(times, weekly_summary_rh['RH_min'], label='Min RH',
            color='blue', linestyle='dashdot', linewidth=0.5)
    ax.plot(times, weekly_summary_rh['RH_max'], label='Max RH',
            color='red', linestyle='dashdot', linewidth=0.5)

    # This is the weekly plot, so the title says 'Weekly' (it used to say 'Monthly')
    ax.set_title('Weekly ' + std_rh_name + ' Summary ~ ' + str(window_start)
                 + ' - ' + str(window_end) + ' ~ ' + stem)
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    fig.autofmt_xdate()

    # Shade missing data regions (from the start of the week to the start of the next)
    for period in bad_weeks.index:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # The saved image is taller than the one shown in the notebook
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(new_plots_name, stem + '_week_rh.png'))

    # Release the figure; otherwise every processed file leaves figures in memory
    plt.close(fig)

In [None]:
# Monthly plotting functions

# Monthly Temperature Plot
def plot_monthly_temp(filename, monthly_summary_T, bad_months):
    """Plot monthly average, minimum and maximum temperature and save it as PNG.

    Parameters
    ----------
    filename : str
        Path or name of the source file; its name without extension is used in
        the title and in the output file name.
    monthly_summary_T : pandas.DataFrame
        Table with the `std_time_name`, 'T_avg', 'T_min' and 'T_max' columns.
    bad_months : pandas.DataFrame
        Frame indexed by the monthly periods to shade in gray.

    Notes
    -----
    The figure is shown at 12 x 3 inches and saved at 12 x 6 inches as
    `<new_plots_name>/<name>_month_temp.png`. Reads the module-level names
    `window_start`, `window_end`, `new_plots_name`, `std_time_name`,
    `std_temp_name`, `plot_time_label` and `plot_temp_label`.
    """

    print('Plotting: Monthly average, minimum, and maximum temperatures.\n')

    # File name without folder or extension, whatever the extension is
    # (replacing '.csv' only left '.txt' inputs with names like 'x.txtmonth_temp.png').
    stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot monthly average, min, and max
    times = monthly_summary_T[std_time_name]
    ax.plot(times, monthly_summary_T['T_avg'], label='Average Temp',
            color='black', linewidth=2.5)
    ax.plot(times, monthly_summary_T['T_min'], label='Min Temp',
            color='blue', linestyle='dashdot', linewidth=0.5)
    ax.plot(times, monthly_summary_T['T_max'], label='Max Temp',
            color='red', linestyle='dashdot', linewidth=0.5)

    ax.set_title('Monthly ' + std_temp_name + ' Summary ~ ' + str(window_start)
                 + ' - ' + str(window_end) + ' ~ ' + stem)
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    fig.autofmt_xdate()

    # Shade missing data regions (from the start of the month to the start of the next)
    for period in bad_months.index:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # The saved image is taller than the one shown in the notebook
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(new_plots_name, stem + '_month_temp.png'))

    # Release the figure; otherwise every processed file leaves figures in memory
    plt.close(fig)

# Monthly Relative Humidity Plot
def plot_monthly_rh(filename, monthly_summary_rh, bad_months):
    """Plot monthly average, minimum and maximum relative humidity and save it as PNG.

    Parameters
    ----------
    filename : str
        Path or name of the source file; its name without extension is used in
        the title and in the output file name.
    monthly_summary_rh : pandas.DataFrame
        Table with the `std_time_name`, 'RH_avg', 'RH_min' and 'RH_max' columns.
    bad_months : pandas.DataFrame
        Frame indexed by the monthly periods to shade in gray.

    Notes
    -----
    The figure is shown at 12 x 3 inches and saved at 12 x 6 inches as
    `<new_plots_name>/<name>_month_rh.png`. Reads the module-level names
    `window_start`, `window_end`, `new_plots_name`, `std_time_name`,
    `std_rh_name`, `plot_time_label` and `plot_rh_label`.
    """

    print('Plotting: Monthly average, minimum, and maximum relative humidity.\n')

    # File name without folder or extension, whatever the extension is
    # (replacing '.csv' only left '.txt' inputs with names like 'x.txtmonth_rh.png').
    stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot monthly average, min, and max RH
    times = monthly_summary_rh[std_time_name]
    ax.plot(times, monthly_summary_rh['RH_avg'], label='Average RH',
            color='black', linewidth=2.5)
    ax.plot(times, monthly_summary_rh['RH_min'], label='Min RH',
            color='blue', linestyle='dashdot', linewidth=0.5)
    ax.plot(times, monthly_summary_rh['RH_max'], label='Max RH',
            color='red', linestyle='dashdot', linewidth=0.5)

    ax.set_title('Monthly ' + std_rh_name + ' Summary ~ ' + str(window_start)
                 + ' - ' + str(window_end) + ' ~ ' + stem)
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    fig.autofmt_xdate()

    # Shade missing data regions (from the start of the month to the start of the next)
    for period in bad_months.index:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # The saved image is taller than the one shown in the notebook
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(new_plots_name, stem + '_month_rh.png'))

    # Release the figure; otherwise every processed file leaves figures in memory
    plt.close(fig)

In [None]:
# Walks through the folder (sub-folders included) and goes through each file one at a time.
# For every file: initialize -> write the processed .csv -> list the data windows ->
# ask for a window -> build the summaries -> ask for the plot type -> plot.
# A file that raises an error is reported, added to `skipped`, and the run continues.
# NOTE: window_start and window_end stay module-level names on purpose,
# because the plotting functions read them for the plot titles.

directory = folder_path
counter = 0
completed = []
skipped = []

# The output folders must never be read as input, even when they sit inside `directory`.
output_folders = {os.path.abspath(new_files_name), os.path.abspath(new_plots_name)}

# Iterate over files in directory
for path, folders, files in os.walk(directory):

    # Editing `folders` in place tells os.walk not to descend into the output folders.
    folders[:] = [name for name in folders
                  if os.path.abspath(os.path.join(path, name)) not in output_folders]

    for filename in files:
        counter = counter + 1
        print(counter, '.\n')

        # Build the path from the folder currently being walked (`path`), not from
        # the top folder: files inside sub-folders could not be found otherwise.
        filename = os.path.join(path, filename)

        try:
            # File initialize function
            data = file_initialize(filename, time_name, rh_name, td_name)
            print('Initialization complete\n- - -\n')

            # File writing function (complete file).
            # This also renames the columns of `data` to the standard labels.
            write_csv(os.path.basename(filename), data)
            print('Writing complete\n- - -\n')

            # Selecting windows function
            find_valid_windows(data)

            # Asking the user for inputs of the start and end of the window
            window_start = input('\na) What is your start time? --> ')
            if window_start == 'all':
                # Use the time of the first and of the last row
                window_start = data.loc[data[std_time_name].index.min(), std_time_name]
                print('\nstart:', window_start)
                window_end = data.loc[data[std_time_name].index.max(), std_time_name]
                print('end:', window_end, '\n')

            else:
                window_end = input('b) What is your end time? -->   ')
                print('\n')

            # Window framing - Initializing weekly and monthly summary dataframes based on the window frame
            monthly_summary_T, monthly_summary_rh, weekly_summary_T, weekly_summary_rh, bad_months, bad_weeks\
            = window_framer(data, window_start, window_end)

            # Asking the user what type of plots they would like to have.
            print('You can make weekly, and monthly plots of your data. Input \'weekly\', \'monthly\', or \'all\' into the following pop-up.')

            plot_type = input('\na) What type of plot would you like? --> ')

            # Anything that is not a known choice is treated as 'all'
            if plot_type not in ('weekly', 'monthly', 'all'):
                print('You may have mistyped the input, so we just picked \'all\' for you.')
                plot_type = 'all'

            if plot_type in ('weekly', 'all'):
                # Weekly Plotting
                plot_weekly_temp(filename, weekly_summary_T, bad_weeks)
                plot_weekly_rh(filename, weekly_summary_rh, bad_weeks)

            if plot_type in ('monthly', 'all'):
                # Monthly Plotting
                plot_monthly_temp(filename, monthly_summary_T, bad_months)
                plot_monthly_rh(filename, monthly_summary_rh, bad_months)

            print('Plotting complete\n')

            completed.append(os.path.basename(filename))

        except Exception as error:
            # `Exception` (not a bare except) so that interrupting the kernel still
            # stops the run. The reason is printed so a skipped file is never silent.
            print('An error occurred. File skipped.\n')
            print(type(error).__name__ + ':', error, '\n')
            skipped.append(os.path.basename(filename))

print('---------------------------\n\nPROCESSING INFORMATION\n')
print('Files processed:\n', completed, '\n')
print('Files skipped due to error:\n', skipped)