In [None]:
# Welcome to the Lahmas Lab Lascar Data Processor!
# Please answer the questions below and run your code directly in a jupyter notebook.
# Run this cell first: every value set here is read by the cells below.

# Where is the folder you would like to process?
# (The last cell walks this folder and processes every file it finds.)
folder_path = r'lascar_data' # Use 'copy path' within Jupyter Lab

# What are the labels for each variable in the .csv files?
# NOTE: If a label contains a special character, you may have to open the file in 'Editor' and copy and paste the symbol.
time_name = 'Time' # Label of time variable
temp_name = 'T' # Label of temperature variable
rh_name = 'RH' # Label of relative humidity variable
td_name = 'Td' # Label of dew point temperature variable

# What is the date format of the time column in the input files? (eg. %Y-%m-%d %H:%M:%S)
# This is handed to pd.to_datetime(..., format=...) when each file is initialized.
date_format = '%Y-%m-%d %H:%M:%S'

# What will the name be of your new files folder? (It is created with a '1_' prefix.)
new_files_name = 'lascar_files'

# What will the name be of your new plots folder? (It is created with a '2_' prefix.)
new_plots_name = 'lascar_plots'

In [None]:
# Importing all necessary modules

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
from datetime import datetime
import io

In [None]:
# Global Variable Editor - DO NOT CHANGE UNLESS YOU ARE SURE
# This block holds all standardized global variables and formatting styles for the output files.
# The functions defined in later cells read these names as module-level globals.

# Standard Date Format (every time column is rewritten as text in this format)
std_date_format = '%Y-%m-%d %H:%M:%S'

# Dataframe Label Names (the user's input labels are renamed to these)
std_time_name = 'Datetime (YYYY-MM-DD HH:mm:ss)'
std_temp_name = 'Temperature (deg C)'
std_rh_name = 'RH (%)'
std_td_name = 'Dew Point (deg C)'
# Columns added by file_initialize: RH capped at 100 % and the matching dew point
rh_cor_name = 'RH Corrected (%)'
td_cor_name = 'Dew Point Corrected (deg C)'

# Plot labels
plot_time_label = 'Time (YYYY/MM)'
plot_temp_label = std_temp_name
plot_rh_label = std_rh_name

# Time Index Name: helper datetime column added by file_initialize and used as
# the index by file_initialize and find_valid_windows
time_index = 'Time Index'

# The thresholds below are compared against the number of hourly rows that are
# present in each period after file_initialize reindexes a file onto an hourly grid.

# Daily minimum points for validity (100% = 24; 75% = 18; 50% = 12; ...)
daily_min_points = 18 # 75%

# Weekly minimum points for validity (100% = 168; 75% = 126; 50% = 84; ...)
weekly_min_points = 84 # 50%

# Monthly minimum points for validity (for 30 day month --> 100% = 720; 75% = 540; 50% = 360; ...)
monthly_min_points = 360 # 50%

In [None]:
# Output folder for the processed files: created here unless it is already on disk.

files_name = '1_' + new_files_name

if not os.path.exists(files_name):
    # First run: make the folder and tell the user what it is called.
    os.makedirs(files_name)
    print('\nYour new folder is:\n"', files_name, '"\n\nRun next block\n')

else:
    # Nothing is created or deleted here; the user decides whether to reuse it.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

In [None]:
# Output folder for the plots: created here unless it is already on disk.

plots_name = '2_' + new_plots_name

if not os.path.exists(plots_name):
    # First run: make the folder and tell the user what it is called.
    os.makedirs(plots_name)
    print('\nYour new folder is:\n"', plots_name, '"\n\nRun next block\n')

else:
    # Nothing is created or deleted here; the user decides whether to reuse it.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, '
          'run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

In [None]:
# File initialization function

def _read_logger_csv(file_path):
    """Read a logger export as UTF-8, falling back to Windows-1252.

    The input file is only read, never rewritten. UTF-8 is tried first so that
    a file that is already UTF-8 is not mis-decoded as a single-byte code page.
    """
    try:
        return pd.read_csv(file_path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        # NOTE(review): cp1252 is assumed to be the "ANSI" code page of the
        # exports; confirm if files come from machines with another locale.
        return pd.read_csv(file_path, encoding='cp1252')


def _split_periods(present, freq, min_points):
    """Return (valid, bad) period indexes for an hourly boolean 'present' series."""
    counts = present.groupby(present.index.to_period(freq)).sum()
    return counts[counts >= min_points].index, counts[counts < min_points].index


def _daily_summary(series, prefix):
    """Daily mean/min/max of an hourly series as columns <prefix>_avg/_min/_max."""
    daily = series.resample('D').agg(['mean', 'min', 'max']).reset_index()
    daily.columns = [std_time_name, prefix + '_avg', prefix + '_min', prefix + '_max']
    # Defensive: resample(on=...) below needs a datetime64 column.
    daily[std_time_name] = pd.to_datetime(daily[std_time_name])
    return daily


def _period_summary(daily, prefix, rule, period_freq, period_column, valid_periods):
    """Aggregate a daily summary by `rule` and keep only the valid periods.

    The average is the mean of the daily averages; min/max are taken over the
    daily minima/maxima.
    """
    summary = daily.resample(rule, on=std_time_name).agg({
        prefix + '_avg': 'mean',
        prefix + '_min': 'min',
        prefix + '_max': 'max',
    }).reset_index()
    summary[period_column] = summary[std_time_name].dt.to_period(period_freq)
    return summary[summary[period_column].isin(valid_periods)]


def file_initialize(file_path, time_name, rh_name, td_name, skip_rows=5):
    """Load one logger file, standardize it, and build its summaries.

    Besides its parameters, this function reads the notebook globals
    `temp_name`, `date_format` and the standard names/thresholds defined in
    the global-variable cell.

    Parameters
    ----------
    file_path : str
        Path of the .csv file to read. The file is not modified.
    time_name, rh_name, td_name : str
        Labels of the time, relative humidity and dew point columns in the file.
    skip_rows : int, default 5
        Number of leading readings to drop, as they may have been taken and
        tainted during installation.

    Returns
    -------
    tuple
        (data, monthly_summary_T, monthly_summary_rh, weekly_summary_T,
        weekly_summary_rh, daily_summary_T, daily_summary_rh, bad_months,
        bad_weeks, bad_days).
        `data` holds the readings with standard column names, the time column
        as text in `std_date_format`, a datetime `time_index` column and the
        corrected RH / dew point columns. Weekly and monthly summaries only
        contain periods with enough hourly readings; daily summaries are not
        filtered. The `bad_*` values are period indexes of the periods below
        the corresponding threshold.

    Raises
    ------
    ValueError
        If no readings remain after dropping the first `skip_rows` rows.
    """

    ### INITIALIZATION

    # Prints selected file name
    print("Initializing file:", os.path.basename(file_path), '\n')

    data = _read_logger_csv(file_path)

    # Skips first values as they may have been taken & tainted during installation
    data = data.iloc[skip_rows:].reset_index(drop=True)
    if data.empty:
        raise ValueError('No readings left in ' + os.path.basename(file_path)
                         + ' after skipping the first ' + str(skip_rows) + ' rows.')

    # Parse the user's date format, then store the time as standard-format text.
    # The datetime helper column is parsed back from that text so both columns
    # agree exactly (anything the standard format cannot express is dropped).
    standard_times = pd.to_datetime(data[time_name], format=date_format).dt.strftime(std_date_format)
    data[time_index] = pd.to_datetime(standard_times, format=std_date_format)
    data[time_name] = standard_times


    ## RH AND TD CORRECTION

    # RH above 100 % is treated as saturated: RH becomes 100 and the dew point
    # equals the air temperature. All other rows are copied unchanged.
    saturated = data[rh_name] > 100
    data[rh_cor_name] = data[rh_name].mask(saturated, 100)
    data[td_cor_name] = data[td_name].mask(saturated, data[temp_name])


    ## RENAMING LABELS TO STANDARD
    data = data.rename(columns={time_name: std_time_name,
                                temp_name: std_temp_name,
                                rh_name: std_rh_name,
                                td_name: std_td_name})


    ## SETTING UP INDEXING FOR SUMMARY CALCULATIONS

    # Index a working copy by time and keep the first of any repeated timestamps
    dataframe = data.set_index(time_index, drop=False)
    dataframe = dataframe[~dataframe.index.duplicated(keep='first')]

    # Full hourly grid from the first to the last reading. Rows whose timestamp
    # does not fall exactly on this grid are not carried over by reindex.
    full_index = pd.date_range(start=dataframe.index.min(),
                               end=dataframe.index.max(),
                               freq='h')
    dataframe = dataframe.reindex(full_index)

    # True where the hourly slot holds a real reading
    present = dataframe[time_index].notna()


    ## DAILY, WEEKLY, MONTHLY VALID/BAD PERIOD CALCULATIONS

    # A period is valid when it holds at least the configured number of hourly readings
    _, bad_days = _split_periods(present, 'D', daily_min_points)
    valid_weeks, bad_weeks = _split_periods(present, 'W', weekly_min_points)
    valid_months, bad_months = _split_periods(present, 'M', monthly_min_points)


    ## DAILY, WEEKLY, MONTHLY SUMMARIES FOR TEMPERATURE

    daily_summary_T = _daily_summary(dataframe[std_temp_name], 'T')
    weekly_summary_T = _period_summary(daily_summary_T, 'T', 'W', 'W', 'WeekPeriod', valid_weeks)
    monthly_summary_T = _period_summary(daily_summary_T, 'T', 'ME', 'M', 'MonthPeriod', valid_months)


    ## DAILY, WEEKLY, MONTHLY SUMMARIES FOR RELATIVE HUMIDITY (CORRECTED)

    daily_summary_rh = _daily_summary(dataframe[rh_cor_name], 'RH')
    weekly_summary_rh = _period_summary(daily_summary_rh, 'RH', 'W', 'W', 'WeekPeriod', valid_weeks)
    monthly_summary_rh = _period_summary(daily_summary_rh, 'RH', 'ME', 'M', 'MonthPeriod', valid_months)

    return data, monthly_summary_T, monthly_summary_rh, weekly_summary_T, weekly_summary_rh, daily_summary_T, daily_summary_rh, bad_months, bad_weeks, bad_days

In [None]:
# Function writing a new .csv in your new folder

def write_csv(basename, dataframe):
    """Write `dataframe` as 'PROCESSED_<name>' inside the processed-files folder.

    Parameters
    ----------
    basename : str
        Name (or path) of the source file; only its base name is used. A
        '.txt' extension is replaced by '.csv', other extensions are kept.
    dataframe : pandas.DataFrame
        Data to write. It is not modified: renaming to the standard labels and
        dropping the index are done on a copy.

    The file is written as UTF-8 with a byte-order mark and without the index.
    """

    # Swap only the extension, so a 'txt' elsewhere in the folder or file name
    # is left alone.
    stem, extension = os.path.splitext(os.path.basename(basename))
    if extension.lower() == '.txt':
        extension = '.csv'
    new_file_path = os.path.join(files_name, 'PROCESSED_' + stem + extension)

    # Prints selected file name
    print('Writing new file:', os.path.basename(new_file_path), '\n')

    # Renaming labels to standard and removing any custom index
    output = dataframe.rename(columns={time_name: std_time_name,
                                       temp_name: std_temp_name,
                                       rh_name: std_rh_name,
                                       td_name: std_td_name}).reset_index(drop=True)

    # Writing the new dataframe to your computer
    output.to_csv(new_file_path, index=False, encoding='utf-8-sig')

In [None]:
# Window selector function

def find_valid_windows(dataframe, column_subset=None, min_window_size=1):
    """Print every run of consecutive hourly readings found in `dataframe`.

    The frame is indexed by its `time_index` column; a new window starts
    wherever the gap to the previous row is not exactly one hour. Rows are
    taken in the order given (no sorting is done). `column_subset` and
    `min_window_size` are accepted but not used. Returns None.
    """

    print('Calculating Data Window(s)\n')

    print('The following time windows contain data, please choose a start and end date for visualization (must be on the hour).\n',
          'You can input \'all\' to visualize everything or eg. \'2003/10/17 21:00:00\'.\n')

    # Index by time, normalized through the standard text format
    indexed = dataframe.set_index(time_index, drop=True)
    indexed.index = pd.DatetimeIndex(pd.to_datetime(indexed.index).strftime(std_date_format))

    # A row opens a new run when it is not exactly one hour after the previous row
    gap_to_previous = indexed.index.to_series().diff()
    run_ids = (gap_to_previous != pd.Timedelta(hours=1)).cumsum()

    # First and last timestamp of every run
    spans = [(run.index[0], run.index[-1]) for _, run in indexed.groupby(run_ids)]

    # Number and print the runs, ignoring those that start on a missing timestamp
    window_number = 0
    for first_stamp, last_stamp in spans:
        if not pd.isnull(first_stamp):
            window_number += 1
            print('Window', window_number, ': ', first_stamp, 'until', last_stamp, '\t')

    return

In [None]:
# Hourly plotting fuctions

def plot_hourly_temp(filename, data, window_start, window_end):
    """Plot the hourly temperature of one file and save it as '<file>_hour_temp.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    data : pandas.DataFrame
        Readings with the standard time and temperature columns. Not modified.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Hourly temperature.\n')

    # Parse into a local series so the caller's frame keeps its text timestamps
    timestamps = pd.to_datetime(data[std_time_name])
    source_stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot hourly
    ax.plot(timestamps, data[std_temp_name], label='Temp', color='black', linewidth=2.5)

    ax.set_title(f'Hourly {std_temp_name} ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_hour_temp.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

def plot_hourly_rh(filename, data, window_start, window_end):
    """Plot the hourly corrected relative humidity and save it as '<file>_hour_rh.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    data : pandas.DataFrame
        Readings with the standard time column and the corrected RH column.
        Not modified.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Hourly relative humidity.\n')

    # Parse into a local series so the caller's frame keeps its text timestamps
    timestamps = pd.to_datetime(data[std_time_name])
    source_stem = os.path.splitext(os.path.basename(filename))[0]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot hourly
    ax.plot(timestamps, data[rh_cor_name], label='RH', color='black', linewidth=2.5)

    ax.set_title(f'Hourly {std_rh_name} ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)

    # Own file name, so the hourly temperature plot is not overwritten
    fig.savefig(os.path.join(plots_name, source_stem + '_hour_rh.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

In [None]:
# Daily plotting functions

# Daily Temperature Plot
def plot_daily_temp(filename, daily_summary_T, bad_days, window_start, window_end):
    """Plot daily average/min/max temperature and save it as '<file>_day_temp.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    daily_summary_T : pandas.DataFrame
        Frame with the standard time column and 'T_avg', 'T_min', 'T_max'.
    bad_days : iterable of pandas.Period
        Days to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Daily average, minimum, and maximum temperatures.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = daily_summary_T[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot daily average, min, and max
    ax.plot(times, daily_summary_T['T_avg'], label='Average Temp', color='black', linewidth=2.5)
    ax.plot(times, daily_summary_T['T_min'], label='Min Temp', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, daily_summary_T['T_max'], label='Max Temp', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Daily {std_temp_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_days:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_day_temp.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

# Daily Relative Humidity Plot
def plot_daily_rh(filename, daily_summary_rh, bad_days, window_start, window_end):
    """Plot daily average/min/max relative humidity and save it as '<file>_day_rh.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    daily_summary_rh : pandas.DataFrame
        Frame with the standard time column and 'RH_avg', 'RH_min', 'RH_max'.
    bad_days : iterable of pandas.Period
        Days to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Daily average, minimum, and maximum relative humidity.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = daily_summary_rh[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot daily average, min, and max RH
    ax.plot(times, daily_summary_rh['RH_avg'], label='Average RH', color='black', linewidth=2.5)
    ax.plot(times, daily_summary_rh['RH_min'], label='Min RH', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, daily_summary_rh['RH_max'], label='Max RH', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Daily {std_rh_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_days:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_day_rh.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

In [None]:
# Weekly plotting functions

# Weekly Temperature Plot
def plot_weekly_temp(filename, weekly_summary_T, bad_weeks, window_start, window_end):
    """Plot weekly average/min/max temperature and save it as '<file>_week_temp.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    weekly_summary_T : pandas.DataFrame
        Frame with the standard time column and 'T_avg', 'T_min', 'T_max'.
    bad_weeks : iterable of pandas.Period
        Weeks to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Weekly average, minimum, and maximum temperatures.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = weekly_summary_T[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot weekly average, min, and max
    ax.plot(times, weekly_summary_T['T_avg'], label='Average Temp', color='black', linewidth=2.5)
    ax.plot(times, weekly_summary_T['T_min'], label='Min Temp', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, weekly_summary_T['T_max'], label='Max Temp', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Weekly {std_temp_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    # Format tick labels as dates
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_weeks:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_week_temp.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

# Weekly Relative Humidity Plot
def plot_weekly_rh(filename, weekly_summary_rh, bad_weeks, window_start, window_end):
    """Plot weekly average/min/max relative humidity and save it as '<file>_week_rh.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    weekly_summary_rh : pandas.DataFrame
        Frame with the standard time column and 'RH_avg', 'RH_min', 'RH_max'.
    bad_weeks : iterable of pandas.Period
        Weeks to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Weekly average, minimum, and maximum relative humidity.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = weekly_summary_rh[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot weekly average, min, and max RH
    ax.plot(times, weekly_summary_rh['RH_avg'], label='Average RH', color='black', linewidth=2.5)
    ax.plot(times, weekly_summary_rh['RH_min'], label='Min RH', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, weekly_summary_rh['RH_max'], label='Max RH', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Weekly {std_rh_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_weeks:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_week_rh.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

In [None]:
# Monthly plotting functions

# Monthly Temperature Plot
def plot_monthly_temp(filename, monthly_summary_T, bad_months, window_start, window_end):
    """Plot monthly average/min/max temperature and save it as '<file>_month_temp.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    monthly_summary_T : pandas.DataFrame
        Frame with the standard time column and 'T_avg', 'T_min', 'T_max'.
    bad_months : iterable of pandas.Period
        Months to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Monthly average, minimum, and maximum temperatures.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = monthly_summary_T[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot monthly average, min, and max
    ax.plot(times, monthly_summary_T['T_avg'], label='Average Temp', color='black', linewidth=2.5)
    ax.plot(times, monthly_summary_T['T_min'], label='Min Temp', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, monthly_summary_T['T_max'], label='Max Temp', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Monthly {std_temp_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_temp_label)

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_months:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_month_temp.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

# Monthly Relative Humidity Plot
def plot_monthly_rh(filename, monthly_summary_rh, bad_months, window_start, window_end):
    """Plot monthly average/min/max relative humidity and save it as '<file>_month_rh.png'.

    Parameters
    ----------
    filename : str
        Source file name or path; its base name without extension is used in
        the title and in the name of the saved image.
    monthly_summary_rh : pandas.DataFrame
        Frame with the standard time column and 'RH_avg', 'RH_min', 'RH_max'.
    bad_months : iterable of pandas.Period
        Months to shade in gray.
    window_start, window_end : str
        Limits of the x-axis, in any form pd.to_datetime accepts.
    """

    print('\nPlotting: Monthly average, minimum, and maximum relative humidity.\n')

    source_stem = os.path.splitext(os.path.basename(filename))[0]
    times = monthly_summary_rh[std_time_name]

    fig, ax = plt.subplots(figsize=(12, 3))

    # Plot monthly average, min, and max RH
    ax.plot(times, monthly_summary_rh['RH_avg'], label='Average RH', color='black', linewidth=2.5)
    ax.plot(times, monthly_summary_rh['RH_min'], label='Min RH', color='blue',
            linestyle='dashdot', linewidth=0.5)
    ax.plot(times, monthly_summary_rh['RH_max'], label='Max RH', color='red',
            linestyle='dashdot', linewidth=0.5)

    ax.set_title(f'Monthly {std_rh_name} Summary ~ {window_start} - {window_end} ~ {source_stem}')
    ax.set_xlabel(plot_time_label)
    ax.set_ylabel(plot_rh_label)

    ax.set_xlim(pd.to_datetime(window_start), pd.to_datetime(window_end))
    fig.autofmt_xdate()

    # Shade missing data regions (from the start of each period to the start of the next)
    for period in bad_months:
        ax.axvspan(period.to_timestamp(), (period + 1).to_timestamp(), color='gray', alpha=0.3)

    ax.legend()
    plt.show()

    # Shown at 12x3 inches above; the saved image is taller
    fig.set_size_inches(12, 6)
    fig.savefig(os.path.join(plots_name, source_stem + '_month_rh.png'))

    # Release the figure so processing many files does not accumulate figures
    plt.close(fig)

In [None]:
# Walks through the input folder (sub-folders included) and processes each file
# one at a time: initialize, write the processed .csv, show the data windows,
# ask the user for a plotting window and plot type, then plot.
# A file that raises an error is reported and skipped; the run carries on.

directory = folder_path
counter = 0
completed = []
skipped = []

# Iterate over files in directory
for path, folders, files in os.walk(directory):

    for name in files:
        counter = counter + 1
        print(counter, '.\n')

        # Join on the folder os.walk is currently in, so files inside
        # sub-folders get their real path.
        filename = os.path.join(path, name)

        try:
            # File initialize function
            (data,
             monthly_summary_T, monthly_summary_rh,
             weekly_summary_T, weekly_summary_rh,
             daily_summary_T, daily_summary_rh,
             bad_months, bad_weeks, bad_days) = file_initialize(filename, time_name, rh_name, td_name)
            print('Initialization complete\n- - -\n')

            # File writing function (complete file)
            write_csv(os.path.basename(filename), data)
            print('Writing complete\n- - -\n')

            # Selecting windows function
            find_valid_windows(data)

            # Asking the user for inputs of the start and end of the window
            window_start = input('\na) What is your start time? --> ')
            if window_start == 'all':
                # Use the first and last reading of the file
                window_start = data[std_time_name].iloc[0]
                print('\nstart:', window_start)
                window_end = data[std_time_name].iloc[-1]
                print('end:', window_end, '\n')

            else:
                window_end = input('b) What is your end time? -->   ')
                print('\n')

            # Asking the user what type of plots they would like to have.
            print('You can make weekly, and monthly plots of your data. Input \'hourly\', \'daily\', \'weekly\', \'monthly\', or \'all\' into the following pop-up.')

            plot_type = input('\na) What type of plot would you like? --> ')

            # Plot functions per plot type, each with the arguments that go
            # between the file name and the window limits.
            plot_groups = {
                'hourly': [(plot_hourly_temp, (data,)),
                           (plot_hourly_rh, (data,))],
                'daily': [(plot_daily_temp, (daily_summary_T, bad_days)),
                          (plot_daily_rh, (daily_summary_rh, bad_days))],
                'weekly': [(plot_weekly_temp, (weekly_summary_T, bad_weeks)),
                           (plot_weekly_rh, (weekly_summary_rh, bad_weeks))],
                'monthly': [(plot_monthly_temp, (monthly_summary_T, bad_months)),
                            (plot_monthly_rh, (monthly_summary_rh, bad_months))],
            }

            if plot_type in plot_groups:
                selected_groups = [plot_type]
            else:
                # 'all', or anything unrecognized, plots every type in order
                if plot_type != 'all':
                    print('You may have mistyped the input, so we just picked \'all\' for you.')
                selected_groups = list(plot_groups)

            for group in selected_groups:
                for plot_function, plot_arguments in plot_groups[group]:
                    plot_function(filename, *plot_arguments, window_start, window_end)

            print('Plotting complete\n')

            completed.append(os.path.basename(filename))

        except Exception as error:
            # Show what went wrong instead of hiding it, then move on
            print('An error occurred. File skipped.\n', repr(error), '\n')
            skipped.append(os.path.basename(filename))

print('---------------------------\n\nPROCESSING INFORMATION\n')
print('Files processed:\n', completed, '\n')
print('Files skipped due to error:\n', skipped)