In [17]:
# Welcome to the Lahmas Lab Lascar Data Processor!
# Please answer the questions below and run your code directly in a jupyter notebook.
# Every value set in this cell is a notebook-level variable that the later cells read,
# so run this cell first (and re-run it after changing any answer).

# Where is the folder you would like to process?
folder_path = r'hydroshare_data' # You can 'copy file path' of the folder in your file manager.

# What are the labels for each variable in the .csv files?
# Each label is used to look up a column by name, so it must match the header
# in the input files exactly.
# NOTE(review): the temperature and dew point labels below contain the Unicode
# replacement character (U+FFFD) where a degree sign would be expected --
# presumably the degree sign was lost when the files were decoded. Confirm that
# these labels match the headers pandas actually reads from the .csv files.
time_name = 'Datetime' # Label of time variable
temp_name = 'Temperature (�C)' # Label of temperature variable
rh_name = 'RH (%)' # Label of relative humidity variable
td_name = 'Dew Point (�C)' # Label of dew point temperature variable

# What will the name be of your new folder?
new_folder_name = 'TestNewFolder' # Don't use spaces or special characters.

# Where would you like this folder to be found after processing?
# NOTE(review): this is currently the same location as folder_path, which puts
# the output folder inside the folder that is being processed.
new_folder_path = r'hydroshare_data' # You can 'copy file path' of a folder in your file manager.

In [18]:
# Importing all necessary modules

import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.dates as mdates
from datetime import datetime
import io

In [19]:
# Creating the new folder where all the processed files will end up.

# os.path.join inserts the separator that is correct for the operating system
# the notebook runs on, instead of gluing the parts together by hand.
new_path = os.path.join(new_folder_path, new_folder_name)

if os.path.exists(new_path):
    # Nothing is created or removed in this branch; the existing folder is
    # simply reused by the following cells.
    print('\nThis folder already exists!\n\nIf you wish to continue with this folder anyway, \
run the next block.\nOtherwise, rewrite the folder path/name in block 1.\n')

else:
    # makedirs also creates any missing parent folders of new_path.
    os.makedirs(new_path)
    # Concatenate (rather than pass separate print arguments) so no spaces
    # end up between the quotes and the path.
    print('\nYour new folder path is:\n"' + new_path + '"\n\nRun next block\n')


This folder already exists!

If you wish to continue with this folder anyway, run the next block.
Otherwise, rewrite the folder path/name in block 1.



In [20]:
# File initialization function

def file_initialize(file_path, time_name, rh_name, td_name, temp_name=None, skip_rows=5):
    """Load one logger .csv file and add corrected RH and dew point columns.

    The file is read into a DataFrame, the first ``skip_rows`` data rows are
    discarded, the time column is parsed into a new ``'Time_fixed'`` column,
    and two columns are appended:

    * ``'RH Corrected (%)'`` -- relative humidity, with every value above 100
      replaced by 100 (the air is taken to be saturated).
    * ``'Dew Point Corrected (°C)'`` -- dew point, replaced by the air
      temperature on those same rows.

    Rows whose relative humidity is not above 100 (including missing values)
    keep their original relative humidity and dew point. Values below 0 are
    not altered.

    Parameters
    ----------
    file_path : str
        Path of the .csv file to read.
    time_name, rh_name, td_name : str
        Column labels of the time, relative humidity and dew point variables.
    temp_name : str, optional
        Column label of the temperature variable. When omitted, the
        notebook-level variable ``temp_name`` is used, which keeps existing
        four-argument calls working.
    skip_rows : int, optional
        Number of leading data rows to discard, because the first readings may
        have been taken (and tainted) during installation. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        The data sorted by ``'Time_fixed'`` and indexed by it; ``'Time_fixed'``
        is also kept as a regular column.

    Raises
    ------
    NameError
        If ``temp_name`` is omitted and no notebook-level ``temp_name`` exists.
    KeyError
        If any of the four labelled columns is absent from the file.
    """

    ### INITIALIZATION

    # Prints selected file name
    print("Initializing file:", os.path.basename(file_path), '\n')

    # Fall back to the notebook-level label when the caller did not pass one.
    if temp_name is None:
        try:
            temp_name = globals()['temp_name']
        except KeyError:
            raise NameError("name 'temp_name' is not defined") from None

    # Creates dataframe from .csv
    # NOTE(review): pandas decodes the file as UTF-8 unless told otherwise; if
    # the loggers export another encoding (e.g. Windows "ANSI"), special
    # characters in the header may not decode as intended -- confirm.
    dataframe = pd.read_csv(file_path)

    # Fail early, naming the offending labels, instead of part-way through.
    missing = [label for label in (time_name, temp_name, rh_name, td_name)
               if label not in dataframe.columns]
    if missing:
        raise KeyError(
            f"{os.path.basename(file_path)}: column(s) {missing} not found; "
            f"available columns are {list(dataframe.columns)}"
        )

    # Skips first values as they may have been taken & tainted during installation
    dataframe = dataframe.iloc[skip_rows:].reset_index(drop=True)

    # Changes time string to datetime type
    dataframe['Time_fixed'] = pd.to_datetime(dataframe[time_name])

    # Define the labels of the corrected variables
    rh_cor_name = 'RH Corrected (%)'
    td_cor_name = 'Dew Point Corrected (°C)'

    ## RH and Td correction

    # Rows above 100 % RH are treated as saturated. Missing RH compares as
    # False here, so those rows are left untouched.
    saturated = dataframe[rh_name] > 100

    # RH is capped at 100 % and Td is set equal to T on saturated rows;
    # everything else stays the same.
    dataframe[rh_cor_name] = dataframe[rh_name].mask(saturated, 100)
    dataframe[td_cor_name] = dataframe[td_name].mask(saturated, dataframe[temp_name])

    ## Time index

    # Order chronologically and index by time, keeping the time column too.
    dataframe = dataframe.sort_values('Time_fixed')
    dataframe = dataframe.set_index('Time_fixed', drop=False)

    return dataframe


In [21]:
# Function writing a new .csv in your new folder

def write_csv(basename, dataframe):
    """Write a processed dataframe to the output folder as 'PROCESSED_<basename>'.

    The file is saved inside the notebook-level ``new_path`` folder. Before
    writing, the columns named by the notebook-level ``time_name``,
    ``temp_name``, ``rh_name`` and ``td_name`` are renamed to the standard
    labels, the index is replaced by a plain 0..n-1 index, and the helper
    column ``'Time_fixed'`` is removed if present.

    All of this is done on a copy: the dataframe passed in is left unchanged,
    so the function can safely be called again with the same dataframe.

    Parameters
    ----------
    basename : str
        Name (or path) of the source file; only its final component is used.
    dataframe : pandas.DataFrame
        Data to write.

    Returns
    -------
    None
    """

    # Creating new file path and name
    new_file_path = os.path.join(new_path, 'PROCESSED_' + os.path.basename(basename))

    # Prints selected file name
    print('Writing new file:', os.path.basename(new_file_path), '\n')

    output = (
        dataframe
        # Renaming labels to standard
        .rename(columns={time_name: 'Datetime (MM/DD/YYYY HR:MN)',
                         temp_name: 'Temperature (°C)',
                         rh_name: 'RH (%)',
                         td_name: 'Dew Point (°C)'})
        # Removing custom index
        .reset_index(drop=True)
        # Removing unnecessary columns
        .drop(columns='Time_fixed', errors='ignore')
    )

    # Writing the new dataframe to your computer.
    # 'utf-8-sig' prepends a byte-order mark to the file.
    output.to_csv(new_file_path, index=False, encoding='utf-8-sig')

In [24]:
# Walks through the folder and goes through each .csv file one at a time.

directory = folder_path
counter = 0
completed = []
skipped = []

# The output folder can sit inside the folder being processed; remember where
# it is so the walk below does not pick up files this notebook wrote earlier.
output_dir = os.path.abspath(new_path)

# Iterate over files in directory
for path, folders, files in os.walk(directory):

    # Editing `folders` in place tells os.walk which subfolders to descend
    # into: drop the output folder, and sort the rest for a repeatable order.
    folders[:] = sorted(
        name for name in folders
        if os.path.abspath(os.path.join(path, name)) != output_dir
    )

    for filename in sorted(files):

        # Only .csv files are data files; ignore anything else in the folder.
        if not filename.lower().endswith('.csv'):
            continue

        counter = counter + 1
        print(counter, '.\n')

        # Join with `path` (the folder currently being walked), not the top
        # folder, so files inside subfolders are found where they really are.
        file_path = os.path.join(path, filename)

        try:
            data = file_initialize(file_path, time_name, rh_name, td_name)
            print('Initialization complete\n')
            # NOTE: output names are built from the file name only, so two
            # input files with the same name in different subfolders write to
            # the same output file.
            write_csv(filename, data)
            completed.append(filename)

        # `Exception` (not a bare except) keeps one bad file from stopping the
        # run while still letting Ctrl-C / kernel interrupt through.
        except Exception as error:
            print('An error occured. File skipped.\n')
            # Show why, so the file can be fixed instead of silently dropped.
            print('Reason:', repr(error), '\n')
            skipped.append(filename)

print('---------------------------\n\nFOLDER TRAVERSED\n')
print('Files processed:\n', completed, '\n')
print('Files skipped due to error:\n', skipped)

1 .

Initializing file: LLanUp-3_17Jul06_9Aug16_4355m.csv 

Initialization complete

Writing new file: PROCESSED_LLanUp-3_17Jul06_9Aug16_4355m.csv 

2 .

Initializing file: LlanUp-1A_18Jul15_5Jul2019.csv 

Initialization complete

Writing new file: PROCESSED_LlanUp-1A_18Jul15_5Jul2019.csv 

3 .

Initializing file: LlanUp-1_17Jul06_23Jul15_3955m.csv 

Initialization complete

Writing new file: PROCESSED_LlanUp-1_17Jul06_23Jul15_3955m.csv 

4 .

Initializing file: LlanUp-2A_18Jul15_5Jul2019.csv 

Initialization complete

Writing new file: PROCESSED_LlanUp-2A_18Jul15_5Jul2019.csv 

5 .

Initializing file: LlanUp-2_17Jul06_28Jun14_4122m.csv 

Initialization complete

Writing new file: PROCESSED_LlanUp-2_17Jul06_28Jun14_4122m.csv 

6 .

Initializing file: LlanUp-3A_18Jul15_5Jul2019.csv 

Initialization complete

Writing new file: PROCESSED_LlanUp-3A_18Jul15_5Jul2019.csv 

7 .

Initializing file: LlanUp-4A_9Jul18_5Jul2019.csv 

An error occured. File skipped.

8 .

Initializing file: LlanUp-