In [8]:
import os
import pandas as pd

# Build a metadata template: one row per distinct (column short name, dtype)
# found in the met-tower CSV files, with empty placeholder fields that are
# meant to be filled in by hand after the CSV is written.
headers_data = []

# Specify the parent directory containing the 'met_towers_2017-2022_' directories
parent_directory = "/Volumes/55AE_SSD/nsrd_local/data/"

# Encodings to try, in order. 'latin1' maps every possible byte to a character,
# so it can never raise a decode error: it has to come last, and anything listed
# after it would be unreachable ('iso-8859-1' is just another name for it).
encodings = ['utf-8', 'cp1252', 'latin1']

# sorted() keeps the row order stable between runs; os.listdir() makes no
# promise about ordering.
for directory_name in sorted(os.listdir(parent_directory)):
    if not directory_name.startswith('met_towers_2017-2022_') or "original" in directory_name:
        continue

    directory_path = os.path.join(parent_directory, directory_name)
    if not os.path.isdir(directory_path):
        continue

    for file_name in sorted(os.listdir(directory_path)):
        # Skip hidden files and anything that is not a CSV
        if file_name.startswith('.') or not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(directory_path, file_name)

        # Read only the header and the first few rows. Note that dtypes are
        # therefore inferred from at most 10 data rows.
        df = None
        read_error = None
        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, nrows=10, encoding=encoding)
            except UnicodeError as e:
                # Wrong encoding guess: move on to the next candidate.
                read_error = e
                continue
            except Exception as e:
                # Not an encoding problem (empty file, malformed CSV, ...):
                # retrying with a different encoding cannot help.
                read_error = e
            break

        if df is None:
            print(f"Error reading file {file_path}: {read_error}")
            continue

        for column in df.columns:
            headers_data.append({
                # Keep only the part before the first '_' (drops the '_Height' suffix)
                "header_shortname": column.split('_')[0],
                "header_longname": None,  # Placeholder for manual entry
                "header_description": None,  # Placeholder for manual entry
                "unit": None,  # Placeholder for manual entry
                "dtype": df[column].dtype
            })

# Passing the column list explicitly means the CSV still gets a header row even
# when no files were found.
headers_df = pd.DataFrame(
    headers_data,
    columns=["header_shortname", "header_longname", "header_description", "unit", "dtype"],
)

# Drop duplicate rows
headers_df = headers_df.drop_duplicates()

# Save the DataFrame to a CSV file for review and manual entry
output_file = os.path.join(parent_directory, "metadata.csv")
headers_df.to_csv(output_file, index=False)

In [17]:
import os
import pandas as pd

# List of names to check for in column names
names = [
    "DT", "MixHeight", "MixRatio", "SatMixRatio", "SatVaporPres", "SonicTemp", "StabSigPhi", "StabSRDT",
    "UWind", "VaporPres", "VWDir", "VWind", "WElev", "SWSpdMph", "SWSpdMs", "RWSpdMph", "RWSpdMs",
    "RSWSpdMph", "RSWSpdMs", "SPkWSpdMph", "SPkWSpdMs", "SWDir", "SVWDir", "SSigma", "DT002m015m",
    "DT002m035m", "DT002m060m", "DT015m035m", "DT015m060m", "SoilTemp", "Ri015m035m", "Ri015m060m",
    "Stab", "DeltaTemp", "LHV"
]

# Per-column statistics, keyed by full column name. Each value is a list with
# one {"file", "min", "mean", "max"} record per file that contains the column.
stats_data = {}

# Specify the parent directory containing the 'met_towers_2017-2022_' directories
parent_directory = "/Volumes/55AE_SSD/nsrd_local/data/"

# Encodings to try, in order. 'latin1' maps every possible byte to a character,
# so it can never raise a decode error: it has to come last, and anything listed
# after it would be unreachable ('iso-8859-1' is just another name for it).
encodings = ['utf-8', 'cp1252', 'latin1']

# sorted() keeps the record order stable between runs; os.listdir() makes no
# promise about ordering.
for directory_name in sorted(os.listdir(parent_directory)):
    if not directory_name.startswith('met_towers_2017-2022_') or "original" in directory_name:
        continue

    directory_path = os.path.join(parent_directory, directory_name)
    if not os.path.isdir(directory_path):
        continue

    for file_name in sorted(os.listdir(directory_path)):
        # Skip hidden files and anything that is not a CSV
        if file_name.startswith('.') or not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(directory_path, file_name)

        # Step 1: read the file. Only the read sits inside the try block, so a
        # failure while computing statistics can never trigger a re-read (which
        # would append the same records to stats_data a second time).
        df = None
        read_error = None
        for encoding in encodings:
            try:
                df = pd.read_csv(file_path, encoding=encoding)
            except UnicodeError as e:
                # Wrong encoding guess: move on to the next candidate.
                read_error = e
                continue
            except Exception as e:
                # Not an encoding problem (empty file, malformed CSV, ...):
                # retrying with a different encoding cannot help.
                read_error = e
            break

        if df is None:
            print(f"Error reading file {file_path}: {read_error}")
            continue

        # Step 2: summarize every column of interest.
        for column in df.columns:
            # Substring match: the column qualifies if any listed name occurs
            # anywhere in it (so "DT" also matches e.g. "DT002m015m").
            if not any(name in column for name in names):
                continue

            # min/mean/max are only meaningful for numeric data; .mean() raises
            # on text columns, so those are skipped.
            if not pd.api.types.is_numeric_dtype(df[column]):
                continue

            non_nan_values = df[column].dropna()
            if non_nan_values.empty:
                continue

            stats_data.setdefault(column, []).append({
                "file": file_name,
                "min": non_nan_values.min(),
                "mean": non_nan_values.mean(),
                "max": non_nan_values.max()
            })

# Flatten the stats dictionary into a list of records, one per (column, file)
# pair, ready to be passed to pd.DataFrame(...) for export.
stats_flattened = [
    {"column": column, **stat}
    for column, stats in stats_data.items()
    for stat in stats
]

In [18]:
# Dump the flattened list of per-column, per-file statistics records as plain text.
# NOTE(review): this prints every record on one line; wrapping the list in
# pd.DataFrame(...) and showing .head() would be easier to read — confirm preference.
print(stats_flattened)

[{'column': 'DT_030m', 'file': 'TOWA_2017-2022_hourly-qa.csv', 'min': -2.3, 'mean': 0.17159481073059923, 'max': 6.6967134}, {'column': 'DT_030m', 'file': 'TOWB_2017-2022_hourly-qa.csv', 'min': -2.0178294, 'mean': 0.11550984568962269, 'max': 6.6312594}, {'column': 'MixHeight_015m', 'file': 'TOWA_2017-2022_hourly-qa.csv', 'min': -681.2513, 'mean': 770.2438727005639, 'max': 8670.024}, {'column': 'MixHeight_015m', 'file': 'TOWB_2017-2022_hourly-qa.csv', 'min': -681.2513, 'mean': 771.8739605817092, 'max': 8670.024}, {'column': 'MixHeight_015m', 'file': 'TOWY_2017-2022_hourly-qa.csv', 'min': -713.1427, 'mean': 775.1337284419441, 'max': 8679.221}, {'column': 'MixRatio_015m', 'file': 'TOWA_2017-2022_hourly-qa.csv', 'min': -0.0007613423, 'mean': 0.009108443116615662, 'max': 0.03215416}, {'column': 'MixRatio_015m', 'file': 'TOWD_2017-2022_hourly-qa.csv', 'min': -0.004813, 'mean': 0.009036792718863726, 'max': 0.078047}, {'column': 'MixRatio_015m', 'file': 'TOWY_2017-2022_hourly-qa.csv', 'min': 2.