This script:
1. Reads multiple CSV files from weather station directories
2. Processes daily or monthly observations
3. Uploads combined data to PostgreSQL

In [6]:
import os
import csv
import psycopg2
from psycopg2.extras import execute_values
import time

In [7]:
# --- Configuration -----------------------------------------------------------

# Root directory that contains one sub-directory per weather station.
data_folder = "../data/historical_weather/data"

# Single switch for the temporal resolution: "monthly" or "daily".
# The target table and the CSV filename suffix are both derived from it so the
# two settings cannot drift out of sync.
resolution = "monthly"
target_table = f"weather_{resolution}"    # "weather_monthly" or "weather_daily"
temp_resolution = f"_{resolution}.csv"    # "_monthly.csv" or "_daily.csv"

# Database connection string. Set the TAIWAN_WEATHER_DB_URL environment
# variable to point at another database without editing the notebook; the
# original local connection string is used when the variable is unset.
conn_string = os.environ.get(
    "TAIWAN_WEATHER_DB_URL",
    "postgresql://lizavabistsevits:@localhost:5432/taiwan",
)

# Maps record keys to PostgreSQL column names. "station_code" and "obs_date"
# are set by process_csv_file for every row; the remaining keys are CSV header
# names. Insertion order is the column order used in the INSERT statement.
column_names = {
    "station_code": "station",
    "obs_date": "obs_date",
    "Tx": "tavg",
    "TxMaxAbs": "tmax",
    "TxMinAbs": "tmin",
    "Td": "tdp",
    "RH": "rel_humidity",
    "WS": "wind_speed",
    "Precp": "precip",
}

In [8]:
# Get a list of all station codes that exist in the database.
# psycopg2's connection context manager only ends the transaction (commit on
# success, rollback on error) when the block exits; it does not close the
# connection, so the connection is closed explicitly in the finally clause.
conn = psycopg2.connect(conn_string)
try:
    with conn:
        with conn.cursor() as cursor:
            cursor.execute("SELECT code FROM stations")
            station_codes = [row[0] for row in cursor.fetchall()]
finally:
    conn.close()

print("Existing station codes: ", station_codes)
print("Total number of unique station codes: ", len(station_codes))

Existing station codes:  ['01A171', '01A211', '01A411', '01A421', '01A431', '01A441', '01B031', '01C401', '12J990', '12Q970', '12Q980', '21C071', '21C081', '21C091', '21C141', '21D151', '21D161', '21D171', '21D351', '21U111', '42HA10', '460010', '460020', '466850', '466880', '466881', '466900', '466910', '466920', '466921', '466930', '466940', '466950', '466990', '467050', '467060', '467080', '467110', '467270', '467280', '467290', '467300', '467350', '467410', '467411', '467420', '467440', '467441', '467480', '467490', '467530', '467540', '467550', '467570', '467571', '467590', '467610', '467620', '467650', '467660', '467770', '467780', '467790', '467990', '72AI40', '72C440', '72D080', '72D680', '72G600', '72HA00', '72K220', '72L140', '72M360', '72M700', '72N100', '72N240', '72Q010', '72S200', '72S590', '72T250', '72U480', '72V140', '82A750', '82C160', '82H320', '82H840', '82S580', 'A2C560', 'A2K360', 'A2K570', 'A2K630', 'A2N290', 'A2Q950', 'B2E890', 'B2N890', 'B2Q810', 'B2U990', 'C0A

In [9]:
def get_station_files(data_dir):
    """
    Find the CSV files of every known station under ``data_dir``.

    Each sub-directory of ``data_dir`` is treated as a station whose code is
    the directory name. Directories whose name is not in the module-level
    ``station_codes`` are skipped, as are plain files directly inside
    ``data_dir``. Within a station directory only files whose name ends with
    the module-level ``temp_resolution`` suffix are kept.

        data_dir: Path of the directory that holds the station directories

    Returns a list of tuples: (station_code, file_path), ordered by station
    code and then by filename so repeated runs process files in the same order.
    """
    # Build the lookup once: set membership is O(1), whereas the list was
    # scanned again for every directory.
    known_codes = set(station_codes)
    station_files = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for station_code in sorted(os.listdir(data_dir)):
        station_path = os.path.join(data_dir, station_code)

        # Only directories named after a station present in the database count.
        if not os.path.isdir(station_path) or station_code not in known_codes:
            continue

        for filename in sorted(os.listdir(station_path)):
            if filename.endswith(temp_resolution):
                station_files.append((station_code, os.path.join(station_path, filename)))

    return station_files

def process_csv_file(station_code, file_path):
    """
    Parse one station CSV file into a list of record dictionaries.

    The first line of the file is taken as the header. For every non-empty
    data row a record is built with:
      - 'station_code': the ``station_code`` argument
      - 'obs_date': the first field of the row
      - one entry per remaining field whose header name is a key of the
        module-level ``column_names`` mapping (other columns are dropped)

    Values that are empty or start with "-99" (sentinels such as -9999.5,
    -99.5, -9995) are stored as None; everything else is kept as the raw
    string read from the file. Fields beyond the header width, and header
    columns missing from a short row, are ignored.

        station_code: Station identifier added to every record
        file_path: Path of the CSV file to read

    Returns the list of records; an empty list for a file with no header line.
    """
    data = []

    with open(file_path, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        # A completely empty file has no header line; next() without a default
        # would raise StopIteration and abort the whole import.
        headers = next(reader, None)
        if headers is None:
            return data

        for row in reader:
            if not row:  # Skip empty rows
                continue

            record = {'station_code': station_code, 'obs_date': row[0]}

            # Pair each value with its header; zip stops at the shorter of the
            # two, so ragged rows never index past the header.
            for orig_col, value in zip(headers[1:], row[1:]):
                if orig_col not in column_names:
                    continue
                is_missing = value == "" or value.startswith("-99")
                record[orig_col] = None if is_missing else value

            data.append(record)

    return data

def insert_to_postgres(conn, data, table_name):
    """
    Upload data to PostgreSQL using the INSERT command with execute_values.

    The batch is written in a single transaction: it is committed only when
    the insert succeeds, and rolled back when it fails, so a failed batch is
    never partially stored.

        conn: PostgreSQL connection
        data: List of dictionaries containing the data; keys are the keys of
              the module-level ``column_names`` mapping, missing keys are
              inserted as NULL
        table_name: Target PostgreSQL table name

    Returns number of records inserted (0 when ``data`` is empty, in which
    case the database is not touched).

    Re-raises any exception raised while inserting, after rolling back.
    """
    if not data:
        return 0

    # Record keys and their PostgreSQL column names, in matching order.
    record_keys = list(column_names.keys())
    pg_columns = list(column_names.values())

    # One list of values per record, ordered like pg_columns.
    rows = [[record.get(key) for key in record_keys] for record in data]

    # The table and column names come from the notebook configuration and are
    # interpolated as identifiers; only the row values go through %s.
    columns_str = ', '.join(pg_columns)
    query = f"INSERT INTO {table_name} ({columns_str}) VALUES %s"

    try:
        with conn.cursor() as cursor:
            execute_values(cursor, query, rows)
        conn.commit()
    except BaseException:
        # Includes KeyboardInterrupt: never leave a half-written batch in an
        # open transaction, and never commit it.
        conn.rollback()
        raise

    return len(data)

def process_weather_data(data_dir=data_folder, table_name=target_table, 
                        connection_string=conn_string, batch_size=10000):
    """
    Main function to process weather data and upload to PostgreSQL.

    Collects the station CSV files with get_station_files, parses each one
    with process_csv_file and uploads the records with insert_to_postgres.
    Records are accumulated across files and flushed once at least
    ``batch_size`` of them are pending; a batch is only checked after a whole
    file has been added, so it can exceed ``batch_size``.

        data_dir: Directory containing the station directories
        table_name: Target PostgreSQL table name
        connection_string: PostgreSQL connection string
        batch_size: Minimum number of pending records that triggers an upload

    Returns the total number of records inserted.

    The database connection is closed even when processing fails; the
    exception is then propagated to the caller.
    """
    start_time = time.time()

    # Connect to PostgreSQL
    conn = psycopg2.connect(connection_string)
    print("Connected to PostgreSQL database")

    try:
        # Get all station files
        print(f"Scanning data directory: {data_dir}")
        station_files = get_station_files(data_dir)
        print(f"Found {len(station_files)} station CSV files")

        total_records = 0
        batch_data = []

        # Simple progress tracking
        total_files = len(station_files)
        print(f"Processing {total_files} files...")

        for i, (station_code, file_path) in enumerate(station_files):
            # Show progress for the first file, every 10th file and the last file
            if (i+1) % 10 == 0 or i == 0 or i == total_files-1:
                print(f"Processing file {i+1}/{total_files} ({(i+1)/total_files*100:.1f}%) - Records so far: {total_records}")

            # Parse the CSV file and queue its records
            batch_data.extend(process_csv_file(station_code, file_path))

            # Upload in batches to reduce memory usage
            if len(batch_data) >= batch_size:
                total_records += insert_to_postgres(conn, batch_data, table_name)
                batch_data = []

        # Insert any remaining records
        if batch_data:
            total_records += insert_to_postgres(conn, batch_data, table_name)

        # Report summary
        total_time = time.time() - start_time
        # Guard the rate against a zero elapsed time (coarse clocks, empty runs).
        records_per_second = total_records/total_time if total_time > 0 else 0.0
        print("\n----- SUMMARY -----")
        print(f"Total files processed: {len(station_files)}")
        print(f"Total records inserted: {total_records:,}")
        print(f"Processing time: {total_time:.2f} seconds")
        print(f"Average speed: {records_per_second:.2f} records/second")
    finally:
        # Always release the connection, including when a file or batch fails.
        conn.close()
        print("PostgreSQL connection closed")

    return total_records

In [10]:
# Run the weather station data processor with its default arguments
# (data_folder, target_table and conn_string from the configuration cell).
# The call returns the total number of inserted records; as the last
# expression in the cell, that count is displayed as the cell output.
process_weather_data()

Connected to PostgreSQL database
Scanning data directory: ../data/historical_weather/data
Found 18266 station CSV files
Processing 18266 files...
Processing file 1/18266 (0.0%) - Records so far: 0
Processing file 10/18266 (0.1%) - Records so far: 0
Processing file 20/18266 (0.1%) - Records so far: 0
Processing file 30/18266 (0.2%) - Records so far: 0
Processing file 40/18266 (0.2%) - Records so far: 0
Processing file 50/18266 (0.3%) - Records so far: 0
Processing file 60/18266 (0.3%) - Records so far: 0
Processing file 70/18266 (0.4%) - Records so far: 0
Processing file 80/18266 (0.4%) - Records so far: 0
Processing file 90/18266 (0.5%) - Records so far: 0
Processing file 100/18266 (0.5%) - Records so far: 0
Processing file 110/18266 (0.6%) - Records so far: 0
Processing file 120/18266 (0.7%) - Records so far: 0
Processing file 130/18266 (0.7%) - Records so far: 0
Processing file 140/18266 (0.8%) - Records so far: 0
Processing file 150/18266 (0.8%) - Records so far: 0
Processing file 1

207763