# Convert EXOsonde CSV files to parquet

In [6]:
import os
import re
from datetime import datetime
from pprint import pprint
import pytz
import numpy as np
import pandas as pd

In [7]:
# Input folder scanned for sonde CSV exports, and output folder that receives
# the converted parquet files.
srcdir, dstdir = 'data/src/exo', 'data/dst'

In [8]:
# ** sonde data ***
# Collect every file in `srcdir` whose name ends with the sonde export
# extension.
ext = '.csv'

# os.listdir() returns names in arbitrary, platform-dependent order; sort them
# so the files are always processed (and reported) in the same sequence.
files = sorted(file for file in os.listdir(srcdir) if file.endswith(ext))
print(f'Found {len(files)} files:')
pprint(files)

Found 1 files:
['LCCD_Little_Conestoga_Sonde_Data_May_2023_-_July_202.csv']


In [9]:
# Short parameter codes mapped to the full, unit-bearing column headings.
col_labels = {
    'DO': 'Dissolved Oxygen (mg/l)',
    'TEMP': 'Water Temperature (C)',
    'SC': 'Specific Conductivity (uS/cm)',
    'pH': 'pH',
    'TDS': 'Total Dissolved Solids (mg/l)',
}

In [10]:
# Disabled scratch code, kept as a bare string literal so it never executes.
# It iterates over a frame named `raw` (not defined in the visible notebook)
# and parses each row's TimeStamp with the same regex split that
# parse_timestamp() below uses.
'''
for ix, row in raw.iterrows():
    # using a regex split
    (month, day, year, hour, minute) = tuple(map(int, re.split('/| |:', row.TimeStamp)))
    dt = datetime(year, month, day, hour, minute)
    print(f'timestamp: {dt}, {type(dt)}')
'''


def parse_timestamp(ds):
    """Parse a sonde timestamp string into a naive ``datetime``.

    Parameters
    ----------
    ds : str
        Timestamp written as ``M/D/YYYY H:MM`` with an optional ``:SS``
        seconds field, e.g. ``'5/1/2023 18:30'``. Surrounding or repeated
        separators (``/``, space, ``:``) are tolerated.

    Returns
    -------
    datetime
        Naive datetime (no tzinfo) built from the parsed fields.

    Raises
    ------
    ValueError
        If a field is not an integer, the string does not contain five or
        six fields, or the fields do not form a valid date/time.
    TypeError
        If ``ds`` is not a string.
    """
    # Split on runs of separators; empty pieces from leading/trailing
    # separators are dropped so stray whitespace does not break int().
    fields = [int(part) for part in re.split(r'[/ :]+', ds) if part]
    if len(fields) not in (5, 6):
        raise ValueError(f'unrecognized timestamp: {ds!r}')

    # Field order in the file is month/day/year, then hour:minute[:second].
    month, day, year, *clock = fields
    return datetime(year, month, day, *clock)

def read_exo(fn, srcdir='.', to_freq='15min'):
    """Load one sonde CSV and return it on a regular, UTC-indexed time grid.

    Parameters
    ----------
    fn : str
        File name of the CSV export inside ``srcdir``. Lines starting with
        ``#`` are skipped. The file must provide a ``TimeStamp`` column that
        ``parse_timestamp`` can read.
    srcdir : str, default '.'
        Directory containing ``fn``.
    to_freq : str, default '15min'
        Pandas frequency string of the output grid.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``DateTimeLocal`` (naive) and ``DateTimeEST``
        (tz-aware), indexed by a UTC ``DateTime`` index at ``to_freq``.
    """
    df = pd.read_csv(os.path.join(srcdir, fn), comment='#', parse_dates=False)

    # Map over the single TimeStamp column instead of applying row-wise over
    # the whole frame; to_datetime guarantees a datetime64 column so the
    # `.dt` accessor below is available.
    df['DateTimeLocal'] = pd.to_datetime(df['TimeStamp'].map(parse_timestamp))
    # NOTE(review): 'EST' is a fixed UTC-05:00 zone (no daylight saving) --
    # confirm the logger records standard time all year.
    df['DateTimeEST'] = df['DateTimeLocal'].dt.tz_localize('EST')

    # Passing the Series (not the column name) keeps DateTimeEST as a column.
    df.set_index(df['DateTimeEST'], inplace=True)
    df.index.rename('DateTime', inplace=True)

    # Put the samples on a 1-minute grid, fill the gaps by interpolation, then
    # keep only the points that fall on the `to_freq` grid.
    # NOTE(review): the recorded notebook run shows a pandas warning on this
    # line -- presumably about interpolating the object-dtype TimeStamp
    # column; confirm and drop/convert that column if so.
    df = df.resample('1min').first().interpolate().resample(to_freq).asfreq()
    df.index = df.index.tz_convert(pytz.utc)

    return df


# Create the output directory up front so the first to_parquet() call does
# not fail when it is missing.
os.makedirs(dstdir, exist_ok=True)

# Convert each discovered CSV and write it next to its siblings as
# <basename>.parquet. `df` stays bound to the last converted frame so it can
# be inspected afterwards.
for file in files:
    df = read_exo(file, srcdir=srcdir, to_freq='15min')
    basename = os.path.splitext(file)[0]
    print(f'Writing: {basename}')

    df.to_parquet(os.path.join(dstdir, f'{basename}.parquet'), index=True)


  df = df.resample('1min').first().interpolate().resample(to_freq).asfreq()


Writing: LCCD_Little_Conestoga_Sonde_Data_May_2023_-_July_202


In [11]:
# Preview the first five rows of the most recently converted frame.
df.head(n=5)

Unnamed: 0_level_0,TimeStamp,Dissolved Oxygen (mg/l),Water Temperature (C),Specific Conductivity (uS/cm),pH,Total Dissolved Solids (mg/l),DateTimeLocal,DateTimeEST
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2023-05-01 23:15:00+00:00,,,,,,,NaT,NaT
2023-05-01 23:30:00+00:00,,10.354,13.5046,,8.023333,,2023-05-01 18:30:00,2023-05-01 18:30:00-05:00
2023-05-01 23:45:00+00:00,,10.479333,13.806267,635.993333,8.017333,413.466667,2023-05-01 18:45:00,2023-05-01 18:45:00-05:00
2023-05-02 00:00:00+00:00,,10.475333,13.8556,637.013333,8.034667,414.0,2023-05-01 19:00:00,2023-05-01 19:00:00-05:00
2023-05-02 00:15:00+00:00,,10.455333,13.917733,638.52,8.04,414.733333,2023-05-01 19:15:00,2023-05-01 19:15:00-05:00


In [None]:
# NOTE(review): `wx_df` is not defined in the visible part of this notebook --
# presumably a weather DataFrame loaded elsewhere; confirm before running.
#
# If there are no duplicate timestamps, go ahead and set the frequency.
# Index the frame by its 'DateTime' column; passing the Series rather than the
# column name keeps 'DateTime' as a regular column as well.
wx_df.set_index(wx_df['DateTime'], inplace=True)

# Leave the inferred frequency alone for now and force a fixed one instead.
# Disabled alternative that used the inferred frequency:
#in_freq = pd.infer_freq(wx_df.index)

#wx_df = wx_df.asfreq(freq=in_freq)
# Conform the frame to a regular 15-minute grid.
wx_df = wx_df.asfreq(freq='15min')

# Maybe resample later. Disabled alternatives (note that `to_freq` is not
# defined at this level of the notebook):
# wx_df = wx_df.resample('1min').first().interpolate().resample(to_freq).asfreq()
# wx_df = wx_df.resample('15min').first()

# Update the index to UTC.
# NOTE(review): tz_convert needs a tz-aware index -- assumes 'DateTime' is
# already localized; confirm.
wx_df.index = wx_df.index.tz_convert(pytz.utc)

In [None]:
# Display the index to inspect its dtype, frequency and timezone.
wx_df.index

In [None]:
# Write the frame to a parquet file in `dstdir`, keeping the index.
# The base name is defined once so the log message and the output path cannot
# drift apart.
wx_basename = 'wx_data'
print(f'Writing: {wx_basename}.parquet')
wx_df.to_parquet(f'{dstdir}/{wx_basename}.parquet', index=True)