In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt
import scheme_mar2023 as scheme

In [None]:
plt.style.use('default')   # other built-in styles to try: 'classic', 'Solarize_Light2'

In [None]:
# Load 'BaikalJun2018Sea.dat': tab-separated text, comma as decimal mark,
# '--' treated as missing. File lines 0 and 2 (0-based) are skipped, so the
# column names come from file line 1.
df18 = pd.read_csv(
    'G:/1_Data1/sea/BaikalJun2018Sea.dat',
    sep='\t',
    decimal=',',
    na_values='--',
    skiprows=[0, 2],
    parse_dates=['DateTime'],   # the raw header must contain a 'DateTime' column
    dayfirst=True,              # ambiguous dates are read day-first
    # nrows=1000,               # uncomment for a quick partial load
)
# Rename the raw headers via the project helper
# (presumably maps them onto a common naming scheme -- TODO confirm).
df18.columns = scheme.convert_names(df18.columns)

In [None]:
# Load 'BaikalJun2021Sea.dat': tab-separated text, comma as decimal mark,
# '--' treated as missing. File lines 0, 2 and 3 (0-based) are skipped, so
# the column names come from file line 1.
df21 = pd.read_csv(
    'G:/1_Data1/sea/BaikalJun2021Sea.dat',
    sep='\t',
    decimal=',',
    na_values='--',
    skiprows=[0, 2, 3],
    parse_dates=['DateTime'],   # the raw header must contain a 'DateTime' column
    dayfirst=True,              # ambiguous dates are read day-first
    # nrows=1000,               # uncomment for a quick partial load
)
# Rename the raw headers via the project helper
# (presumably maps them onto a common naming scheme -- TODO confirm).
df21.columns = scheme.convert_names(df21.columns)

In [None]:
# Load 'BaikalJun2022Sea.dat': tab-separated text, comma as decimal mark,
# '--' treated as missing. File lines 0, 2 and 3 (0-based) are skipped, so
# the column names come from file line 1.
df22 = pd.read_csv(
    'G:/1_Data1/sea/BaikalJun2022Sea.dat',
    sep='\t',
    decimal=',',
    na_values='--',
    skiprows=[0, 2, 3],
    parse_dates=['DateTime'],   # the raw header must contain a 'DateTime' column
    dayfirst=True,              # ambiguous dates are read day-first
    # nrows=1000,               # uncomment for a quick partial load
)
# Rename the raw headers via the project helper
# (presumably maps them onto a common naming scheme -- TODO confirm).
df22.columns = scheme.convert_names(df22.columns)

In [None]:
# Load 'BaikalJun2023Sea.dat': tab-separated text, comma as decimal mark,
# '--' treated as missing. File lines 0, 2 and 3 (0-based) are skipped, so
# the column names come from file line 1.
df23 = pd.read_csv(
    'G:/1_Data1/sea/BaikalJun2023Sea.dat',
    sep='\t',
    decimal=',',
    na_values='--',
    skiprows=[0, 2, 3],
    parse_dates=['DateTime'],   # the raw header must contain a 'DateTime' column
    dayfirst=True,              # ambiguous dates are read day-first
    # nrows=1000,               # uncomment for a quick partial load
)
# Rename the raw headers via the project helper
# (presumably maps them onto a common naming scheme -- TODO confirm).
df23.columns = scheme.convert_names(df23.columns)

In [None]:
# Load 'BaikalJun2024Sea.dat'. Unlike the other files read in this notebook,
# this one uses '.' as decimal mark, takes its column names from file line 0
# (lines 1, 2 and 3 are skipped) and is parsed with dayfirst=False.
df24 = pd.read_csv(
    'G:/1_Data1/sea/BaikalJun2024Sea.dat',
    sep='\t',
    decimal='.',
    na_values='--',
    skiprows=[1, 2, 3],
    parse_dates=['DateTime'],   # the raw header must contain a 'DateTime' column
    dayfirst=False,             # careful: date order differs from the other files
    # nrows=1000,               # uncomment for a quick partial load
)
# Rename the raw headers via the project helper
# (presumably maps them onto a common naming scheme -- TODO confirm).
df24.columns = scheme.convert_names(df24.columns)

In [None]:
# Stack the per-file frames row-wise into a single frame with a fresh
# 0..n-1 index; columns missing from a given file are filled with NaN.
df = pd.concat(
    [df18, df21, df22, df23, df24],
    axis=0,
    ignore_index=True,
)
df

In [None]:
# Sanity check: list every column name that occurs more than once in the
# concatenated frame (an empty list means all names are unique).
import collections

cols = df.columns
[name for name, occurrences in collections.Counter(cols).items() if occurrences >= 2]

In [None]:
# Columns kept for the analysis, in the order they will appear in `df`.
# Several quantities are listed under two names (e.g. 'Wnd'/'WindSpeed',
# 'ChnN'/'Channel', 'Precipi'/'Precipitation') -- presumably the naming
# differs between source files; TODO confirm.
cols_we_need = [
    # time and position
    'DateTime', 'Longitude', 'Latitude',
    # gas readings
    'pCO2wtr', 'cCO2wtr', 'pCH4wtr', 'cCH4wtr', 'pCO2air', 'pCH4air',
    'dpCO2', 'dpCH4',
    'vCO2air', 'vCH4air',
    'ChnN', 'Channel',
    # pressure, light, wind, air temperature, precipitation
    'PressAir',
    'Piro', 'PiroUV', 'LightLX', 'LightUV', 'PiroVisio',
    'Wnd', 'WindSpeed',
    'TempAir',
    'Precipi', 'Precipitation',
    # flows and temperatures
    'AirFlow',
    'TempEqu1', 'TempEqu2', 'TempEqu3', 'Twtr',
    'WaterFlow1', 'WaterFlow2', 'WaterFlow3',
]
# Selecting by label list raises KeyError if any listed column is absent.
df = df[cols_we_need]

In [None]:
# Discard rows that lack a timestamp or a position, then renumber the
# remaining rows 0..n-1.
df = (
    df.dropna(axis='index', subset=['DateTime', 'Longitude', 'Latitude'])
      .reset_index(drop=True)
)
df

In [None]:
## DateTime
# Put the rows in chronological order, then derive helper time columns.
# The index is not renumbered here, so index labels are no longer monotonic.
df = df.sort_values(by='DateTime')
df['year'] = df['DateTime'].dt.year
# Hour of day as a decimal number (hours + minutes/60; seconds are ignored).
df['time'] = df['DateTime'].dt.hour + df['DateTime'].dt.minute / 60
# Seconds since 1970-01-01 as float. Computed with timedelta arithmetic so the
# result does not depend on the storage resolution (ns/us/ms/s) of the
# datetime column, which a raw int64 cast divided by 10**9 silently assumes.
# NOTE(review): assumes 'DateTime' is timezone-naive -- confirm.
df['DateTimeSec'] = (df['DateTime'] - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)
# Seconds elapsed since the previous row (NaN for the first row). Rows from
# different source files are adjacent here, so gaps between files show up as
# very large values.
df['dt'] = df['DateTimeSec'] - df['DateTimeSec'].shift(1)

In [None]:
# Work on a copy so `df` stays untouched. To restrict the analysis to a
# single year, use instead e.g.:
#   dft = df.loc[df['year'] == 2021, :].copy()
dft = df.copy()
# Smooth WaterFlow1 with a trailing mean over 20 consecutive rows
# (a row-count window, not a time window).
dft['WaterFlow'] = dft['WaterFlow1'].rolling(window=20).mean()
# Row-to-row change of the smoothed flow divided by the elapsed seconds.
dft['dWaterFlow'] = (dft['WaterFlow'] - dft['WaterFlow'].shift(1)) / dft['dt']
# Blank out rates outside the +/-0.004 band.
dft.loc[
    (dft['dWaterFlow'] > 0.004) | (dft['dWaterFlow'] < -0.004),
    'dWaterFlow',
] = np.nan

In [None]:
# Overview figure: three panels sharing the same kind of x axis (DateTime).
# Note: this changes the default figure size for all later figures as well.
plt.rcParams['figure.figsize'] = [17, 8]
# plt.subplots(3) creates three rows in one column, i.e. the panels are
# stacked vertically.
fig, (ax1, ax2, ax3) = plt.subplots(3)
fig.suptitle('WaterFlow1, dWaterFlow and cCO2wtr vs. DateTime')

# Top panel: raw WaterFlow1 readings.
ax1.scatter('DateTime', 'WaterFlow1', data=dft, s=2, alpha=0.1, c='red')
ax1.set_ylabel('WaterFlow1')
ax1.grid()

# Middle panel: rate of change of the smoothed flow (out-of-band values are NaN).
ax2.scatter('DateTime', 'dWaterFlow', data=dft, s=2, alpha=0.1)
ax2.set_ylabel('dWaterFlow')
ax2.grid()

# Bottom panel: cCO2wtr readings.
ax3.scatter('DateTime', 'cCO2wtr', data=dft, s=2, alpha=0.1)
ax3.set_ylabel('cCO2wtr')
ax3.set_xlabel('DateTime')
ax3.grid()

In [None]:
# Keep only rows with a usable flow-rate value. Because dWaterFlow was already
# limited to a narrow band upstream, the comparison mainly drops NaN rows
# (NaN < 1.0 is False).
dftt = dft.loc[dft['dWaterFlow'] < 1.000].copy()
# Mean of every numeric column per hour of day.
# The previous `.agg('mean', 'std')` did not compute a standard deviation:
# the extra string is forwarded as a positional argument to the mean reducer.
# numeric_only=True is stated explicitly because the datetime 'DateTime'
# column must be excluded from the result -- otherwise reset_index() could not
# insert the grouping key, which is also named 'DateTime'.
dfg = (
    dftt.groupby(by=dftt['DateTime'].dt.hour)
        .mean(numeric_only=True)
        .reset_index(drop=False)   # 'DateTime' column now holds the hour (0-23)
)

In [None]:
# cCO2wtr against time of day.
# Lime dots: individual rows, x = decimal hour of day ('time').
plt.scatter(x='time', y='cCO2wtr', data=dftt, c='lime', s=2)
# Red dots: per-hour means; in `dfg` the 'DateTime' column holds the hour.
plt.scatter(x='DateTime', y='cCO2wtr', data=dfg, c='red')
plt.grid()
plt.ylim(0.5, 1.5)