In [11]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [39]:
data = {'Port':['La Coruna', 'Lisbon', 'Cadiz', 'Las Palmas', 'Willemstad',
                'Port Royal', 'Havana', 'Nassau', 'Miami', 'New York',
                'New Port', 'Horta', 'Rio'], 'Date from':['28.08.2021', '02.09.2021', '09.09.2021',
                '30.09.2021', '04.11.2021', '13.11.2021', '24.11.2021', '02.12.2021', '07.12.2021',
                '18.12.2021', '05.01.2022', '22.01.2022', '23.02.2022'],
                'Date to':['29.08.2021', '05.09.2021', '16.09.2021', '04.10.2021', '08.11.2021','17.11.2021',
                '28.11.2021', '05.12.2021', '10.12.2021', '04.01.2022', '08.01.2022', '24.01.2022', '26.02.2022']}

df_ports = pd.DataFrame(data)

df_ports['Date from'] = pd.to_datetime(df_ports['Date from'], dayfirst=True, format='%d.%m.%Y')
df_ports['Date to'] = pd.to_datetime(df_ports['Date to'], dayfirst=True, format='%d.%m.%Y')

def remove_ports_df(ports, df):
    df['date'] = pd.to_datetime(df['date'])
    for idx, row in ports.iterrows():
        from_date = row['Date from']
        to_date = row['Date to']
        df = df[(df['date'] < from_date) | (df['date'] > to_date)]
        
    return df

In [13]:
#Takes list of path names as input, concats files and outputs two dataframes (echos and sensors)
def load_files(echo_files, sensor_files):
    echos = pd.DataFrame()
    sensors = pd.DataFrame()
    for path in echo_files:
        dfe = pd.read_csv(path)
        echos = pd.concat([echos, dfe])
        
    for path in sensor_files:
        dfs = pd.read_csv(path).reset_index()   
        sensors = pd.concat([sensors, dfs], ignore_index=True)
    
    sensors = sensors.drop(columns=['index'])
    return echos, sensors

def sensor_for_log_match(df_sensors):
    df_sensors.dataid = pd.to_datetime(df_sensors.dataid, format='%Y-%m-%d %H:%M:%S')
    df_sensors = df_sensors.groupby(pd.Grouper(key="dataid", freq="30T")).median()
    df_sensors = df_sensors.reset_index()
    df_sensors['date'] = df_sensors['dataid'].dt.date
    df_sensors.to_csv('sensors_log_match.csv', index = False)
    return df_sensors

In [40]:
echo, sensor = load_files(['leg2_echo_prepped.csv', 'leg3_echo_prepped.csv','leg4_echo_prepped.csv'],
           ['leg2_prepped.csv', 'leg3_prepped.csv','leg4_prepped.csv'])

fug = pd.read_csv('pco2.csv', sep=';')
df_fug = fug.copy()[["DATETIME", "FCO2_RECOMMENDED"]]
df_fug.rename(columns = {'DATETIME':'dataid', 'FCO2_RECOMMENDED':'FCO2'}, inplace = True)

In [52]:
def combine_files(df_echo, df_sensors, df_fug, filename):
    # datetime type is converted as object when saved to csv, convert back to datetime after reading
    df_echo.dataid = pd.to_datetime(df_echo.dataid, format='%Y-%m-%d %H:%M:%S')
    df_sensors.dataid = pd.to_datetime(df_sensors.dataid, format='%Y-%m-%d %H:%M:%S')
    df_fug.dataid = pd.to_datetime(df_fug.dataid, format='%d/%m/%Y %H:%M')
    
    # group in intervals of 30 min
    df_sensors = df_sensors.groupby(pd.Grouper(key="dataid", freq="30T")).median()
    df_sensors = df_sensors.reset_index()
    df_fug = df_fug.groupby(pd.Grouper(key="dataid", freq="30T")).median()
    df_fug = df_fug.reset_index()

    # join sensor and echo dataframes
    final_df = df_echo.merge(df_sensors, on='dataid', how='outer')
    final_df = final_df.merge(df_fug, on='dataid', how='outer')
    
   
    # Split datetime to date and time separately
    # Date to integer YYYY-MM-DD
    final_df['date'] = final_df['dataid'].dt.date
    final_df['day']=final_df['dataid'].dt.date.apply(lambda x: int(x.strftime("%d")))
    final_df['month']=final_df['dataid'].dt.date.apply(lambda x: int(x.strftime("%m")))
    

    # Time to float, e.g 07:30 == 7.5 and 21:00 == 21
    final_df['time'] = final_df['dataid'].dt.time
    final_df['time']=final_df['time'].apply(lambda x: x.hour + x.minute/60)
    final_df['time']=final_df['time'].astype('float')

    # Drop redundant columns
    final_df = final_df.drop(columns=['dataid', 'PCO2.CO2_umm','PCO2.H2O_mmm'])
    
    # Remove rows with NaN values in target (TOTAL)
    final_df = final_df.dropna(subset = ['TOTAL'])
    
    # Remove measures taken when ship was in harbour
    final_df = remove_ports_df(df_ports, final_df)
    
    # Remove rows where column values are out of acceptable range
    final_df = final_df.drop(final_df[final_df['FerryBox.C3_Temperature'] > 35].index)
    final_df = final_df.reset_index(drop=True)

    # Drop date
    final_df = final_df.drop(columns=['date'])
    
    # Save dataframe to file
    final_df.to_csv(f'{filename}.csv', index = False)

    return final_df


In [53]:
final_df = combine_files(echo, sensor, df_fug, 'leg234_data')

In [54]:
df_fug

Unnamed: 0,dataid,FCO2
0,2021-08-20 18:54:00,357.5717
1,2021-08-20 18:56:00,353.6150
2,2021-08-20 18:58:00,350.6545
3,2021-08-20 18:59:00,349.4867
4,2021-08-20 19:01:00,347.8791
...,...,...
47876,2021-12-17 18:36:00,609.3739
47877,2021-12-17 18:38:00,616.5267
47878,2021-12-17 18:39:00,621.6623
47879,2021-12-17 18:43:00,613.0097
