In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn

In [None]:
def load_files(echo_files, sensor_files):
    """Read and stack the echo and sensor CSV files.

    Parameters
    ----------
    echo_files : iterable of path-like
        CSV files holding echo data; read in the given order.
    sensor_files : iterable of path-like
        CSV files holding sensor data; read in the given order.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(echos, sensors)``: the row-wise concatenation of each group of
        files. Each file keeps its own row index (no ``ignore_index``), so
        index labels repeat across files. An empty list of paths yields an
        empty DataFrame.
    """
    def _read_all(paths):
        # Read every file first and concatenate once: growing a frame inside
        # the loop re-copies all previously loaded rows on each iteration.
        frames = [pd.read_csv(path) for path in paths]
        # pd.concat raises ValueError on an empty list, so the
        # "no files -> empty frame" result is kept explicitly.
        return pd.concat(frames) if frames else pd.DataFrame()

    return _read_all(echo_files), _read_all(sensor_files)


In [None]:
# Prepped per-leg CSV files (legs 2-4), listed in the order they are concatenated.
echo_csv_paths = ['leg2_echo_prepped.csv', 'leg3_echo_prepped.csv', 'leg4_echo_prepped.csv']
sensor_csv_paths = ['leg2_prepped.csv', 'leg3_prepped.csv', 'leg4_prepped.csv']

echo, sensor = load_files(echo_csv_paths, sensor_csv_paths)

In [None]:
def combine_files(df_echo, df_sensors, freq='30min', salinity_min=30, conductivity_min=3):
    """Clean the sensor data, bin it in time and join it with the echo data.

    Steps:

    1. Parse ``dataid`` in both frames from ``'%Y-%m-%d %H:%M:%S'`` text
       (datetimes come back as plain text after a CSV round trip).
    2. Replace SBE45 salinity / conductivity readings at or below the given
       thresholds with the column median. Per the original notes these
       outliers come from the nightly freshwater rinse of the FerryBox
       system; open-Atlantic salinity is roughly 33-37 (the original note
       said "ppm" -- presumably PSU/ppt, TODO confirm).
    3. Fill remaining NaNs in numeric columns with the column median.
    4. Take the median of the numeric sensor columns per ``freq`` interval.
    5. Outer-join echo and binned sensor rows on ``dataid``.
    6. Add ``date`` (int ``YYYYMMDD``) and ``time`` (hours as float, e.g.
       07:30 -> 7.5) and convert ``dataid`` itself to a number.

    Parameters
    ----------
    df_echo : pandas.DataFrame
        Echo data with a ``dataid`` timestamp column. Not modified.
    df_sensors : pandas.DataFrame
        Sensor data with ``dataid``, ``FerryBox.SBE45_Salinity`` and
        ``FerryBox.SBE45_Conductivity`` columns. Not modified.
    freq : str, default '30min'
        Pandas offset alias for the sensor bin width.
    salinity_min : float, default 30
        Salinity readings ``<=`` this value are treated as outliers.
    conductivity_min : float, default 3
        Conductivity readings ``<=`` this value are treated as outliers.

    Returns
    -------
    pandas.DataFrame
        The joined frame with the extra ``date`` and ``time`` columns.

    Raises
    ------
    ValueError
        If a ``dataid`` value does not match the expected timestamp format,
        or if a missing ``dataid`` makes the integer ``date`` conversion
        impossible.

    Notes
    -----
    Non-numeric sensor columns are dropped by the per-bin median.
    NOTE(review): a time-based ``Grouper`` also emits a row for bins that
    contain no readings (e.g. gaps between legs); those rows are all-NaN and
    are kept by the outer join -- confirm this is intended.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Work on copies so the caller's frames stay untouched and re-running the
    # calling cell always starts from the same inputs.
    df_echo = df_echo.copy()
    df_sensors = df_sensors.copy()

    df_echo['dataid'] = pd.to_datetime(df_echo['dataid'], format=timestamp_format)
    df_sensors['dataid'] = pd.to_datetime(df_sensors['dataid'], format=timestamp_format)

    # Outlier repair: readings at or below the threshold become the column
    # median (computed before replacement, as in the original implementation).
    outlier_thresholds = {
        'FerryBox.SBE45_Salinity': salinity_min,
        'FerryBox.SBE45_Conductivity': conductivity_min,
    }
    for column, threshold in outlier_thresholds.items():
        column_median = df_sensors[column].median()
        df_sensors[column] = np.where(
            df_sensors[column] <= threshold, column_median, df_sensors[column]
        )

    # numeric_only=True: since pandas 2.0 the default no longer skips
    # non-numeric columns, and text columns would make median() raise.
    df_sensors = df_sensors.fillna(df_sensors.median(numeric_only=True))

    # 'min' is the non-deprecated spelling of the minute alias ('T').
    df_sensors = (
        df_sensors
        .groupby(pd.Grouper(key='dataid', freq=freq))
        .median(numeric_only=True)
        .reset_index()
    )

    final_df = df_echo.merge(df_sensors, on='dataid', how='outer')

    # Vectorised split of the timestamp into an integer date and float hours.
    timestamps = final_df['dataid']
    final_df['date'] = (
        timestamps.dt.year * 10000 + timestamps.dt.month * 100 + timestamps.dt.day
    ).astype('int64')
    final_df['time'] = (timestamps.dt.hour + timestamps.dt.minute / 60).astype('float')

    # The original timestamp is kept for now, as its integer representation.
    final_df['dataid'] = pd.to_numeric(timestamps)

    return final_df


In [None]:
# Join the loaded echo rows with the cleaned, time-binned sensor readings.
final_df = combine_files(df_echo=echo, df_sensors=sensor)

In [None]:
# Pairwise correlations between the numeric columns of the merged frame.
# Restricting to numeric columns keeps corr() from failing on text columns
# (pandas >= 2.0 no longer drops them silently).
corrMatrix = final_df.select_dtypes(include='number').corr()

# Explicit figure/axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(30, 20))
heatmap = sn.heatmap(corrMatrix, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=ax)
heatmap.set_title('Correlation Heatmap Sensors', fontdict={'fontsize': 18}, pad=12)

# Write the image before plt.show(): once shown, pyplot may close the figure
# (backend-dependent), so saving afterwards is not reliable.
figure = heatmap.get_figure()
figure.savefig('corr.jpg', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Show the merged frame as this cell's output.
final_df

In [None]:
# Summary statistics (pandas `describe`) for the merged frame.
final_df.describe()

In [89]:
# Write the merged frame to disk as CSV, leaving out the row index.
# NOTE(review): the target name has no '.csv' extension -- confirm downstream
# readers expect exactly 'leg234_data' before renaming it.
final_df.to_csv(path_or_buf='leg234_data', index=False)

In [None]:
# NOTE(review): `ferrybox_data_c` is not defined in any cell visible here, so
# this cell relies on earlier kernel state -- confirm it survives
# Restart Kernel -> Run All.
ferrybox_data_c = pd.DataFrame(ferrybox_data_c)

# points_from_xy expects x first and y second; for geographic coordinates that
# is longitude, then latitude. The original call passed them the other way
# round, which swaps the axes of every point.
# NOTE(review): no CRS is attached to the geometry -- presumably the positions
# are WGS84 (EPSG:4326); confirm and pass `crs=` if so.
ferrybox_data_c = gpd.GeoDataFrame(
    ferrybox_data_c,
    geometry=gpd.points_from_xy(x=ferrybox_data_c.longitude, y=ferrybox_data_c.latitude),
)

# Drop incomplete rows *before* turning timestamps into strings: astype(str)
# converts missing timestamps into ordinary text, which dropna() would then
# no longer recognise as missing.
ferrybox_data_c = (
    ferrybox_data_c
    .reset_index(level=0)
    .dropna()
    .assign(timestamp=lambda frame: frame['timestamp'].astype(str))
)