# Data pre-processing. PSZMP

PSZMP dataset.

2/10-9,6,4, 1/31/2024

In [1]:
import json
from datetime import datetime, timezone
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

from data_preprocess import read_and_parse_sourcedata

## Read the data

In [2]:
# Folder holding the input files: the current working directory (same as Path(".")).
data_pth = Path()

Set to `True` when debugging. `csv` files will not be exported when `debug_no_csvexport = True`

In [3]:
# Debug switch: per the note above this cell, csv files are meant to be
# skipped (not exported) whenever this flag is True.
debug_no_csvexport = False

## Process JSON file containing common mappings and strings

In [4]:
# Load the JSON file of common mappings and strings.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open()
# otherwise falls back to a platform-dependent default encoding.
with open(data_pth / 'common_mappings.json', encoding='utf-8') as f:
    common_mappings = json.load(f)

In [5]:
# Pull the individual entries out of the common-mappings dictionary.
DatasetCode, cruises, stations, net_tow = (
    common_mappings[key]
    for key in ('datasetcode', 'cruises', 'stations', 'net_tow')
)

iso8601_format, CRS = (
    common_mappings[key] for key in ('iso8601_format', 'CRS')
)

## Pre-process data from csv for Event table

### Read the pre-processed csv file

`usecols` defines the columns that will be kept and the order in which they'll be organized

In [6]:
# Column selection used for the Hood Canal dataset, kept for reference:
# usecols = [
#     'sample_code', 'mesh_size', 'FWC_DS',
#     'station', 'latitude', 'longitude',
#     'date', 'time_start', 'time', 'day_night',
#     'depth_min', 'depth_max',
#     'net_code', 'extra_sample_token'
# ]

# Source columns to keep, listed in the order they will appear in the DataFrame.
usecols = [
    'Sample Code',
    'Station',
    'Latitude',
    'Longitude',
    'Site Name',
    'Basin',
    'Sampling Group',
    'Sample Date',
    'Sample Time',
    'Day_Night',
    'time',
    'Min Tow Depth (m)',
    'Max Tow Depth (m)',
    'Station Depth (m)',
    'Mesh Size',
    'Tow Type',
]

# Source column name -> new column name.
# TODO: Rename more columns, if needed
renamed_columns = {
    'Sample Code': 'sample_code',
    'Station': 'station',
    'Latitude': 'latitude',
    'Longitude': 'longitude',
    'Min Tow Depth (m)': 'depth_min',
    'Max Tow Depth (m)': 'depth_max',
    'Mesh Size': 'mesh_size',
}

# Variant that passes test_n_rows, kept for reference:
# eventsource_df = read_and_parse_sourcedata(test_n_rows=1000)[usecols]
eventsource_df = read_and_parse_sourcedata()[usecols].rename(columns=renamed_columns)

In [7]:
# Number of rows read from the source, before any de-duplication.
eventsource_df.shape[0]

153825

In [8]:
# Preview the first five rows.
eventsource_df.iloc[:5]

Unnamed: 0,sample_code,station,latitude,longitude,Site Name,Basin,Sampling Group,Sample Date,Sample Time,Day_Night,time,depth_min,depth_max,Station Depth (m),mesh_size,Tow Type
0,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical
1,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical
2,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical
3,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical
4,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical


### Remove duplicates

Will return only unique samples, where one row = one sample.

In [9]:
# Collapse identical rows, order by sample code, and renumber the index from 0.
eventsource_df = (
    eventsource_df
    .drop_duplicates()
    .sort_values(by='sample_code')
    .reset_index(drop=True)
)
# Row count after de-duplication.
eventsource_df.shape[0]

3567

- There are 51 unique stations
- Adding 'Tow Type' didn't lead to duplication of stations. ie, each station has only one tow type.
- Adding 'Station Depth (m)' did lead to huge duplication
- Darn, adding 'Sampling Group' led to some duplication (60 records instead of 51). ie, some stations were visited by more than one 'Sampling Group'

In [10]:
# Columns used below to build the table of unique station records.
stations_cols = [
    'station',
    'latitude',
    'longitude',
    'Site Name',
    'Basin',
    'Sampling Group',
    'Tow Type',
]

In [11]:
# One row per distinct combination of the station columns, ordered by station.
stations_unique_df = (
    eventsource_df[stations_cols]
    .drop_duplicates()
    .sort_values(by='station')
    .reset_index(drop=True)
)
stations_unique_df.shape[0]

60

In [12]:
# Distinct 'Sampling Group' values, listed in sorted order.
sorted(stations_unique_df['Sampling Group'].unique())

['HCSEG/DOE',
 'KC',
 'KWT',
 'LUM',
 'NIT',
 'NOAA',
 'NOAA/STIL',
 'P Stations',
 'PGST/WDFW',
 'TUL']

In [13]:
# Number of unique station records belonging to each 'Sampling Group'.
stations_unique_df['Sampling Group'].value_counts()

NIT           10
PGST/WDFW      8
TUL            8
KWT            8
NOAA           6
KC             6
NOAA/STIL      5
LUM            4
P Stations     3
HCSEG/DOE      2
Name: Sampling Group, dtype: int64

In [14]:
# Display up to the first 60 unique station records.
stations_unique_df.head(60)

Unnamed: 0,station,latitude,longitude,Site Name,Basin,Sampling Group,Tow Type
0,ADID,48.00274,-122.6374,Admiralty Inlet,Admiralty Inlet,PGST/WDFW,Oblique
1,ADIM,47.99915,-122.6664,Admiralty Inlet,Admiralty Inlet,PGST/WDFW,Oblique
2,ADIS,47.99063,-122.6872,Admiralty Inlet,Admiralty Inlet,PGST/WDFW,Oblique
3,ADIV,48.00273,-122.636,Admiralty Inlet,Admiralty Inlet,PGST/WDFW,Vertical
4,CAMD,48.06646,-122.3956,Camano Head,Whidbey Basin,TUL,Oblique
5,CAMM,48.06536,-122.3905,Camano Head,Whidbey Basin,TUL,Oblique
6,CAMS,48.06469,-122.3886,Camano Head,Whidbey Basin,TUL,Oblique
7,CAMV,48.05901,-122.3873,Camano Head,Whidbey Basin,TUL,Vertical
8,COW1S,48.68448,-123.04,Cowlitz,San Juan,KWT,Oblique
9,COW2D,48.68303,-123.0412,Cowlitz,San Juan,KWT,Oblique


Generate unique station records that also include `time`. That is, each unique station visit.

In [49]:
# Unique station records extended with `time`, ordered by station.
stationvisits_unique_df = (
    eventsource_df[[*stations_cols, 'time']]
    .drop_duplicates()
    .sort_values(by='station')
    .reset_index(drop=True)
)
stationvisits_unique_df.shape[0]

3567

Same number of records as `eventsource_df`! Note that `stationvisit_df` in the Event notebook is just slightly smaller, with 3551 records. That DF is based on a groupby where time and depth are summarized to their min & max.

In [50]:
# Check whether the station-visit table has exactly as many rows as eventsource_df.
len(stationvisits_unique_df) == len(eventsource_df)

True

### Come up with a "cruise code" based on Sampling Group and a date or counter

- Ideally the date string would be yyyymm
- The challenge arises when, for a given outing, the Sampling Group went out over a period that spanned two months (e.g., 5/31 & 6/1). The scheme would then break that outing up into separate outings
- Start by just testing the yyyymm approach, then examine the results to decide next steps
- But maybe it's ok not to try to aggregate every single unique outing into a common cruise code! Or it's something we could revisit later if we think it's worthwhile.

In [25]:
# Year+month string (e.g. "201801") derived from each row's `time` value.
eventsource_df['date_yyyymm'] = eventsource_df['time'].apply(
    lambda sample_time: sample_time.strftime("%Y%m")
)

# Alternative kept for reference: look the code up in the `cruises` mapping.
# eventsource_df['cruise_code'] = eventsource_df['date_yyyymm'].apply(lambda s: cruises[s])

# Cruise code = "<yyyymm>_<Sampling Group>".
eventsource_df['cruise_code'] = eventsource_df['date_yyyymm'] + "_" + eventsource_df['Sampling Group']

In [26]:
# Number of distinct yyyymm-based cruise codes (missing values, if any, are counted too).
eventsource_df['cruise_code'].nunique(dropna=False)

638

In [27]:
# Preview the first five rows, now including the date_yyyymm and cruise_code columns.
eventsource_df.head()

Unnamed: 0,sample_code,station,latitude,longitude,Site Name,Basin,Sampling Group,Sample Date,Sample Time,Day_Night,time,depth_min,depth_max,Station Depth (m),mesh_size,Tow Type,date_yyyymm,cruise_code
0,010218ELIV1151,ELIV,48.63795,-122.5694,Eliza Island,Bellingham Bay,LUM,2018-01-02 00:00:00,11:51:00,D,2018-01-02 11:51:00-07:00,0.0,110.0,120.7,200,Vertical,201801,201801_LUM
1,010322KSBP01D0815,KSBP01D,47.74396,-122.4282,Point Jefferson,Central Basin,KC,2022-01-03 00:00:00,08:15:00,D,2022-01-03 08:15:00-07:00,22.0,0.0,275.0,335,Oblique,202201,202201_KC
2,010422LSNT01D1323,LSNT01D,47.53333,-122.4333,Point Williams,Central Basin,KC,2022-01-04 00:00:00,13:23:00,D,2022-01-04 13:23:00-07:00,38.0,0.0,210.0,335,Oblique,202201,202201_KC
3,010422LSNT01V1305,LSNT01V,47.53333,-122.4333,Point Williams,Central Basin,KC,2022-01-04 00:00:00,13:05:00,D,2022-01-04 13:05:00-07:00,0.0,200.0,210.0,200,Vertical,202201,202201_KC
4,010422NSEX01V1049,NSEX01V,47.35862,-122.3871,East Passage,Central Basin,KC,2022-01-04 00:00:00,10:49:00,D,2022-01-04 10:49:00-07:00,0.0,170.0,180.0,200,Vertical,202201,202201_KC


In [47]:
# Tabulate how many rows carry each yyyymm-based cruise code and save the
# table as a csv file.
# - rename_axis()/reset_index(name=...) label both columns explicitly, so the
#   result does not depend on how value_counts() names its output (that
#   naming changed in pandas 2.0).
# - to_csv() returns None when given a path, so its result is not assigned.
# - The export is skipped when debug_no_csvexport is True.
cruise_code_counts_df = (
    eventsource_df['cruise_code'].value_counts()
    .rename_axis('cruise_code')
    .reset_index(name='count')
    .sort_values(by='cruise_code')
)
if not debug_no_csvexport:
    cruise_code_counts_df.to_csv("./cruise_codes-fromSGandYYYYMM.csv", index=False)

Let's see what happens if we include the day (YYYYMMDD) in the cruise code 

In [53]:
# Number of distinct calendar days (yyyymmdd) found in the `time` column.
sample_days = eventsource_df['time'].apply(lambda sample_time: sample_time.strftime("%Y%m%d"))
sample_days.nunique(dropna=False)

1045

In [54]:
# Year+month+day string (e.g. "20180102") derived from each row's `time` value.
eventsource_df['date_yyyymmdd'] = eventsource_df['time'].apply(
    lambda sample_time: sample_time.strftime("%Y%m%d")
)

# Day-level cruise code = "<yyyymmdd>_<Sampling Group>".
eventsource_df['cruise_code_yyyymmdd'] = eventsource_df['date_yyyymmdd'] + "_" + eventsource_df['Sampling Group']

In [55]:
# Number of distinct day-level cruise codes (missing values, if any, are counted too).
eventsource_df['cruise_code_yyyymmdd'].nunique(dropna=False)

1332

In [64]:
# One row per distinct combination of both cruise codes and the station
# columns, ordered by day-level cruise code and then station.
# (An earlier variant selected and sorted on 'cruise_code' alone.)
cruisecode_yyyymmdd_unique_df = (
    eventsource_df[['cruise_code', 'cruise_code_yyyymmdd', *stations_cols]]
    .drop_duplicates()
    .sort_values(by=['cruise_code_yyyymmdd', 'station'])
    .reset_index(drop=True)
)
cruisecode_yyyymmdd_unique_df.shape[0]

3567

In [73]:
# Inspect the last 20 rows of the cruise-code table.
cruisecode_yyyymmdd_unique_df.tail(20)

Unnamed: 0,cruise_code,cruise_code_yyyymmdd,station,latitude,longitude,Site Name,Basin,Sampling Group,Tow Type
3547,202211_KWT,20221113_KWT,WAT1V,48.43457,-122.8037,Watmough Bay,San Juan,KWT,Vertical
3548,202211_KC,20221114_KC,KSBP01D,47.74396,-122.4282,Point Jefferson,Central Basin,KC,Oblique
3549,202211_KC,20221114_KC,KSBP01V,47.74396,-122.4282,Point Jefferson,Central Basin,KC,Vertical
3550,202211_KC,20221115_KC,LSNT01D,47.53333,-122.4333,Point Williams,Central Basin,KC,Oblique
3551,202211_KC,20221115_KC,LSNT01V,47.53333,-122.4333,Point Williams,Central Basin,KC,Vertical
3552,202211_KC,20221115_KC,NSEX01V,47.35862,-122.3871,East Passage,Central Basin,KC,Vertical
3553,202212_KC,20221205_KC,KSBP01D,47.74396,-122.4282,Point Jefferson,Central Basin,KC,Oblique
3554,202212_KC,20221205_KC,KSBP01V,47.74396,-122.4282,Point Jefferson,Central Basin,KC,Vertical
3555,202212_KC,20221206_KC,LSNT01D,47.53333,-122.4333,Point Williams,Central Basin,KC,Oblique
3556,202212_KC,20221206_KC,LSNT01V,47.53333,-122.4333,Point Williams,Central Basin,KC,Vertical


In [68]:
# Number of distinct day-level cruise codes in the table (missing values, if any, are counted too).
cruisecode_yyyymmdd_unique_df['cruise_code_yyyymmdd'].nunique(dropna=False)

1332

In [74]:
# Number of rows in the table for each day-level cruise code, most frequent first.
cruisecode_yyyymmdd_unique_df['cruise_code_yyyymmdd'].value_counts()

20151021_PGST/WDFW    8
20140619_NIT          8
20140904_NIT          8
20140624_NIT          8
20150608_TUL          8
                     ..
20180824_NOAA         1
20211216_LUM          1
20160330_NIT          1
20220103_KC           1
20181212_NIT          1
Name: cruise_code_yyyymmdd, Length: 1332, dtype: int64

## Mesh size and tow type

Only two sets occur: Vertical+200um and Oblique+335um. That makes it easy to handle! Is it the same as the Hood Canal dataset?

In [76]:
# Count the rows for each (Tow Type, mesh_size) combination that occurs.
eventsource_df[['Tow Type', 'mesh_size']].value_counts()

Tow Type  mesh_size
Vertical  200          1802
Oblique   335          1765
dtype: int64

## Day vs Night sampling

In [77]:
# Count rows per Day_Night value.
# NOTE: value_counts() leaves out missing values by default, so these counts
# can add up to fewer rows than len(eventsource_df).
eventsource_df['Day_Night'].value_counts()

D    3331
N     220
Name: Day_Night, dtype: int64

## Package versions

In [16]:
# Print the current UTC time and the pandas / geopandas versions.
# datetime.utcnow() is deprecated as of Python 3.12. Taking an aware "now"
# in UTC and stripping its tzinfo yields the same naive UTC value, so the
# printed text keeps its existing format.
run_time_utc = datetime.now(timezone.utc).replace(tzinfo=None)
print(
    f"{run_time_utc} +00:00\n"
    f"pandas: {pd.__version__}, geopandas: {gpd.__version__}"
)

2024-02-10 21:23:41.080205 +00:00
pandas: 1.5.3, geopandas: 0.12.2
