### Our first goal is to import raw data from MATLAB files and coerce it into DataFrames

In [1]:
## Imports
from scipy import io
import pandas as pd
import numpy as np
import os

In [2]:
## Load Matlab files
# Absolute locations of the three raw MATLAB exports.
file_biomass = r'C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\raw\biomass_abund.mat'
file_landings = r'C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\raw\landings.mat'
file_ecosystem = r'C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\raw\metadata_ecosystem.mat'

# Read every file with scipy.io.loadmat; each result is a dict keyed by variable name.
mat_biomass, mat_landings, mat_ecosystem = [
    io.loadmat(mat_path)
    for mat_path in (file_biomass, file_landings, file_ecosystem)
]

In [3]:
## Coerce mat_biomass to df_biomass
# Bookkeeping entries that loadmat stores alongside the variables; not part of the data.
remove_keys = ['__header__', '__version__', '__globals__']

# Build a fresh dict instead of popping from / reassigning into mat_biomass, so the
# cell can be re-run on the same kernel without a KeyError from pop() and without
# unwrapping an already-unwrapped value a second time.
biomass_vars = {
    name: (value[0] if len(value) == 1 else value)  # unwrap a length-1 outer axis
    for name, value in mat_biomass.items()
    if name not in remove_keys
}

# One dict entry per row (orient='index'), then transpose so each variable is a column.
df_biomass = pd.DataFrame.from_dict(biomass_vars, orient='index').T

In [4]:
## Coerce mat_landings to df_landings
# Bookkeeping entries that loadmat stores alongside the variables; not part of the data.
remove_keys = ['__header__', '__version__', '__globals__']

# Build a fresh dict instead of popping from / reassigning into mat_landings, so the
# cell can be re-run on the same kernel without a KeyError from pop() and without
# unwrapping an already-unwrapped value a second time.
landings_vars = {
    name: (value[0] if len(value) == 1 else value)  # unwrap a length-1 outer axis
    for name, value in mat_landings.items()
    if name not in remove_keys
}

# One dict entry per row (orient='index'), then transpose so each variable is a column.
df_landings = pd.DataFrame.from_dict(landings_vars, orient='index').T

In [5]:
## Coerce mat_ecosystem to df_ecosystem
# Bookkeeping entries that loadmat stores alongside the variables; not part of the data.
remove_keys = ['__header__', '__version__', '__globals__']

# Filter into a new dict rather than popping from mat_ecosystem, so re-running this
# cell on the same kernel does not raise KeyError for keys that are already gone.
ecosystem_vars = {
    name: value
    for name, value in mat_ecosystem.items()
    if name not in remove_keys
}

# One dict entry per row (orient='index'), then transpose so each variable is a column.
df_ecosystem = pd.DataFrame.from_dict(ecosystem_vars, orient='index').T

### Our second goal is to load and coerce AMO text data

In [6]:
## Load AMO data, to df with read_table
# Adapted from: https://stackoverflow.com/questions/48063620/pandas-read-csv-for-multiple-delimiters
file_AMO = r'C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\raw\amon.sm.long.data'
amo_columns = ['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# sep: raw string so `\s` is not an invalid escape sequence; the former extra
#   alternatives (runs of spaces) could never match because `\s+` is tried first.
# header=None + names: previously the first line after the skipped one was consumed
#   as the header and then overwritten with these names, discarding that line.
#   NOTE(review): assumes that line is a data row, not a header — confirm against the raw file.
df_AMO = pd.read_table(
    file_AMO,
    sep=r'\s+',
    skiprows=1,  # skip the file's first line
    header=None,
    names=amo_columns,
    engine='python',
)

In [7]:
# Keep only the rows that carry a four-character year and a usable December value.
# A missing Dec value compares as False, so rows without one are dropped as well.
dec_check = df_AMO['Dec'].gt(-99)
# Year is handled as text here: exactly four characters are required.
year_check = df_AMO['Year'].str.len().eq(4)
df_AMO = df_AMO.loc[dec_check & year_check]

### Our third goal is to load global ocean temperature anomaly data

In [8]:
# Collect every entry in the timeseries folder, then keep the annual ocean series only.
# Adapted from: https://www.geeksforgeeks.org/python-list-files-in-a-directory/
temperature_folder = r"C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\raw\timeseries"
dir_list = pd.Series(os.listdir(temperature_folder))
# Plain substring match on the file name.
ocean_files = list(filter(lambda entry: 'ann.ocean.' in entry, dir_list))
print(ocean_files)

['aravg.ann.ocean.00N.30N.v4.0.1.201711.asc', 'aravg.ann.ocean.00N.90N.v4.0.1.201711.asc', 'aravg.ann.ocean.20N.90N.v4.0.1.201711.asc', 'aravg.ann.ocean.20S.20N.v4.0.1.201711.asc', 'aravg.ann.ocean.30N.60N.v4.0.1.201711.asc', 'aravg.ann.ocean.30S.00N.v4.0.1.201711.asc', 'aravg.ann.ocean.60N.90N.v4.0.1.201711.asc', 'aravg.ann.ocean.60S.30S.v4.0.1.201711.asc', 'aravg.ann.ocean.60S.60N.v4.0.1.201711.asc', 'aravg.ann.ocean.90S.00N.v4.0.1.201711.asc', 'aravg.ann.ocean.90S.20S.v4.0.1.201711.asc', 'aravg.ann.ocean.90S.60S.v4.0.1.201711.asc', 'aravg.ann.ocean.90S.90N.v4.0.1.201711.asc']


These are all the time-series files for annual ocean temperature anomalies. For each landings region listed in our fishery metadata, I looked up its latitude:

Long_Island_Sound   41.0748° N

Georges_Bank    40.8155° N

Gulf_of_Maine   43.1336° N

Mid_Atlantic_Bight  39° 25' 1.2" N

Southern_New_England   40° N

https://www.fisheries.noaa.gov/new-england-mid-atlantic/commercial-fishing/southern-new-england-exemption-area

Chesapeake_Bay  37.5214° N

Narragansett_Bay    41.6220° N

Hudson_River (estuary)  40°42' N

Connecticut_Shoreline   41.2700° N

Delaware_Bay    39.1202° N

Ambrose_Channel 40.488215° N

We could subset the data to the latitudes above; however, each file covers the whole globe within its latitude band, so oceans other than the Atlantic are included in every dataset. The decision is therefore to use the global temperature anomalies for the full latitude range (90°S to 90°N).

In [9]:
ocean_file = [file for file in ocean_files if '90S.90N' in file] # select file for full latitude range

ocean_temp_columns = [
    'year',
    'anomaly of temperature (K)',
    'total error variance (K**2)',
    'high-frequency error variance (K**2)',
    'low-frequency error variance (K**2)',
    'bias error variance (K**2)',
]

# The file's first line is data, not a header. Reading with header=None and explicit
# names keeps that line as an ordinary numeric row; routing it through the column
# labels and back turned its values into strings and gave the row the label 'index'.
# os.path.join and the raw-string separator also remove the invalid `\{` and `\s`
# escape sequences.
df_ocean_temp = pd.read_table(
    os.path.join(temperature_folder, ocean_file[0]),
    sep=r'\s+',
    header=None,
    names=ocean_temp_columns,
    engine='python',
)
# Guarantee an integer year column regardless of how the values were inferred.
df_ocean_temp = df_ocean_temp.astype({'year': 'int'})

### Data cleaning will occur in the next notebook, for now write to CSV.

In [17]:
csv_folder = r"C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\interim"

# Output file name -> frame to write, in the order the files are written.
interim_frames = {
    'biomass_data.csv': df_biomass,
    'landings_data.csv': df_landings,
    'ecosystem_data.csv': df_ecosystem,
    'AMO_data.csv': df_AMO,
    'ocean_temp_data.csv': df_ocean_temp,
}
for csv_name, frame in interim_frames.items():
    # Same backslash-joined target path as before; the frame's index is written too.
    frame.to_csv(f"{csv_folder}\\{csv_name}")