In [None]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import afscgap
import plotly.express as px

In [None]:
def give_it_a_go(result, method_name, **kwargs):
    """
    Call ``result.<method_name>(**kwargs)`` and hand back whatever it returns.

    An AssertionError raised while looking up or calling the method is
    swallowed and None is returned instead; every other exception propagates
    to the caller.
    """
    try:
        accessor = getattr(result, method_name)
        value = accessor(**kwargs)
    except AssertionError:
        value = None
    return value

def to_dict(result):
    """
    Flatten a single result record into a plain dictionary.

    Every value is read through give_it_a_go, so a getter that raises
    AssertionError contributes None for its key instead of aborting the
    conversion. Keys are emitted in the order listed below.

    (The .to_dict() method of the Result object does not work as expected.)
    """
    # (output key, getter name on the result, keyword arguments for the getter)
    fields = (
        ('year', 'get_year', {}),
        ('srvy', 'get_srvy', {}),
        ('survey', 'get_survey', {}),
        ('survey_id', 'get_survey_id', {}),
        ('cruise', 'get_cruise', {}),
        ('haul', 'get_haul', {}),
        ('stratum', 'get_stratum', {}),
        ('station', 'get_station', {}),
        ('vessel_name', 'get_vessel_name', {}),
        ('vessel_id', 'get_vessel_id', {}),
        ('date_time', 'get_date_time', {}),
        ('latitude_dd', 'get_latitude', {}),
        ('longitude_dd', 'get_longitude', {}),
        ('species_code', 'get_species_code', {}),
        ('common_name', 'get_common_name', {}),
        ('scientific_name', 'get_scientific_name', {}),
        ('taxon_confidence', 'get_taxon_confidence', {}),
        ('cpue_kgha', 'get_cpue_weight_maybe', {'units': 'kg/ha'}),
        ('cpue_kgkm2', 'get_cpue_weight_maybe', {'units': 'kg/km2'}),
        ('cpue_kg1000km2', 'get_cpue_weight_maybe', {'units': 'kg1000/km2'}),
        ('cpue_noha', 'get_cpue_count_maybe', {'units': 'count/ha'}),
        ('cpue_nokm2', 'get_cpue_count_maybe', {'units': 'count/km2'}),
        ('cpue_no1000km2', 'get_cpue_count_maybe', {'units': 'count1000/km2'}),
        ('weight_kg', 'get_weight_maybe', {'units': 'kg'}),  # changed this from get_weight
        ('count', 'get_count_maybe', {}),  # changed this from get_count
        ('bottom_temperature_c', 'get_bottom_temperature_maybe', {'units': 'c'}),
        ('surface_temperature_c', 'get_surface_temperature_maybe', {'units': 'c'}),
        ('depth_m', 'get_depth', {'units': 'm'}),
        ('distance_fished_km', 'get_distance_fished', {'units': 'km'}),
        ('net_width_m', 'get_net_width', {'units': 'm'}),
        ('net_height_m', 'get_net_height', {'units': 'm'}),
        ('area_swept_ha', 'get_area_swept', {'units': 'ha'}),
        ('duration_hr', 'get_duration', {'units': 'hr'}),
    )
    return {
        key: give_it_a_go(result, getter, **getter_kwargs)
        for key, getter, getter_kwargs in fields
    }

In [None]:
# Survey years to process: 1982 through 2024 inclusive.
years = [survey_year for survey_year in range(1982, 2025)]
print(len(years))

In [None]:
# Download each survey year and cache it as a gzipped CSV (one file per
# year). Years whose cache file already exists are skipped, so re-running
# this cell only fetches what is still missing.
os.makedirs('data', exist_ok=True)  # the cache directory may not exist yet

for year in tqdm(years):
    file_path = f'data/raw_year_{year}.gz'
    if os.path.exists(file_path):
        continue

    # Write to a scratch file and rename it afterwards: a write that fails
    # midway must not leave a truncated file that the existence check above
    # would treat as a finished download on the next run.
    tmp_path = f'{file_path}.tmp'
    try:
        query = afscgap.Query()
        query.filter_year(eq=year)
        # NOTE(review): presumably this also requests zero-catch records -
        # confirm against the afscgap documentation.
        query.set_presence_only(False)
        results = query.execute()

        rows = [to_dict(result) for result in results]

        data = pd.DataFrame(rows)
        # The scratch name does not end in .gz, so the compression that was
        # previously inferred from the extension has to be requested explicitly.
        data.to_csv(tmp_path, index=False, compression='gzip')
        os.replace(tmp_path, file_path)
    except Exception as e:
        # Best effort: report the failing year and carry on with the rest.
        print(f'Failed to fetch data for year {year}')
        print(e)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [None]:
# Spot-check a single cached year: list the distinct taxon_confidence values.
data = pd.read_csv('data/raw_year_1982.gz')
data['taxon_confidence'].unique()

## Species

In [None]:
# Gather the columns needed for species selection from every cached year
# into one long table.
species_columns = ['year', 'scientific_name', 'cpue_kgha', 'taxon_confidence']
dfs = []
for year in tqdm(years):
    try:
        yearly = pd.read_csv(f'data/raw_year_{year}.gz')
    except pd.errors.EmptyDataError:
        # Cache files without any data are skipped.
        continue
    dfs.append(yearly[species_columns])

data = pd.concat(dfs)
data.head()

In [None]:
# Number of distinct scientific names across all loaded years.
data['scientific_name'].nunique()

In [None]:
# For every species, compute the share of its positive-catch records whose
# taxon_confidence is 'High' or 'Moderate'.
positive = data[data['cpue_kgha'] > 0]
# Build a new frame with .assign instead of writing columns onto the filtered
# slice of `data` (which triggers SettingWithCopyWarning). A missing
# confidence is not in the list, so it counts as "not confident" without
# needing to be filled in first.
positive = positive.assign(
    taxon_confidence=positive['taxon_confidence'].isin(['High', 'Moderate'])
)
df = (
    # records per (species, confident?) pair ...
    positive.groupby(['scientific_name', 'taxon_confidence']).size().reset_index().rename(columns={0: 'count'}).sort_values('count', ascending=False)
    # ... next to the total number of records per species
    .merge(positive.groupby(['scientific_name']).size().reset_index().rename(columns={0: 'total_count'}), on='scientific_name')
    .assign(percentage=lambda x: x['count'] / x['total_count'])
    .sort_values('percentage', ascending=False)
)
# Keep only the "confident" rows: `percentage` is now the confident share.
# Species without a single confident record drop out here.
df = df[df['taxon_confidence']]

In [None]:
# Display the per-species confidence table.
df

In [None]:
# Empirical CDF of the `percentage` column across species.
px.ecdf(df, x="percentage")

In [None]:
# Rows for the species whose `percentage` is at least 0.95.
df[df['scientific_name'].isin(df[df['percentage'] >= 0.95]['scientific_name'].unique())]

In [None]:
# Species whose `percentage` reaches the 0.95 cut-off.
meets_cutoff = df['percentage'] >= 0.95
species = set(df.loc[meets_cutoff, 'scientific_name'])
len(species)

In [None]:
# Keep only names that contain a space and do not contain 'sp.'.
high_confidence_species = {
    name
    for name in species
    if ' ' in name and 'sp.' not in name
}
len(high_confidence_species)

In [None]:
# Build the table of distinct hauls, identified by
# (year, station, stratum, haul), from every cached year.
haul_columns = ['year', 'station', 'stratum', 'haul']
dfs = []
for year in tqdm(years):
    try:
        yearly = pd.read_csv(f'data/raw_year_{year}.gz')
    except pd.errors.EmptyDataError:
        # Cache files without any data are skipped.
        continue
    dfs.append(yearly[haul_columns])

hauls = pd.concat(dfs).drop_duplicates()
hauls.head()

In [None]:
# (number of distinct hauls, number of identifying columns)
hauls.shape

In [None]:
# Number of distinct station identifiers among the hauls.
hauls['station'].nunique()

In [None]:
# Count the positive-catch records of each high-confidence species and
# express the count as a fraction of the number of distinct hauls.
is_positive = data['cpue_kgha'] > 0
is_trusted = data['scientific_name'].isin(high_confidence_species)
df = (
    data[is_positive & is_trusted]
    .groupby(['scientific_name'])
    .size()
    .reset_index()
    .rename(columns={0: 'count'})
    .sort_values('count', ascending=False)
)
df['occurrence_likelihood'] = df['count'] / hauls.shape[0]
df.head()

In [None]:
# Empirical CDF of the per-species occurrence likelihood.
px.ecdf(df, x="occurrence_likelihood")

In [None]:
# Species whose occurrence likelihood is at least 0.05.
frequent_enough = df['occurrence_likelihood'] >= 0.05
chosen_species = set(df.loc[frequent_enough, 'scientific_name'])
len(chosen_species)

## Pulling the Columns We Care About

In [None]:
# Load one cached year to look at the available columns.
data = pd.read_csv('data/raw_year_1982.gz')
data.head()

In [None]:
# Column names of the raw table.
data.columns

In [None]:
# Summary statistics of the number of distinct `survey` values per stratum.
data.groupby(['stratum'])['survey'].nunique().describe()

In [None]:
# Number of distinct strata in this year.
data['stratum'].nunique()

In [None]:
# Columns carried forward into the processed data set; the same list is used
# to subset every yearly file later on.
columns = [
    'year', 'haul', 'station', # identifies the haul
    'srvy', 'stratum', # identifies the survey area
    'distance_fished_km', 'duration_hr', # haul speed information
    'surface_temperature_c', 'bottom_temperature_c', # temperature information,
    'depth_m', # depth information
    'scientific_name', 'cpue_kgha', # CPUE information
]
# Preview the selection on the year loaded above.
data[columns]

In [None]:
# Re-read every cached year, keeping only the selected columns and the
# positive-catch records of the chosen species.
dfs = []
for year in tqdm(years):
    try:
        yearly = pd.read_csv(f'data/raw_year_{year}.gz')
    except pd.errors.EmptyDataError:
        # Cache files without any data are skipped.
        continue
    yearly = yearly[columns]
    is_chosen = yearly['scientific_name'].isin(chosen_species)
    is_positive = yearly['cpue_kgha'] > 0
    dfs.append(yearly[is_chosen & is_positive])

data = pd.concat(dfs).drop_duplicates()
data.head()

In [None]:
# Expand to one row per (haul, chosen species): a haul in which a species has
# no positive record gets a CPUE of 0.0 for that species.
haul_keys = ['year', 'station', 'stratum', 'haul']

# One row of haul-level attributes per haul.
haul_info = (
    data.drop(columns=['scientific_name', 'cpue_kgha'])
    .drop_duplicates(haul_keys)
)
# Every chosen species, to be paired with every haul.
species_frame = pd.DataFrame({'scientific_name': list(chosen_species)})
# The recorded catches, at most one per (haul, species).
catches = (
    data[haul_keys + ['scientific_name', 'cpue_kgha']]
    .drop_duplicates(haul_keys + ['scientific_name'])
)

df = (
    haul_info
    .merge(species_frame, how='cross')
    .merge(catches, how='left')  # joins on the shared haul keys + scientific_name
)
df['cpue_kgha'] = df['cpue_kgha'].fillna(0.0)
print(df.shape)
df.head()

In [None]:
# Persist the zero-filled long table (written without the index).
df.to_csv('data/processed.gz', index=False)

## Investigating the Data

In [None]:
# Drop the large intermediate frames; the next cell reloads what is needed from disk.
del df 
del hauls 
del data

In [None]:
# Reload the processed long table from disk.
data = pd.read_csv('data/processed.gz')
print(data.shape)
data.head()

In [None]:
# Histogram of distance fished on a sample of at most 10,000 rows. The sample
# is seeded so the figure is identical on every run, and its size is capped at
# the table length so a small table does not raise.
px.histogram(data['distance_fished_km'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Speed over the tow: distance fished (km) divided by duration (hr).
data['kmh'] = data['distance_fished_km'] / data['duration_hr']
# Seeded sample of at most 10,000 rows so the figure is reproducible and a
# small table does not raise.
px.histogram(data['kmh'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Histogram of haul duration on a seeded sample of at most 10,000 rows
# (reproducible, and safe on tables smaller than the sample size).
px.histogram(data['duration_hr'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Histogram of surface temperature on a seeded sample of at most 10,000 rows
# (reproducible, and safe on tables smaller than the sample size).
px.histogram(data['surface_temperature_c'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Histogram of bottom temperature on a seeded sample of at most 10,000 rows
# (reproducible, and safe on tables smaller than the sample size).
px.histogram(data['bottom_temperature_c'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Histogram of depth on a seeded sample of at most 10,000 rows
# (reproducible, and safe on tables smaller than the sample size).
px.histogram(data['depth_m'].sample(n=min(10000, len(data)), random_state=0))

In [None]:
# Share of rows with a missing value, per haul-level column.
for column in ['surface_temperature_c', 'bottom_temperature_c', 'stratum', 'depth_m', 'duration_hr']:
    print(f'{column}:')
    # isna() works for any dtype (np.isnan raises on non-numeric columns), and
    # the mean of the boolean mask is the fraction of missing rows.
    print(data[column].isna().mean())


In [None]:
# Drop rows that lack any of the required haul-level measurements, printing
# the table shape before and after.
required_columns = ['surface_temperature_c', 'bottom_temperature_c', 'depth_m', 'duration_hr']
print(data.shape)
data = data.dropna(subset=required_columns)
print(data.shape)

In [None]:
# Haul-level table: identifying keys plus the environmental measurements,
# with exact duplicate rows removed.
base = data[['year', 'srvy', 'station', 'stratum', 'haul', 'depth_m', 'duration_hr', 'surface_temperature_c', 'bottom_temperature_c']].drop_duplicates()
base

In [None]:
# Shape after de-duplicating on the haul keys only; compare with `base` above
# to see whether any haul key occurs with more than one set of measurements.
base.drop_duplicates(['year', 'srvy', 'station', 'stratum', 'haul']).shape

In [None]:
# Reshape from long to wide: add one CPUE column per species, named after the
# species, to the haul-level table.
haul_keys = ['year', 'srvy', 'station', 'stratum', 'haul']
for species in tqdm(list(data['scientific_name'].unique())):
    if species in base.columns:
        # Already merged by an earlier run of this cell; merging again would
        # create suffixed duplicate columns, so the cell stays re-runnable.
        continue
    df = data[data['scientific_name'] == species][haul_keys + ['cpue_kgha']]
    df = df.rename({'cpue_kgha': species}, axis=1)
    # Inner join: hauls without a row for this species are dropped from base.
    base = base.merge(df, on=haul_keys, how='inner')

In [None]:
# Shape and first rows of the wide table.
print(base.shape)
base.head()

In [None]:
# Persist the wide table (written without the index).
base.to_csv('data/processed_final.gz', index=False)

In [None]:
# Reload the wide table from disk.
data = pd.read_csv('data/processed_final.gz')

In [None]:
# Distinct `srvy` codes present in the final table.
data['srvy'].unique()

## Plots

In [None]:
# Collect one coordinate pair per (survey, station) across all cached years;
# the first occurrence encountered is the one that is kept.
station_key = ['survey', 'station']
dfs = []
for year in tqdm(years):
    try:
        yearly = pd.read_csv(f'data/raw_year_{year}.gz')
    except pd.errors.EmptyDataError:
        # Cache files without any data are skipped.
        continue
    yearly = yearly[['survey', 'station', 'longitude_dd', 'latitude_dd']]
    dfs.append(yearly.drop_duplicates(station_key))

data = pd.concat(dfs).drop_duplicates(station_key)
data.head()

In [None]:
# Map of one point per (survey, station), coloured by survey.
px.scatter_geo(
    data[['survey', 'station', 'latitude_dd', 'longitude_dd']].drop_duplicates(['survey', 'station']),
    lat='latitude_dd',
    lon='longitude_dd',
    color='survey',
    title='Stations by Survey'
)

In [None]:
# Load the 2024 cache, keeping the first record of each (survey, station).
data = pd.read_csv('data/raw_year_2024.gz').drop_duplicates(['survey', 'station'])
print(data.shape)
data.head()

In [None]:
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import pandas as pd

# Features to map, one panel per feature
features = ['depth_m', 'surface_temperature_c', 'bottom_temperature_c']
num_features = len(features)

# Map extent, shared by all panels. The longitude range uses only the
# negative longitudes; latitude uses every row.
# NOTE(review): presumably this keeps positive-longitude stations from
# stretching the map - confirm. Adjust the extent based on your dataset.
negative_lon = data.loc[data['longitude_dd'] < 0, 'longitude_dd']
map_extent = [
    negative_lon.min(),
    negative_lon.max(),
    data['latitude_dd'].min(),
    data['latitude_dd'].max(),
]

# One Cartopy (PlateCarree) map per feature, stacked vertically
fig, axes = plt.subplots(
    num_features, 1,
    figsize=(18, 6),
    subplot_kw={'projection': ccrs.PlateCarree()},
)

for ax, feature in zip(axes, features):
    # Base map: coastlines, land and borders
    ax.set_extent(map_extent)
    ax.add_feature(cfeature.COASTLINE)
    ax.add_feature(cfeature.LAND, edgecolor='black', facecolor='lightgray')
    ax.add_feature(cfeature.BORDERS, linestyle=':')

    # Station points coloured by the feature, each panel on its own colour scale
    points = ax.scatter(
        data['longitude_dd'],
        data['latitude_dd'],
        c=data[feature],
        cmap='viridis',
        s=10,
        transform=ccrs.PlateCarree(),
    )

    # The colorbar doubles as the panel label
    colorbar = plt.colorbar(points, ax=ax, orientation="vertical", fraction=0.046, pad=0.04)
    colorbar.set_label(feature)

# Adjust layout for better spacing
plt.tight_layout()
plt.show()

In [None]:
# Switch back to the wide processed table for the remaining plots.
data = pd.read_csv('data/processed_final.gz')
print(data.shape)
data.head()

In [None]:
# Histogram of every entry in `features`, one panel per feature, side by side.
fig, axes = plt.subplots(1, num_features, figsize=(18, 6))

for ax, feature in zip(axes, features):
    ax.hist(data[feature])
    ax.set_title(feature)

plt.tight_layout()
plt.show()

In [None]:
# Rank the species columns by the fraction of rows whose CPUE is not zero,
# then pick the lowest, the middle and the highest as species of interest.
# Species columns start at position 9, after the nine haul-level columns.
n_rows = data.shape[0]
species = sorted(
    ((1 - data[data[name] == 0].shape[0] / n_rows), name)
    for name in data.columns[9:]
)
soi = species[0], species[len(species) // 2], species[-1]
soi

In [None]:
# Histogram of the positive CPUE values, fourth-root transformed, for each
# species of interest; the title carries the species' non-zero fraction.
fig, axes = plt.subplots(1, len(soi), figsize=(18, 6))

for ax, (prop, species) in zip(axes, soi):
    positive_cpue = data[data[species] > 0][species]
    ax.hist(positive_cpue ** (1/4))
    ax.set_title(f'{species} ({prop:.2f})')

plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the fourth-root positive CPUE for a single species.
((data[data['Albatrossia pectoralis'] > 0]['Albatrossia pectoralis']) ** (1/4)).describe()

In [None]:
# Per-species summary: non-zero fraction, mean positive CPUE and the standard
# deviation of the fourth-root positive CPUE. Species columns start at
# position 9, after the haul-level columns.
rows = []
for species in data.columns[9:]:
    cpue = data[species]
    positive_cpue = cpue[cpue > 0]
    rows.append({
        'species': species,
        # stored as a fraction (0-1) despite the key name
        'nonzero_percentage': 1 - data[cpue == 0].shape[0] / data.shape[0],
        'cpue_kgha_mean': positive_cpue.mean(),
        'cpue_kgha_4th_root_std': (positive_cpue ** (1/4)).std(),
    })

df = pd.DataFrame(rows).sort_values('species')
df