### Load packages and data files from previous notebook

In [1]:
## Imports
from scipy import io
import pandas as pd
import numpy as np
import os
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## Load the interim CSV files
file_path = r'C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\interim'

# One path per dataset, all located in the interim data folder
AMO_file = fr'{file_path}\AMO_data.csv'
biomass_file = fr'{file_path}\biomass_data.csv'
ecosystem_file = fr'{file_path}\ecosystem_data.csv'
landings_file = fr'{file_path}\landings_data.csv'
ocean_temp_file = fr'{file_path}\ocean_temp_data.csv'

# Every file carries an 'Unnamed: 0' column (presumably the index written when the CSV
# was saved -- confirm against the previous notebook); it is dropped right after reading.
AMO_df, biomass_df, ecosystem_df, landings_df, ocean_temp_df = (
    pd.read_csv(csv_file).drop(columns=['Unnamed: 0'])
    for csv_file in (AMO_file, biomass_file, ecosystem_file, landings_file, ocean_temp_file)
)

### Generate df reports and save to files

In [25]:
def make_html_report(df, name,
                     report_dir=r"C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\reports"):
    """Profile a DataFrame and write the profile to an HTML file.

    Parameters
    ----------
    df : object exposing ``profile_report``
        The frame to profile. ``profile_report`` is called with ``sort=None``,
        a full-width HTML style and the progress bar enabled.
    name : str
        Dataset label; used as the file-name prefix (``{name}_data_report.html``)
        and in the completion message.
    report_dir : str, optional
        Directory the HTML file is written to. Defaults to the project's
        ``reports`` folder, so existing two-argument calls behave as before.

    Returns
    -------
    None
        The report is written to disk and ``'{name} done!'`` is printed.
    """
    report = df.profile_report(sort=None, html={'style': {'full_width': True}}, progress_bar=True)
    # os.path.join uses the platform separator, so the default Windows folder
    # still resolves to the same path on Windows.
    report.to_file(os.path.join(report_dir, f"{name}_data_report.html"))
    print(f'{name} done!')

# Generate one profiling report per dataset, in the order the frames were loaded.
for report_name, frame in (
    ('AMO', AMO_df),
    ('biomass', biomass_df),
    ('ecosystem', ecosystem_df),
    ('landings', landings_df),
    ('ocean_temp', ocean_temp_df),
):
    make_html_report(frame, report_name)

Summarize dataset: 100%|██████████| 195/195 [01:07<00:00,  2.88it/s, Completed]                   
Generate report structure: 100%|██████████| 1/1 [00:12<00:00, 12.93s/it]
Render HTML: 100%|██████████| 1/1 [00:11<00:00, 11.30s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 19.61it/s]


AMO done!


Summarize dataset: 100%|██████████| 153/153 [00:52<00:00,  2.90it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:14<00:00, 14.12s/it]
Render HTML: 100%|██████████| 1/1 [00:09<00:00,  9.45s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 17.54it/s]


biomass done!


Summarize dataset: 100%|██████████| 18/18 [00:01<00:00, 14.10it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.28s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 111.11it/s]


ecosystem done!


Summarize dataset: 100%|██████████| 36/36 [00:06<00:00,  5.85it/s, Completed]                                             
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.83s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.52s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 71.44it/s]


landings done!


Summarize dataset: 100%|██████████| 55/55 [00:11<00:00,  4.59it/s, Completed]                                                                         
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.37s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 50.01it/s]

ocean_temp done!





## Data definitions:

### In Biomass:
biomass in tons per square kilometer (t/km^2).
biomass_index in tons.
abundance in number per square kilometer.
abundance_index in number.

### In Landings:
landings_biomass in tons per square kilometer (t/km^2).
landings_biomass_index in tons.
landings_abundance in number per square kilometer.
landings_abundance_index in number.

## Decisions made for data cleaning:

Change nothing in the ecosystem or ocean temperature data. Annualize the AMO data (instead of keeping it month-by-month) and save it to AMO_ann_data.csv. All ecosystems in landings_data are also found in biomass_data. Combine the biomass and landings DataFrames by renaming the landings columns to match the biomass columns and dropping the biomass columns that have no landings counterpart (the SE and units columns).

In [3]:
# Distinct species and ecosystem labels of each dataset, in sorted order
biomass_spec = sorted(set(biomass_df['species'].values))
biomass_loc = sorted(set(biomass_df['ecosystem'].values))
landings_spec = sorted(set(landings_df['species'].values))
landings_loc = sorted(set(landings_df['ecosystem'].values))

# Show the species labels of both datasets for comparison
print(biomass_spec)
print(landings_spec)

['AFRICAN_POMPANO', 'ALEWIFE', 'AMERICAN_EEL', 'AMERICAN_LOBSTER', 'AMERICAN_SAND_LANCE', 'AMERICAN_SHAD', 'ATLANTIC_ANGEL_SHARK', 'ATLANTIC_ARGENTINE', 'ATLANTIC_BONITO', 'ATLANTIC_COD', 'ATLANTIC_CROAKER', 'ATLANTIC_CUTLASSFISH', 'ATLANTIC_HERRING', 'ATLANTIC_MACKEREL', 'ATLANTIC_MENHADEN', 'ATLANTIC_MOONFISH', 'ATLANTIC_SHARPNOSE_SHARK', 'ATLANTIC_SILVERSIDE', 'ATLANTIC_SPADEFISH', 'ATLANTIC_STURGEON', 'ATLANTIC_THREAD_HERRING', 'ATLANTIC_TOMCOD', 'African_pompano', 'Alewife', 'Alewife_and_Blueback_herring', 'Alewife_and_River_herring', 'American_Shad_1+', 'American_butterfish', 'American_eel', 'American_eel_age-1+', 'American_eel_yoy/larvae', 'American_fourspot_flounder', 'American_lobster', 'American_plaice', 'American_sand_lance', 'American_sand_lance_yoy', 'American_shad', 'Anchovy_spp_yoy-est', 'Anemones', 'Arks', 'Atlantic_Needlefish', 'Atlantic_angel_shark', 'Atlantic_bonito', 'Atlantic_butterfish', 'Atlantic_cod', 'Atlantic_croaker', 'Atlantic_cutlassfish', 'Atlantic_halibut

In [4]:
# The same species appears under two spellings; print the rows behind each label
for species_label in ('ALEWIFE', 'Alewife'):
    print(biomass_df[biomass_df['species'] == species_label])

      ecosystem  species  year  biomass  biomass_SE  abundance  abundance_SE  \
19626        NJ  ALEWIFE  1988      NaN         NaN        NaN           NaN   
19627        NJ  ALEWIFE  1992      NaN         NaN        NaN           NaN   
19628        NJ  ALEWIFE  1993      NaN         NaN        NaN           NaN   
19629        NJ  ALEWIFE  1994      NaN         NaN        NaN           NaN   
19630        NJ  ALEWIFE  1996      NaN         NaN        NaN           NaN   
19631        NJ  ALEWIFE  1997      NaN         NaN        NaN           NaN   
19632        NJ  ALEWIFE  1999      NaN         NaN        NaN           NaN   
19633        NJ  ALEWIFE  2000      NaN         NaN        NaN           NaN   
19634        NJ  ALEWIFE  2001      NaN         NaN        NaN           NaN   
19635        NJ  ALEWIFE  2002      NaN         NaN        NaN           NaN   
19636        NJ  ALEWIFE  2004      NaN         NaN        NaN           NaN   
19637        NJ  ALEWIFE  2005      NaN 

In [5]:
# Column layout of both frames before harmonising them
print(biomass_df.columns)
print(landings_df.columns)

# Standard-error and units columns exist only in the biomass frame
biomass_only_cols = ['biomass_SE', 'abundance_SE', 'biomass_index_SE',
                     'abundance_index_units', 'abundance_index_SE']
# Landings column -> matching biomass column
landings_to_biomass_names = {
    'landings_biomass': 'biomass',
    'landings_abund': 'abundance',
    'landings_biomass_index': 'biomass_index',
    'landings_abund_index': 'abundance_index',
}

# errors='ignore' keeps this cell re-runnable: on a second run the columns are already
# gone and a plain drop would raise KeyError. rename already skips missing keys.
biomass_df = biomass_df.drop(columns=biomass_only_cols, errors='ignore')
landings_df = landings_df.rename(columns=landings_to_biomass_names)

# Both frames must share one column layout before they are concatenated
assert list(biomass_df.columns) == list(landings_df.columns), \
    'biomass and landings columns differ after harmonising'

print(biomass_df.columns)
print(landings_df.columns)

Index(['ecosystem', 'species', 'year', 'biomass', 'biomass_SE', 'abundance',
       'abundance_SE', 'biomass_index', 'biomass_index_SE', 'abundance_index',
       'abundance_index_units', 'abundance_index_SE', 'avg_len', 'avg_mass',
       'source', 'agency', 'season'],
      dtype='object')
Index(['ecosystem', 'species', 'year', 'landings_biomass', 'landings_abund',
       'landings_biomass_index', 'landings_abund_index', 'avg_len', 'avg_mass',
       'source', 'agency', 'season'],
      dtype='object')
Index(['ecosystem', 'species', 'year', 'biomass', 'abundance', 'biomass_index',
       'abundance_index', 'avg_len', 'avg_mass', 'source', 'agency', 'season'],
      dtype='object')
Index(['ecosystem', 'species', 'year', 'biomass', 'abundance', 'biomass_index',
       'abundance_index', 'avg_len', 'avg_mass', 'source', 'agency', 'season'],
      dtype='object')


In [6]:
# Stack the biomass rows on top of the landings rows with a fresh 0..n-1 index
frames_to_stack = [biomass_df, landings_df]
fish_df = pd.concat(frames_to_stack, ignore_index=True)
print(fish_df.tail())

# Shapes of the combined frame and of its two inputs
for frame in (fish_df, biomass_df, landings_df):
    print(frame.shape)

      ecosystem    species  year  biomass  abundance  biomass_index  \
31017        CB  Skates_NS  2003      NaN        NaN           62.6   
31018        CB  Skates_NS  2005      NaN        NaN           38.7   
31019        CB  Skates_NS  2006      NaN        NaN           20.4   
31020        CB  Skates_NS  2007      NaN        NaN           68.4   
31021        CB  Skates_NS  2008      NaN        NaN          265.7   

       abundance_index  avg_len  avg_mass source agency season  
31017              NaN      NaN       NaN     nd     nd     nd  
31018              NaN      NaN       NaN     nd     nd     nd  
31019              NaN      NaN       NaN     nd     nd     nd  
31020              NaN      NaN       NaN     nd     nd     nd  
31021              NaN      NaN       NaN     nd     nd     nd  
(31022, 12)
(26137, 12)
(4885, 12)


In [51]:
# Box plot of biomass per species, saved next to the notebook as mygraph.png
boxplot_ax = sns.boxplot(data=fish_df, x='species', y='biomass')
boxplot_ax.get_figure().savefig('mygraph.png')

In [7]:
# describe() statistics of fish_df, computed separately for every species
species_groups = fish_df.groupby('species')
df_grouped = species_groups.describe()

In [43]:
# Biomass statistics for species with more than one non-null biomass value
df_grouped.loc[df_grouped[('biomass', 'count')] > 1, 'biomass']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alewife,125.0,0.008889,0.015133,0.000000,0.000660,0.002174,0.010226,0.083624
Alewife_and_Blueback_herring,47.0,0.113988,0.330209,0.001290,0.005570,0.012904,0.054777,1.942298
Alewife_and_River_herring,47.0,0.131931,0.162489,0.009031,0.028927,0.078045,0.185788,0.864734
American_butterfish,94.0,1.633931,2.474156,0.003828,0.072104,0.534661,2.287040,13.947598
American_eel,28.0,0.000390,0.000718,0.000000,0.000000,0.000000,0.000460,0.003062
...,...,...,...,...,...,...,...,...
Winter_flounder,570.0,1.097261,3.107601,0.000000,0.007251,0.056222,0.228627,26.789163
Winter_skate,122.0,0.008913,0.030120,0.000000,0.000000,0.000000,0.009445,0.302082
Witch_flounder,366.0,0.084340,0.201423,0.000016,0.002042,0.010188,0.044681,1.355565
Yellow_jack,14.0,0.000225,0.000254,0.000041,0.000065,0.000083,0.000347,0.000842


In [45]:
# Biomass-index statistics for species with more than one non-null biomass_index value
df_grouped.loc[df_grouped[('biomass_index', 'count')] > 1, 'biomass_index']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AFRICAN_POMPANO,9.0,0.004406,0.005221,0.000128,0.001299,0.002532,0.004810,0.016667
ALEWIFE,58.0,1.344170,2.032481,0.000128,0.070963,0.766315,1.405167,10.695333
AMERICAN_EEL,16.0,0.012215,0.014853,0.000127,0.001373,0.007523,0.017250,0.050000
AMERICAN_LOBSTER,66.0,0.172386,0.165221,0.004487,0.050669,0.126192,0.217821,0.774000
AMERICAN_SAND_LANCE,60.0,0.359361,0.718628,0.000385,0.007191,0.033077,0.335694,4.067342
...,...,...,...,...,...,...,...,...
Winter_skate,29.0,0.910360,2.499829,0.001250,0.002500,0.008750,0.615833,10.937500
YELLOWTAIL_FLOUNDER,28.0,0.057391,0.065300,0.000147,0.005090,0.032773,0.075699,0.236282
YELLOW_JACK,3.0,0.006311,0.007959,0.001389,0.001720,0.002051,0.008772,0.015493
Yellow_jack,17.0,3.358627,2.995818,0.632500,0.919167,2.208333,5.900000,8.803750


In [44]:
# Abundance statistics for species with more than one non-null abundance value
df_grouped.loc[df_grouped[('abundance', 'count')] > 1, 'abundance']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alewife,148.0,301.216633,694.925745,0.000000,26.579554,83.739146,233.526638,6092.022880
Alewife_and_Blueback_herring,47.0,355.626415,1030.204615,4.025834,17.378184,40.258337,170.896625,6059.684655
Alewife_and_River_herring,47.0,241.140567,296.993085,16.505913,52.872609,142.648705,339.579058,1580.541845
American_butterfish,94.0,4739.858396,7019.347353,12.345891,218.804009,1595.638720,6973.112502,38714.020446
American_eel,39.0,0.855181,1.787035,0.000000,0.000000,0.000000,0.941119,7.866394
...,...,...,...,...,...,...,...,...
Windowpane_flounder,52.0,1379.397184,1923.602544,124.569740,341.700286,611.089055,1606.945526,10109.170725
Winter_flounder,242.0,3429.825247,5241.551951,0.000000,451.479101,1578.364823,4137.275464,35028.102714
Winter_skate,141.0,9.304113,21.797446,0.000000,0.000000,0.000000,8.802543,177.140501
Yellow_jack,17.0,3.673120,3.912566,0.412619,0.825238,2.475715,4.538811,13.203815


In [46]:
# Abundance-index statistics for species with more than one non-null abundance_index value
df_grouped.loc[df_grouped[('abundance_index', 'count')] > 1, 'abundance_index']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AFRICAN_POMPANO,8.0,0.021450,0.015046,0.012658,0.012821,0.013064,0.025655,0.055833
ALEWIFE,54.0,21.789962,36.293860,0.012821,0.731074,9.713808,23.388426,221.710833
AMERICAN_EEL,16.0,0.039129,0.026731,0.012658,0.022993,0.033333,0.044872,0.101266
AMERICAN_LOBSTER,56.0,0.742649,1.059006,0.025641,0.199736,0.382491,0.877810,6.123309
AMERICAN_SAND_LANCE,60.0,38.784830,69.946856,0.025641,0.554532,5.051282,45.644824,295.780323
...,...,...,...,...,...,...,...,...
white_mullet,8.0,0.082387,0.076816,0.004739,0.022351,0.051658,0.145833,0.215686
white_perch,4.0,0.139729,0.106834,0.034286,0.055446,0.139093,0.223376,0.246445
windowpane_flounder,20.0,0.392496,0.393236,0.020833,0.100637,0.270621,0.453125,1.479167
winter_flounder,21.0,0.499630,1.242700,0.041667,0.125000,0.187500,0.291667,5.875000


In [49]:
# Average-length statistics for species with more than one non-null avg_len value
df_grouped.loc[df_grouped[('avg_len', 'count')] > 1, 'avg_len']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alewife,72.0,21.359004,41.875662,0.080000,0.118588,0.150333,14.089167,223.250000
American_eel,13.0,310.113606,63.894455,185.051948,268.272727,332.088235,349.722222,395.600000
American_eel_age-1+,23.0,0.293109,0.031717,0.243100,0.262600,0.298600,0.319850,0.342700
American_lobster,64.0,0.654706,0.032740,0.593556,0.631921,0.653732,0.681172,0.724444
American_shad,46.0,35.237498,46.991434,0.085000,0.141250,0.206136,71.454389,173.833333
...,...,...,...,...,...,...,...,...
White_perch_age-1+,22.0,0.152932,0.006576,0.138900,0.148500,0.152150,0.157750,0.168300
Windowpane,63.0,0.220417,0.023413,0.148621,0.207040,0.218846,0.232579,0.280000
Winter_flounder,88.0,21.921224,36.322065,0.169333,0.215108,0.234311,61.797222,130.000000
Winter_skate,6.0,0.531258,0.092704,0.412000,0.457500,0.547357,0.598679,0.637500


In [50]:
# Average-mass statistics for species with more than one non-null avg_mass value
df_grouped.loc[df_grouped[('avg_mass', 'count')] > 1, 'avg_mass']

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alewife,82.0,0.908722,1.967602,0.005867,0.020801,0.053136,0.467059,8.892308
American_eel,7.0,0.463590,0.350786,0.100000,0.216667,0.350000,0.651587,1.058621
American_lobster,110.0,0.392645,0.401283,0.100000,0.212181,0.261831,0.343716,2.415385
American_sand_lance,16.0,0.362373,0.361302,0.147368,0.183462,0.241377,0.325671,1.600000
American_sand_lance_yoy,9.0,0.154598,0.025991,0.112500,0.145000,0.154286,0.172727,0.195000
...,...,...,...,...,...,...,...,...
Windowpane_flounder,25.0,6.761834,5.141829,0.523529,2.400000,5.980172,9.717822,18.508772
Winter_flounder,116.0,2.876949,3.843480,0.039488,0.147563,0.278556,4.747619,15.240816
Winter_skate,35.0,2.038450,2.998286,0.100000,0.100000,0.175000,3.114687,10.937500
Yellow_jack,17.0,3.761670,2.768428,0.991358,1.553521,3.127500,5.974684,8.915190


In [8]:
# Write the combined biomass + landings frame to the interim data folder.
# The index is written as well (to_csv default).
csv_folder = r"C:\Users\natha\Desktop\bootcamp_repo-1\NW_Atlantic_Fishery_Sustainability\data\interim"
fish_csv_path = fr'{csv_folder}\fish_data.csv'
fish_df.to_csv(fish_csv_path)