In [31]:
import pandas as pd
import numpy as np
import pathlib
from pandas.errors import ParserError
from tqdm import tqdm_notebook
from collections import namedtuple

In [2]:
home = pathlib.Path.home()

In [3]:
dataDir = home / "DATA/Beaching/CensusData"

In [32]:
def ParseFile(filepath, beginYear='1999', endYear='2014'):
    try:
        df = pd.read_csv(filepath, header=0, parse_dates=['obs_date'],
                        usecols=np.arange(32), infer_datetime_format=True)
    except ParserError as e:
        print(f'problem parsing {str(filepath)}')
        print(e)
        return None
    
    cols_to_drop=['geom', 'oid', 'id', 'dataset_id', 'series_id', 'tsn', 'sp_code',
                  'provider', 'owner', 'share_policy', 'tprecision', 'lprecision',
                  'depth', 'date_time', 'last_mod', 'oceano', 'resources', 'ds_type',
                  'platform', 'classitem', 'symbol', 'symbolsize', 'datetime_end_utc',
                  'common', 'pam_granularity', 'pam_bin_size']
                  #, 'pam_call_type',
                  #  'publish', 'scientific']
    critter_abrev = str(filepath).split('/')[-1].split('_')[0][:5]
    df.drop(cols_to_drop, axis=1, inplace=True)
    df.drop_duplicates(inplace=True)
    df.set_index('obs_date', inplace=True)
    df.sort_index(inplace=True)
    df = df.loc[beginYear: endYear]
    pickle_name = f'../PklJar/{critter_abrev}_global_census.pkl'
    df.to_pickle(pickle_name)
    return pickle_name

def LoadParseData():
    files = [f for f in dataDir.glob('*.csv')]
    pkl_name_dict = {}
    for file in tqdm_notebook(files):
        fname = str(file).split('/')[-1]
        print(fname)
        pkl_name = ParseFile(file)
        pkl_name_dict[fname.split('.')[0]] = pkl_name
    return pkl_name_dict

def GetRegionalData(path_dict, lat_lon_lims):
    """
    path_dict: dictionary containing a critter-to-dataframe_pickle_path mapping
    lat_lon_lims: namedtuple with lat/lon min/max
    """
    
    for critter,pkl_file_name in path_dict.items():
        with open(pkl_file_name, 'rb') as f:
            df = pd.read_pickle(pkl_file_name)
            # create new column with distance from cape cod
            # Approx. CC_lat/_lon = 41.795275 / -70.368343
            df['dist_2_cape_cod'] = distance((41.795275, -70.368343), df)

In [33]:
pkl_dict  = LoadParseData()

Globiceph_census.csv
Grampus_census.csv
Stenella_census.csv
Lageno_census.csv
Tursiops_census.csv
Delphinus_census.csv



In [27]:
pkl_dict

{'Delphinus_census': '../PklJar/Delph_global_census.pkl',
 'Globiceph_census': '../PklJar/Globi_global_census.pkl',
 'Grampus_census': '../PklJar/Gramp_global_census.pkl',
 'Lageno_census': '../PklJar/Lagen_global_census.pkl',
 'Stenella_census': '../PklJar/Stene_global_census.pkl',
 'Tursiops_census': '../PklJar/Tursi_global_census.pkl'}

In [34]:
df = pd.read_pickle(pkl_dict['Delphinus_census'])

In [36]:
lat_lon_range = namedtuple('lat_lon_range', ['lat_range', 'lon_range'])

In [45]:
lat_lon_range.lat_range=[40, 43]
lat_lon_range.lon_range=[-71, -66]

In [46]:
lat_lon_range.lat_range

[40, 43]

In [55]:
df_cape_cod = df.loc[(df.Delph_latitude >= lat_lon_range.lat_range[0])
                     & (df.Delph_latitude<=lat_lon_range.lat_range[1])
                    & (df.Delph_longitude>=lat_lon_range.lon_range[1])]

In [56]:
df_cape_cod.shape

(1300, 5)

In [42]:
df.shape

(14630, 5)

In [44]:
df.Delph_longitude.min()

-130.72

In [None]:
df.loc

In [35]:
df.loc[(df.Delph_latitude.isin(lat_lon_range.lat_range))

Unnamed: 0_level_0,Delph_row_id,Delph_scientific,Delph_latitude,Delph_longitude,Delph_count
obs_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1999-01-06,1406_8261,Delphinus delphis,44.72561,-1.24422,
1999-01-07,731_7247,Delphinus delphis,50.038124,-5.261319,1.0
1999-01-07,731_7239,Delphinus delphis,50.36701,-4.639627,1.0
1999-01-08,427_859885,Delphinus delphis,62.097,-1.724,15.0
1999-01-08,731_7241,Delphinus delphis,51.101899,-4.2084,1.0
