# Extracting MODIS data at sonar pings

Now we can extract sea surface temperature (SST) and chlorophyll-a concentration from the MODIS data at the spatial location and month of each sonar ping.

In [1]:
import pandas as pd
import datetime
import numpy as np
import os
import rasterio
import multiprocessing
import re

In [2]:
def find_month(dataframe):
    """
    Return the calendar month of the first distinct ping date

    Only the first unique value of "Ping_date" is inspected, so the
    dataframe is expected to hold data from a single date.

    Args:
    - 'dataframe' a pandas dataframe with a string column "Ping_date"
      formatted as YYYY-MM-DD (surrounding whitespace is stripped)

    Returns:
    - month (int) month number parsed from that date
    """
    unique_dates = dataframe.Ping_date.unique()
    first_date = unique_dates[0].strip()
    parsed = datetime.datetime.strptime(first_date, '%Y-%m-%d')
    return parsed.month

In [3]:
def get_raster_path(dataframe, var):
    """
    Construct a path to the monthly 2013 MODIS raster file for a variable

    Args:
    - dataframe: a pandas dataframe with column "Ping_date"
    - var: (string) one of "chlor_a" or "sst"

    Returns:
    - (string) path of the form data/modis/<mon>_2013_<var>.nc, where
      <mon> is the three-letter month abbreviation

    Raises:
    - ValueError if var is not "chlor_a" or "sst"
    - KeyError if the month of the data is outside May-September
    """
    # Validate with an explicit exception rather than `assert`, which is
    # skipped when Python runs with -O; check before touching the dataframe.
    if var not in ('chlor_a', 'sst'):
        raise ValueError(
            "var must be 'chlor_a' or 'sst', got %r" % (var,)
        )
    month_dict = {
        '5': 'may',
        '6': 'jun',
        '7': 'jul',
        '8': 'aug',
        '9': 'sep',
    }
    month = find_month(dataframe)
    fname = month_dict[str(month)] + "_2013_" + var + '.nc'
    return os.path.join('data', 'modis', fname)

In [4]:
def extract_raster(dataframe, path, newcol):
    """
    Sample a raster at each row's coordinates and store the values

    Args:
    - 'dataframe': a pandas dataframe with "Longitude" and "Latitude"
    - 'path': a file path to a raster with values to extract
    - 'newcol': (string) name of the new column; it is also used as the
      last component of the "NETCDF:<path>:<newcol>" name that is opened

    Returns:
    - the same dataframe, modified in place, with a new column (newcol)
      containing the raster values as float32 (missing values are NaN)
    """
    dataset_name = ":".join(['NETCDF', path, newcol])
    with rasterio.open(dataset_name) as src:
        lon = dataframe.Longitude.values
        lat = dataframe.Latitude.values
        # sample() is lazy, so it must be consumed while the file is open
        samples = list(src.sample(zip(lon, lat)))
    values = np.concatenate(samples, axis=0).astype(np.float32)
    is_missing = values == -32767.0    # -32767 is a missing data value
    values[is_missing] = np.nan
    dataframe[newcol] = values
    return dataframe

In [5]:
def augment_df(path, variables=("sst", "chlor_a")):
    """
    Augment a sonar dataframe with chlorophyll and/or SST columns

    Args:
    - path: path to CSV file with columns "Longitude", "Latitude" and
      "Ping_date"
    - variables: iterable with elements "chlor_a" and/or "sst"

    Returns:
    - a dataframe with one new column per requested variable; the
      "Unnamed: 0" column is removed if the CSV contained one
    """
    # Factor applied to the raw sst raster values.
    # NOTE(review): presumably the raster's scale factor -- confirm.
    sst_scale = 0.0049999999

    dataframe = pd.read_csv(path)
    for var in variables:
        raster_path = get_raster_path(dataframe, var)
        dataframe = extract_raster(dataframe, raster_path, var)
        # Rescale only when sst was requested; scaling unconditionally
        # raised KeyError whenever "sst" was left out of `variables`.
        if var == "sst":
            dataframe[var] *= sst_scale
    # Tolerate CSVs that were written without an index column.
    dataframe = dataframe.drop(columns=['Unnamed: 0'], errors='ignore')
    return dataframe

Having defined some helper functions, we can generate a list of paths to sonar CSV files:

In [6]:
# Collect every file in data/summaries, in sorted order, as a full path
summary_dir = os.path.join('data', 'summaries')
csv_files = sorted(os.listdir(summary_dir))
full_csv_paths = [os.path.join(summary_dir, name) for name in csv_files]

Next, we can use `multiprocessing` to extract the SST and chlorophyll a data for each sonar CSV file in parallel:

In [7]:
# Augment each CSV in a worker process, then stack the per-file results
# into one dataframe (pool.map already returns a list, in input order)
with multiprocessing.Pool() as pool:
    out = pd.concat(pool.map(augment_df, full_csv_paths))

Let's peek at the shape and first rows of the final output (one big data frame):

In [8]:
# Dimensions of the combined dataframe as (rows, columns)
out.shape

(12012632, 11)

In [9]:
# Show the first five rows, including the new sst and chlor_a columns
out.head()

Unnamed: 0,Latitude,Longitude,Ping_date,Ping_time,Ping_milliseconds,wavelength,mean,potential_counts,return_counts,sst,chlor_a
0,47.622585,-122.406807,2013-05-22,15:01:24,948.0,Sv_120,-67.359286,1315,41.0,,
1,47.622585,-122.406807,2013-05-22,15:01:24,948.0,Sv_38,-66.622249,1315,11.0,,
2,47.622585,-122.406784,2013-05-22,15:01:23,448.0,Sv_120,-66.714195,1315,41.0,,
3,47.622585,-122.406784,2013-05-22,15:01:23,448.0,Sv_38,-65.572092,1315,19.0,,
4,47.622585,-122.406754,2013-05-22,15:01:21,948.0,Sv_120,-66.179431,1315,37.0,,


And last, we will save this as a CSV file.

In [10]:
# Write the combined dataframe to disk; with pandas defaults the
# dataframe index is written as the first column
output_path = os.path.join('data', 'sonar-with-modis.csv')
out.to_csv(output_path)