In [88]:
# load packages
import numpy as np
from IPython.display import SVG
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow

%matplotlib inline
from pathlib import Path
import pandas as pd
import pickle
import glob

# Directories are given relative to the notebook's working directory.
repo = Path('../')
datadir = Path('../raw_processed_data')

# Read the pickled dataframe that every later cell works from.
validated_pkl = datadir / 'raw_processed_validated.pkl'
df = pd.read_pickle(validated_pkl)

### Create input data matrix

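Shape of `data`: `(m, n_channel, 2*n_adj + 1, 37)`, where `m` is the total number of windows over all lines, `n_channel` is 4 when the standard deviations are included (2 otherwise), `2*n_adj + 1` is the number of soundings per window, and 37 is the number of values per sounding after padding.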


In [89]:
def create_data_matrix_byline(df,n_adj=1,include_std=True):
    """Build a windowed data matrix and label vector, one line at a time.

    For each value in ``df.LINE_NO`` a window is slid over that line's rows in
    steps of two rows. A window spans ``4*n_adj + 2`` consecutive rows of the
    line: the rows with ``CHANNEL_NO == 2`` (``hm_*`` below) and, for each of
    them, the row whose index label is one larger (``lm_*`` below). The
    ``lm_*`` values are right-padded with 9999.0 so both have 37 columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``LINE_NO``, ``CHANNEL_NO`` and ``VALID`` columns plus value
        columns whose names start with ``DBDT_Ch1`` / ``DBDT_Ch2`` (and
        ``DBDT_STD_Ch1`` / ``DBDT_STD_Ch2`` when ``include_std`` is true).
        Assumes each line's rows alternate channel 2, channel 1 (starting with
        channel 2) and that the two rows of a pair carry consecutive integer
        index labels -- TODO confirm against the upstream processing.
    n_adj : int, default 1
        Number of neighbouring soundings taken on each side of the centre one.
    include_std : bool, default True
        If true the ``DBDT_STD_*`` columns are stacked as two extra channels.

    Returns
    -------
    data : numpy.ndarray of float, shape (m, n_channel, 2*n_adj + 1, 37)
        Channel order is hm, lm (padded), then hm std, lm std (padded) when
        ``include_std`` is true.
    labels : numpy.ndarray of bool, shape (m,)
        ``VALID`` of each window's centre row.

    Notes
    -----
    The row offsets are relative to the start of each line, so the window and
    the label are taken positionally from that line's rows. Indexing the full
    frame with those offsets would return rows from the beginning of ``df``
    for every line instead of rows from the line being processed.

    If a window's ``lm_*`` block cannot be assembled, diagnostics are printed
    and the rest of that line is skipped; the unfilled trailing entries of the
    returned arrays then stay at zero / False.
    """
    lm_std_cols = [c for c in df.columns if c.startswith('DBDT_STD_Ch1')]
    lm_cols = [c for c in df.columns if c.startswith('DBDT_Ch1')]
    hm_std_cols = [c for c in df.columns if c.startswith('DBDT_STD_Ch2')]
    hm_cols = [c for c in df.columns if c.startswith('DBDT_Ch2')]
    n_channel = 4 if include_std else 2

    n_gates = 37          # width of the last axis of the output
    n_lm_pad = 9          # columns appended to the lm_* blocks to reach n_gates
    pad_value = 9999.0    # filler used for the appended columns
    half_width = n_adj*2  # rows on each side of the centre row
    n_soundings = n_adj*2+1

    def center_rows(n_rows):
        # Positional offsets (within one line) of the window centre rows.
        return np.arange(half_width,n_rows-half_width,2)

    #initialize data and label arrays by counting the windows of every line
    line_numbers = df.LINE_NO.unique()
    m_all = 0
    for line in line_numbers:
        m_all += len(center_rows(int((df.LINE_NO==line).sum())))
    data = np.zeros((m_all,n_channel,n_soundings,n_gates),dtype=float)
    labels = np.zeros(m_all,dtype=bool)

    i=0
    for line in line_numbers:
        df_line = df.loc[df.LINE_NO==line]
        all_rows = center_rows(len(df_line))
        m = len(all_rows)
        print('Line no: {}\t iterations: {}'.format(line,m))
        for row in all_rows:
            #get the correct soundings: `row` is an offset into this line, so
            #slice the line positionally (iloc excludes the end, hence +2)
            df_tmp = df_line.iloc[row-half_width:row+half_width+2]
            HMind = df_tmp.index[(df_tmp.CHANNEL_NO==2).values]
            LMind = HMind+1

            #Create data matrix
            padding = pad_value*np.ones((n_soundings,n_lm_pad))
            try:
                lm_data_padded = np.c_[df_tmp.loc[LMind,lm_cols].values,
                                       padding]
            except Exception:
                print('iteration {} \n'
                      'rows {} \n'
                      'HMind {} \n'
                      'LMind {} \n'
                      'lm_cols {} \n'.format(i,row,HMind,LMind,lm_cols))
                break
            hm_data = df_tmp.loc[HMind,hm_cols].values

            if include_std:
                lm_std_padded = np.c_[df_tmp.loc[LMind,lm_std_cols].values,
                                      padding]
                hm_std = df_tmp.loc[HMind,hm_std_cols].values
                data[i] = np.stack((hm_data,lm_data_padded,hm_std,lm_std_padded))
            else:
                data[i] = np.stack((hm_data,lm_data_padded))
            #label comes from the centre row of this line's window
            labels[i] = df_line.VALID.iloc[row]
            i+=1
    return data,labels

### Create and save a data matrix for 1-4 adjacent soundings

In [91]:
# Build the matrices for n_adj = 1..4 and write each (data, labels) pair
# into datadir as .npy files named after n_adj and include_std.
include_std=True
for n_adj in np.arange(1,5):
    print('creating data for n_adj={}'.format(n_adj))
    data,labels = create_data_matrix_byline(df,n_adj=n_adj,include_std=include_std)
    outputs = (
        ('data_nadj{}_std{}.npy', data),
        ('labels_nadj{}_std{}.npy', labels),
    )
    for template, array in outputs:
        np.save(datadir / template.format(n_adj,include_std), array)


creating data for n_adj=1
Line no: 100101	 iterations: 161
Line no: 100201	 iterations: 192
Line no: 100701	 iterations: 316
Line no: 100801	 iterations: 104
Line no: 100202	 iterations: 118
Line no: 100702	 iterations: 73
Line no: 100802	 iterations: 129
Line no: 100601	 iterations: 183
Line no: 100501	 iterations: 195
Line no: 100401	 iterations: 147
Line no: 200501	 iterations: 462
Line no: 200401	 iterations: 247
Line no: 100301	 iterations: 192
Line no: 200601	 iterations: 422
Line no: 710501	 iterations: 423
Line no: 710401	 iterations: 674
Line no: 710301	 iterations: 948
Line no: 101501	 iterations: 603
Line no: 200101	 iterations: 66
Line no: 101601	 iterations: 271
Line no: 101602	 iterations: 288
Line no: 101702	 iterations: 277
Line no: 101701	 iterations: 199
Line no: 101801	 iterations: 541
Line no: 101902	 iterations: 304
Line no: 102002	 iterations: 338
Line no: 200801	 iterations: 255
Line no: 710601	 iterations: 872
Line no: 102601	 iterations: 564
Line no: 102501	 it

In [97]:
# Print the shape of each saved data matrix.
# NOTE(review): this reads from the sibling folder 'raw_processed_data_Kaweah',
# whereas the save cell writes into datadir -- confirm the files were moved there.
kaweah_dir = datadir.parent.joinpath('raw_processed_data_Kaweah')
for n_adj in range(1,5):
    npy_path = kaweah_dir.joinpath('data_nadj{}_std{}.npy'.format(n_adj,include_std))
    print(np.load(npy_path).shape)


(24781, 4, 3, 37)
(24695, 4, 5, 37)
(24609, 4, 7, 37)
(24523, 4, 9, 37)
