In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# sites is a list of locations (e.g., "Calhoun"),
# all_pits is a dict with sites as keys, and lists of pits 
# at each site as the values
all_pits = {}
sites = os.listdir('../../processed_data')

# Loop through all sites to construct dict of sites/pits
for site in sites:
    cur_pits = []
    for pro_file in os.listdir(os.path.join('../../processed_data/', site)):
        cur_pits.append(pro_file.split('_')[0])
        
    all_pits[site] = cur_pits

# Data will be a dict with pits as keys, np arrays as values
data = {}
# This is a list of all the features for R1C1. We'll want to make this an exhaustive list
# of all potential features across all pits
features = ['CO2', 'precip', 'SoilMoisture', 'BulkEC', 'Temp', 'O2', 'WaterPotential', 'ReductionPotential']
m = len(features)

# Load in and merge all files
# NOTE: Only doing Calhoun R1C1 for now, but you get the idea
i = 0
for site in all_pits.keys():
    for pit in all_pits[site]:
        
        if pit == 'R1C1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'R1H1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
       
    
        if pit == 'R1P1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'LRMS':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'TMMS':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'NPMS':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'SPVF':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
          
      
        if pit == 'BGZOB1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'BGZOB2':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
       
        if pit == 'BGZOB3':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'BGZOB4':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'Green1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
        if pit == 'Green2':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'Green3':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'MC1':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'MC2':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
        if pit == 'MC3':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'MC4':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
        if pit == 'MC5':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()
            
        if pit == 'MC6':
            infile = os.path.join('../../processed_data', site, '%s_processed.csv' % pit)
            df = pd.read_csv(infile, parse_dates=[0], index_col=0, na_values=[-99999])
            depths = [col.split('_')[1].split('cm')[0] for col in df.columns]
            depths = [d for d in depths if d != 'precip.mm']
            unique_depths = list(set(depths))
            
            # t is the number of time steps
            t = df.shape[0]
            
            # data shape is the # of observations (# time points * depths), # features
            cur_data = np.empty((t*len(unique_depths), m), dtype=float)
            
            for i, depth in enumerate(unique_depths):
                depth_cols = [col for col in df.columns if '_%scm' % depth in col]
                for j, feature in enumerate(features):
                    # Look for columns with this feature and depth in the name
                    if feature == 'precip':
                        col = 'PRISM_precip.mm'
                    else:
                        matches = [col for col in depth_cols if feature in col]
                        
                        # Correct for fact that "O2" search returns "CO2" as well
                        if feature == 'O2':
                            matches = [col for col in matches if 'CO2' not in col]
                        
                        if len(matches) > 1:
                            raise ValueError("""More than one possible match found for %s %s %scm %s""" %(site, pit, depth, feature))
                        elif len(matches) == 1:
                            col = matches[0]
                        else:
                            continue

                    # Finally, now that we have the column name
                    cur_data[i*t:(i+1)*t, j] = df[col].values
                    
            data[pit] = cur_data.copy()

In [21]:
i=2
for site in all_pits.keys():
    for pit in all_pits[site]:
        print (pit)
print (unique_depths)
matches

R1C1
R1C2
R1H1
R1P1
BGZOB1
BGZOB2
BGZOB3
BGZOB4
Green1
Green2
Green3
Green
MC1
MC2
MC3
MC4
MC5
MC6
SFPit1
LRMS
NPMS
SPMS
SPVF
TMMS
['5', '10', '100', '60', '30']


['ApogeeTemp_10cm.deg.C', 'DecagonMPS6Temp_10cm.deg.C']

In [35]:
#df1=pd.DataFrame(data, index=['A'])
data

{'R1C1': array([[ 0., nan,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ...,
        [nan, nan,  0., ..., nan,  0.,  0.],
        [nan, nan,  0., ..., nan,  0.,  0.],
        [nan, nan,  0., ..., nan,  0.,  0.]]),
 'R1H1': array([[0.0000e+00,        nan, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00,        nan, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00,        nan, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [3.3925e+04,        nan, 0.0000e+00, ..., 1.6040e+01, 0.0000e+00,
         0.0000e+00],
        [3.3920e+04,        nan, 0.0000e+00, ..., 1.6040e+01, 0.0000e+00,
         0.0000e+00],
        [3.3950e+04,        nan, 0.0000e+00, ..., 1.6040e+01, 0.0000e+00,
         0.0000e+00]]),
 'R1P1': array([[ 0., nan,  0., ...,  0.,  0.,  0.],
        [ 0., nan,  0., ...,  0.,  0.,  0.],
        [ 

In the above, we'll need to make sure all the data is in the same units, across all sites

Basically, the goal is to build up a numpy array where each row is an observation (an individual site/depth/time; e.g., Calhoun R1C1 on January 1st at 12:15 pm) and each column is a parameter (e.g., CO2, Soil moisture, O2, etc). We'll have lots of NaN values, and that's fine for now.

What I would recommend doing is building up one site/pit at a time, then combining arrays later on. For example, the R1C1 array should look like:

|     |CO2|precip|SoilMoisture|BulkEC|Temp|O2|
|---|---|---|---|---|---|---|
|12/12/20 12:15 pm @ 25 cm | 5000 ppm | 0 | 0.24 | NaN | 4.37 | 19.1 |
|12/12/20 12:30 pm @ 25 cm | 5169 ppm | 0 | 0.26 | NaN | 4.45 | 19.2 |
|12/12/20 12:45 pm @ 25 cm | 5120 ppm | 0 | 0.29 | NaN | 4.42 | 19.1 |
|12/12/20 01:00 pm @ 25 cm | 5148 ppm | 0 | 0.26 | NaN | 4.49 | 19.2 |
| ... | ... | ... | ... | ... | ... | ... |
|07/04/18 09:15 am @ 150 cm | 6952 ppm | 0.01 | 0.39 | NaN | 4.3 | 19.1 |
| ... | ... | ... | ... | ... | ... | ... |
| etc | etc | etc | etc | etc | etc | etc |

Except without the columns or index labels. You could also set it up as a pandas dataframe (ie, with column and index labels) then extract the values later on. Whichever is easier.

In [14]:
output=cur_data.copy()
outfile = '../../processed_data/MergeProcessedData.csv'
output.to_csv(outfile, na_rep=-99999, index_label='DateTime.MST')

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'

In [15]:
outfile = '../../processed_data/MergeProcessedData.csv'

np.savetxt(outfile, output, delimiter=",")


In [16]:
pd.DataFrame(output).to_csv('../../processed_data/MergeProcessedData.csv')


In [10]:
output

{'R1C1': array([[ 0., nan,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.],
        ...,
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan]]),
 'R1H1': array([[0.0000e+00,        nan, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00,        nan, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00,        nan, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.0030e+04,        nan, 2.1000e-01, 1.7000e-02, 2.2870e+01,
         1.7470e+01],
        [1.0025e+04,        nan, 2.1000e-01, 1.7000e-02, 2.2910e+01,
         1.7470e+01],
        [1.0110e+04,        nan, 2.1000e-01, 1.7000e-02, 2.2930e+01,
         1.7470e+01]])}