In [24]:
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# global variables go here:


# Metadata support for dataframes:
# (following    https://github.com/pandas-dev/pandas/issues/2485   and
#      http://pandas.pydata.org/pandas-docs/stable/internals.html#override-constructor-properties    )
# (consider adding h5 storage, http://stackoverflow.com/questions/29129095/save-additional-attributes-in-pandas-dataframe)

# [TODO]

# Filepath parsing:
from experimentdataanalysis.parsing.scandataparsing \
    import analyze_scan_filepath, analyze_string_for_dict_pairs
# Keyword tables handed to analyze_scan_filepath (bundled below as
# parsing_keyword_lists).
# NOTE(review): the matching rules are implemented in scandataparsing; the
# per-table descriptions here are inferred from the variable names -- confirm.

# (keyword, info tag, value) triples -- presumably a flag set when a path
# element matches the keyword.
this_element_keyword_list = [
    ("TRKR", "IsTRKR?", True),
    ("RSA", "IsRSA?", True),
]

# (keyword, info tag(s)) pairs -- presumably the value is taken from the
# path element following the keyword.
next_element_keyword_list = [
    ("Ind", "SecondScanIndex"),
    ("2Dscan", ["SecondScanType", "FirstScanType"]),
]

# (keyword, info tag) pairs -- presumably the value is embedded in the same
# path element as the keyword. Order is kept exactly as originally written.
in_this_element_keyword_list = [
    ("Vcm", "Electric Field (V/cm)"),
    ("mT", "Magnetic Field (mT)"),
    ("K", "Set Temperature (K)"),
    ("nm", "Wavelength (nm)"),
    ("ps", "Delay Time (ps)"),
    ("run", "RunIndex"),
    ("V", "Voltage (V)"),
    ("x", "SecondScanCoord"),
    ("uWpump", "Pump Power (uW)"),
    ("uWprobe", "Probe Power (uW)"),
    ("TestPhaseShift", "TestPhaseShift Experiment #"),
    ("repeats", "# repeats"),
]

# The three tables, in the order they are passed to analyze_scan_filepath.
parsing_keyword_lists = [
    this_element_keyword_list,
    next_element_keyword_list,
    in_this_element_keyword_list,
]
def analyze_filepath(filepath, existing_info_dict=None):
    """Build an info dict for a scan file from its path.

    Delegates to ``analyze_scan_filepath`` using the notebook-level
    ``parsing_keyword_lists``, then adds two derived entries:

    * if both 'SecondScanType' and 'SecondScanCoord' were found, the
      coordinate is also stored under the key named by 'SecondScanType';
    * if 'BExternal' was found, its value is copied to
      'Magnetic Field (mT)'.

    Parameters
    ----------
    filepath : str
        Path of the data file to analyze.
    existing_info_dict : dict, optional
        Dict handed to ``analyze_scan_filepath``. A new empty dict is used
        when omitted.
        NOTE(review): whether the parser mutates this dict in place or
        returns a new one is not visible here -- confirm in scandataparsing.

    Returns
    -------
    dict
        The info dict returned by ``analyze_scan_filepath`` with the derived
        entries added.
    """
    # Fix: the fresh dict used to be bound to a name that was immediately
    # overwritten, so None was still forwarded to the parser.
    if existing_info_dict is None:
        existing_info_dict = {}
    info_dict = analyze_scan_filepath(filepath, existing_info_dict,
                                      parsing_keyword_lists)
    if {'SecondScanType', 'SecondScanCoord'}.issubset(info_dict.keys()):
        info_dict[info_dict['SecondScanType']] = info_dict['SecondScanCoord']
    if 'BExternal' in info_dict:
        info_dict['Magnetic Field (mT)'] = info_dict['BExternal']
    return info_dict


In [30]:
# REQUIRED: directory containing 2D scans (searched recursively)
parent_dir = 'C:\\Data\\apr2017\\170411\\'

# REQUIRED: data storage format information
# number of free-form lines preceding the column-name row in each data file
num_headerlines = 4

# OPTIONAL: info_dict-based filtering. Each function receives a file's
#           info_dict and returns False if that file should be ignored;
#           a file is kept only if every function returns True.
filter_fcns = [
    lambda info_dict: info_dict.get("TestPhaseShift Experiment #") == 5,
    lambda info_dict: info_dict.get("# repeats", 1) <= 2,
    # default to "" so a missing "Filepath" entry does not raise TypeError
    lambda info_dict: "alignment" not in info_dict.get("Filepath", ""),
]

# OPTIONAL: specification of specific columns in data file as x-coords, y-coords.
#           default: X = 1st column in file, Y = 2nd column in file.
#                    (or in terms of resulting DataFrame, X = index, Y = 1st column)
data_xfield = None
data_yfield = 'lockin1x'

# OPTIONAL: specify a scalar value to be used as the measurement error of
#           given y-values, i.e., their "error bars"
data_fixed_uncertainty = None

# OPTIONAL: info tags to add as columns, as (info tag, column name) pairs.
#           Two tags share the 'b_external' column; when both are present
#           the later pair in this list overwrites the earlier one.
info_tag_to_column_list = [
    ('Magnetic Field (mT)', 'b_external'),
    ('BExternal', 'b_external'),
    ('Run ID', 'run_id'),
    ('SecondScanIndex', 'index_in_run'),
]

# Values referenced as @variables by the DataFrame.eval expressions below.
# NOTE(review): units are not stated anywhere in this notebook -- confirm.
time_elapsed_per_delay_scan = 8.0
time_elapsed_per_delay_pos = 1.0

# OPTIONAL: derived columns, evaluated in order with DataFrame.eval on each
#           per-file dataframe ('index' is the dataframe's row index).
column_expression_list = [
    'index_in_scan = index',
    'time_elapsed = '
    '@time_elapsed_per_delay_scan * index_in_run + @time_elapsed_per_delay_pos * index',
]

# PROCESSING:
# Collect every file below parent_dir whose name contains '.dat'.
target_filepaths = []
for dirpath, dirnames, filenames in os.walk(parent_dir):
    for filename in filenames:
        if '.dat' in filename:
            target_filepaths.append(os.path.join(dirpath, filename))

# file_dataframes_list: full dataframes as read from disk (kept files only)
# dataframes_list:      reduced dataframes (x, y, info columns, derived columns)
file_dataframes_list = []
dataframes_list = []
last_dir = ''
dir_counter = -1  # incremented whenever the containing directory name changes
for filepath in target_filepaths:
    with open(filepath) as file_lines_iterator:
        # Consume the header lines first; pandas then continues reading from
        # the same handle, so its header=0 row is the line right after them.
        header_lines = [next(file_lines_iterator)
                        for _ in range(num_headerlines)]
        file_dataframe = pd.read_csv(filepath_or_buffer=file_lines_iterator,
                                     header=0,
                                     index_col=False,  # no auto-index, needed when tabs at end of each line
                                     sep='\t',
                                     skiprows=0)

    # Resolve unspecified x/y fields from this file's columns. Once set they
    # stay set, so the defaults come from the first file processed.
    column_names = file_dataframe.columns.values.tolist()
    if data_xfield is None:
        data_xfield = column_names[0]
    if data_yfield is None:
        data_yfield = column_names[1]

    # Gather metadata from the file path and from the header text.
    # NOTE(review): analyze_string_for_dict_pairs' return value is unused, so
    # it presumably updates file_info_dict in place -- confirm.
    file_info_dict = analyze_filepath(filepath)
    analyze_string_for_dict_pairs(''.join(header_lines), file_info_dict)

    # Name of the directory holding the file. os.path is used instead of
    # splitting on a backslash so this also works with other path separators.
    current_dir = os.path.basename(os.path.dirname(filepath))
    if current_dir != last_dir:
        last_dir = current_dir
        dir_counter += 1
    file_info_dict['Run ID'] = dir_counter

    # Skip the file unless every filter function accepts its info dict.
    if not all(filter_fcn(file_info_dict) for filter_fcn in filter_fcns):
        continue

    # Build the reduced dataframe based on user specifications.
    new_dataframe = pd.DataFrame(file_dataframe,
                                 columns=[data_xfield, data_yfield])
    for info_tag, column_name in info_tag_to_column_list:
        if info_tag in file_info_dict:
            new_dataframe[column_name] = file_info_dict[info_tag]
    for expression in column_expression_list:
        new_dataframe.eval(expression, inplace=True)

    # Tag on a custom attribute to keep track of metadata.
    # NOTE(review): plain attributes set on a DataFrame are not expected to
    # survive operations that return a new frame (copy, concat, query).
    file_dataframe.info_dict = file_info_dict
    new_dataframe.info_dict = file_info_dict.copy()

    # add new dataframes to the lists
    file_dataframes_list.append(file_dataframe)
    dataframes_list.append(new_dataframe)

    

In [31]:
# Show the full dataframe (all columns as read from disk) of the first file
# that passed the filters.
file_dataframes_list[0]

Unnamed: 0,scancoord,lockin2x,lockin1x,lockin2r,lockin1r,laserpower,cwetalon,lasercomponent1,lasercomponent2,temperature
0,-500.1108,9e-05,0.00092,9e-05,0.00092,0.691,-0.173,0.393,0.001,10.003
1,-400.1108,4.5e-05,-0.00026,5e-05,0.00026,0.688,-0.23,0.395,-0.382,10.003
2,-300.1108,1e-05,-0.00132,2e-05,0.0013,0.682,-0.175,0.001,-0.376,10.005
3,-200.1108,-5e-06,-0.00186,1.5e-05,0.00186,0.693,-0.175,0.394,0.001,10.008
4,-100.1108,2e-05,-0.00144,2e-05,0.00144,0.678,-0.193,0.001,-0.005,10.002


In [32]:
# Show the reduced dataframe for the same file: selected x/y columns plus the
# info-tag columns and the evaluated expression columns.
dataframes_list[0]

Unnamed: 0,scancoord,lockin1x,b_external,run_id,index_in_run,index_in_scan,time_elapsed
0,-500.1108,0.00092,0.204,15,10.0,0,80.0
1,-400.1108,-0.00026,0.204,15,10.0,1,81.0
2,-300.1108,-0.00132,0.204,15,10.0,2,82.0
3,-200.1108,-0.00186,0.204,15,10.0,3,83.0
4,-100.1108,-0.00144,0.204,15,10.0,4,84.0


In [33]:
# Stack all per-file dataframes row-wise. Each file's own row index is kept,
# so index values repeat across files (see the index_in_scan column).
giant_dataframe = pd.concat(dataframes_list)
giant_dataframe.head(12)

Unnamed: 0,scancoord,lockin1x,b_external,run_id,index_in_run,index_in_scan,time_elapsed
0,-500.1108,0.00092,0.204,15,10.0,0,80.0
1,-400.1108,-0.00026,0.204,15,10.0,1,81.0
2,-300.1108,-0.00132,0.204,15,10.0,2,82.0
3,-200.1108,-0.00186,0.204,15,10.0,3,83.0
4,-100.1108,-0.00144,0.204,15,10.0,4,84.0
0,-500.1108,0.00272,0.203,15,11.0,0,88.0
1,-400.1108,0.0022,0.203,15,11.0,1,89.0
2,-300.1108,0.00054,0.203,15,11.0,2,90.0
3,-200.1108,-0.00166,0.203,15,11.0,3,91.0
4,-100.1108,-0.00282,0.203,15,11.0,4,92.0


In [95]:
# Keep rows with |scancoord| > 1.0, rename the scan coordinate to
# 'probe_delay', and index by (run_id, index_in_run, index_in_scan).
# Written as a method chain rather than inplace=True calls on the query()
# result: every step returns a new frame, so giant_dataframe is untouched and
# re-running the cell gives the same result.
TRKR_fit_dataframe = (
    giant_dataframe
    .query('scancoord > 1.0 or scancoord < -1.0')
    .rename(columns={'scancoord': 'probe_delay'})
    .set_index(['run_id', 'index_in_run', 'index_in_scan'],
               drop=True, append=False)
    .sort_index(ascending=True)
)
TRKR_fit_dataframe.tail(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,probe_delay,lockin1x,b_external,time_elapsed
run_id,index_in_run,index_in_scan,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
31,73.0,0,-500.1662,0.0027,0.201,584.0
31,73.0,1,-400.1662,-8e-05,0.201,585.0
31,73.0,2,-300.1662,-0.00238,0.201,586.0
31,73.0,3,-200.1662,-0.00258,0.201,587.0
31,73.0,4,-100.1662,-0.00064,0.201,588.0
31,74.0,0,-500.1662,0.00524,0.2,592.0
31,74.0,1,-400.1662,0.00242,0.2,593.0
31,74.0,2,-300.1662,-0.00098,0.2,594.0
31,74.0,3,-200.1662,-0.0028,0.2,595.0
31,74.0,4,-100.1662,-0.0025,0.2,596.0


In [98]:
# Group the rows of each (run_id, index_in_run) pair and count the non-null
# entries per column in every group. A stray "x`" line that made this cell a
# syntax error has been removed.
grouping = TRKR_fit_dataframe.groupby(level=['run_id', 'index_in_run'])
grouping.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,probe_delay,lockin1x,b_external,time_elapsed
run_id,index_in_run,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
15,1.0,5,5,5,5
15,2.0,5,5,5,5
15,3.0,5,5,5,5
15,4.0,5,5,5,5
15,5.0,5,5,5,5
15,6.0,5,5,5,5
15,7.0,5,5,5,5
15,8.0,5,5,5,5
15,9.0,5,5,5,5
15,10.0,5,5,5,5


In [55]:
# Same row selection as the TRKR frame (|scancoord| > 1.0; per the original
# note such values can't be b-field, which ensures TRKR), but indexed by
# (run_id, index_in_scan, index_in_run). drop=False keeps the index fields as
# regular columns too.
# Written as a method chain rather than inplace=True calls on the query()
# result: every step returns a new frame, so giant_dataframe is untouched and
# re-running the cell gives the same result.
slow_RSA_dataframe = (
    giant_dataframe
    .query('scancoord > 1.0 or scancoord < -1.0')
    .rename(columns={'scancoord': 'probe_delay'})
    .set_index(['run_id', 'index_in_scan', 'index_in_run'],
               drop=False, append=False)
    .sort_index(ascending=True)
)
slow_RSA_dataframe.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,probe_delay,lockin1x,b_external,run_id,index_in_run,index_in_scan,time_elapsed
run_id,index_in_scan,index_in_run,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15,0,1.0,-500.1108,0.00062,-0.3,15,1.0,0,8.0
15,0,2.0,-500.1108,0.00062,0.0,15,2.0,0,16.0
15,0,3.0,-500.1108,0.00062,0.3,15,3.0,0,24.0
15,0,4.0,-500.1108,-0.0008,0.21,15,4.0,0,32.0
15,0,5.0,-500.1108,-0.00066,0.209,15,5.0,0,40.0
15,0,6.0,-500.1108,-0.00062,0.208,15,6.0,0,48.0
15,0,7.0,-500.1108,-0.00038,0.207,15,7.0,0,56.0
15,0,8.0,-500.1108,-0.00028,0.206,15,8.0,0,64.0
15,0,9.0,-500.1108,4e-05,0.205,15,9.0,0,72.0
15,0,10.0,-500.1108,0.00092,0.204,15,10.0,0,80.0


In [None]:
# Show the metadata dict that the processing loop attached to the first kept
# file dataframe as its custom `info_dict` attribute.
file_dataframes_list[0].info_dict