# Code to convert, back up and log experiments

* Requires the MCDS command-line conversion tool
* Requires the GCE command-line tools, set up for an account with access to storage


In [5]:
import logging
import os
import pickle
import subprocess

import pandas as pd
import xlsxwriter
from google.cloud import storage

# Google Cloud Storage handle used by the upload helpers below.
gcs_client = storage.Client(project='divine-builder-142611')
bucket = gcs_client.get_bucket('meadata')

# Directory that receives the run log.
logpath = '/home/mestalbet/'

# Module logger; INFO and above are recorded.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Each record is written as: timestamp - logger name - level - message.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# File handler writing to SpreadSheet_RunLog.log inside `logpath`.
handler = logging.FileHandler(
    os.path.join(logpath, 'SpreadSheet_RunLog.log')
)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)

# Attach the file handler to the module logger.
logger.addHandler(handler)

In [7]:
def walk(d,suffix = '.msds'):
    '''
    Recursively collect the paths of all files under `d` ending in `suffix`.

    d      -- directory to search
    suffix -- file-name ending to match (default '.msds')

    Returns a list of paths, each joined with the directory it was found in,
    in the order os.walk yields them. A directory that does not exist or
    contains no matches gives an empty list.
    '''
    matches = []
    for root, _dirs, names in os.walk(d):
        for name in names:
            if name.endswith(suffix):
                # Keep the containing directory: callers split the path on
                # '/' to recover folder information.
                matches.append(os.path.join(root, name))
    return matches

def scanforexperiments(searchpath,logpath):
    '''
    Scan `searchpath` recursively for MSDS files and report the new ones.

    The list of files found on the previous run is kept as a pickle named
    'filelist.p' inside `logpath`. Files present in that list are dropped
    from the result, and the pickle is then overwritten with the full list
    from the current scan.

    searchpath -- directory to search for '.msds' files
    logpath    -- directory holding 'filelist.p'

    Returns a tuple (files, numfiles): the files not seen on the previous
    run, in scan order, and how many there are.
    '''
    logger.info('Start: Scan for new files')
    files = walk(searchpath,'.msds')
    logger.info('Process: %d _total_ files found' % len(files))

    # Load the previous scan, if any; on a first run every file is new.
    oldfileslog = os.path.join(logpath,'filelist.p')
    oldfileslist = []
    if os.path.isfile(oldfileslog):
        # NOTE: pickle.load must only be used on files this script wrote.
        with open(oldfileslog,"rb") as pickle_in:
            oldfileslist = pickle.load(pickle_in)
        logger.info('Process: %d _old_ files found' % len(oldfileslist))

    # Record the complete current scan for the next run.
    with open(oldfileslog,"wb") as pickle_out:
        pickle.dump(files, pickle_out)
    logger.info('Process: Old files list updated')

    # Keep only files that were not in the previous scan, preserving order.
    seen = set(oldfileslist)
    files = [f for f in files if f not in seen]
    numfiles = len(files)
    logger.info('Process: %d _new_ files found' % numfiles)
    logger.info('End: Scan for files')
    return(files,numfiles)

In [11]:
def parsefields(files, splitchar = '<'):
    '''
    Build a column dictionary describing every recording in `files`.

    Each path is split on '/'. Every path contributes two entries to every
    column: the first describes the H5 recording, the second its binary
    counterpart (the same value, except that 'h5' becomes 'bin' in the
    file-name columns and the record format becomes 'binaryRecording').

    Columns written for every path: 'exclude', 'folder', 'coating',
    'cleaning', 'comments'. A path component containing `splitchar` is split
    on '_' and each piece is read as key<splitchar>value. Otherwise, a
    component containing 'h5' fills 'MEAfiles', 'recFormat' and 'recNames'.

    files     -- list of '/'-separated paths, each with at least one
                 directory component
    splitchar -- separator between key and value inside a path component

    Returns a dict mapping column name to list of values; empty input gives
    an empty dict.

    Raises IndexError if a '_'-separated piece of a component containing
    `splitchar` does not itself contain `splitchar`.
    '''
    field = {}

    def addpair(key, value, binvalue=None):
        # Two entries per path: one for the H5 row, one for the binary row.
        column = field.setdefault(key, [])
        column.append(value)
        column.append(value if binvalue is None else binvalue)

    for path in files:
        parts = path.split('/')
        addpair('exclude', '')
        addpair('folder', '/'+os.path.join(*parts[:-1]))
        addpair('coating', 'PDL')
        addpair('cleaning', 'Liquinox, DDW')
        for part in parts:
            if splitchar in part:
                for token in part.split('_'):
                    pieces = token.split(splitchar)
                    addpair(pieces[0], pieces[1])
            elif 'h5' in part:
                binname = part.replace('h5','bin')
                addpair('MEAfiles', part, binname)
                addpair('recFormat', 'MCH5Recording', 'binaryRecording')
                addpair('recNames', part, binname)
        # Written twice like every other column so all columns stay the same
        # length and the result can be turned into a DataFrame.
        addpair('comments', '')
    # Return only after every path has been processed.
    return(field)

def makeGSdir(bucket = bucket,directory = 'database/'):
    '''
    Upload an empty object named `directory` to `bucket`.

    bucket    -- bucket object to write to; the default is the module-level
                 `bucket` as it was when this function was defined
    directory -- object name to create (default 'database/')
    '''
    placeholder = bucket.blob(directory)
    placeholder.upload_from_string(
        '',
        content_type='application/x-www-form-urlencoded;charset=UTF-8',
    )
    
def createDirStructure(field):
    '''
    Create a nested 'column=value' directory chain on GS for each recording.

    `field` is turned into a DataFrame and grouped by 'MEAfiles', keeping the
    first value of every other column per group. For each group the path
    starts at 'database' and is extended by one 'column=value' level per
    column, in column order; makeGSdir is called after every extension so
    each intermediate level is created too.

    field -- dict of equal-length columns that includes a 'MEAfiles' column

    Raises whatever pandas raises when the columns have unequal lengths or
    'MEAfiles' is missing.
    '''
    table = pd.DataFrame(field)
    firsts = table.groupby('MEAfiles').agg('first')
    for _, row in firsts.iterrows():
        directory = 'database'
        for column, value in row.items():
            # GS object names always use '/', whatever the local OS uses.
            directory = '%s/%s=%s' % (directory, column, value)
            # Pass by keyword so the path is not taken as the bucket. The
            # trailing '/' mirrors makeGSdir's default 'database/'.
            # NOTE(review): confirm the trailing slash is the wanted form.
            makeGSdir(directory=directory + '/')

def converttoh5(files, fields, gspath, mcspath = '~/MCS_CommandLineTool/'):
    '''
    Convert each discovered MSDS file to HDF5 with the MCS command-line tool.

    files   -- list of paths of the files to convert
    fields  -- not used by this function; kept so existing calls still work
    gspath  -- not used by this function; kept so existing calls still work
    mcspath -- directory containing McsDataCommandLineConverter.exe; a
               leading '~' is expanded to the user's home directory

    Returns the list of H5 paths for the files whose conversion exited with
    status 0. Failed conversions are logged and left out of the result.

    TODO: creating the GS directory structure and uploading the H5 files are
    not implemented here; nothing is logged about them until they are.
    '''
    logger.info('Start: Conversion from MSDS to H5')
    # Popen runs without a shell, so '~' has to be expanded explicitly.
    converter = os.path.join(os.path.expanduser(mcspath),
                             'McsDataCommandLineConverter.exe')
    numfiles = len(files)
    h5files = []
    for i, msdsfile in enumerate(files, start=1):
        # An argument list keeps paths that contain spaces in one piece.
        command = [converter, '-t', 'hdf5', msdsfile]
        process = subprocess.Popen(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        _, error = process.communicate()
        if process.returncode != 0:
            logger.error('File failed to convert with error: \n %s'
                         % error.decode(errors='replace'))
            continue

        # NOTE(review): assumes the converter writes <input name>.h5 next to
        # the input file -- confirm against the tool's behaviour.
        h5files.append(os.path.splitext(msdsfile)[0] + '.h5')
        logger.info('Process: Successfully converted file to H5 %d / %d'
                    % (i, numfiles))

    logger.info('End: Conversion from MSDS to H5')
    return(h5files)

In [None]:
def writexls(path, d):
    '''
    Write the mapping `d` to a new single-sheet workbook at `path`.

    Each key of `d` becomes one spreadsheet column: the key is written in
    row 0 and the items of d[key] in the rows below it, in order.
    '''
    workbook = xlsxwriter.Workbook(path)
    sheet = workbook.add_worksheet()
    for column_index, header in enumerate(d.keys()):
        # Header cell first, then the column's values starting at row 1.
        sheet.write(0, column_index, header)
        for row_index, item in enumerate(d[header], start=1):
            sheet.write(row_index, column_index, item)
    workbook.close()
    return
    
def write_excel(gspath, field):
    '''
    Log the steps of writing the Excel sheet and saving it to GS.

    At present this function only emits the log messages below, in order;
    neither `gspath` nor `field` is read.
    '''
    messages = (
        'Start: Write Excel file log',
        'Process: Successfully parsed directory structure',
        'Process: Successfully updated Excel Sheet',
        'Process: Succesfully copied log file and excel file to GS',
        'End: Write Excel file log',
    )
    for message in messages:
        logger.info(message)
    return