In [5]:
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import math
import openpyxl
from statistics import * 

In [6]:
#This code will parse through the Peak_ID worksheet and generate another worksheet in the same file called "External_STD"
#with concentration and peak area data for C7, C8, ..., C17 arranged like so:
#
# Chain Length | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area
# _____________________________________________________________________________________________________________________
#      C7      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C8      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      ...     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C17     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#

In [7]:
def getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs):
    col = [area_col, chain_col]
    wb = openpyxl.load_workbook(file)
    sheet_name = wb[sheet]
    #create dataframe from relevant columns only
    df = pd.read_excel(file, sheet_name=sheet, usecols = col, header = None)
    print(df)
    #convert dataframe to array to allow array traversals
    dfList = df.values.tolist()
    #Master_dict will hold area and concentration data 
    Master_dict = standards_used

    # creates list of external standard title indicies (location of titles)
    alc_acid_indicies = []
    for i in range(len(dfList)):
        name = dfList[i][1]
        if name[0:name.find('-')] == alc_acid_ID:
            alc_acid_indicies.append(i)
    
    conc_array = []
    titleTypes = ['Peak#', 'R.Time', 'I.Time', 'F.Time', 'Area', 'Height']
    # start adding to master matrix here using indicies of titles
    for i in alc_acid_indicies:
        #find the position right before the start of a new set of data as indicated by a string
        endindex = i
        for k in range(len(dfList[i:len(dfList)])):
            endindex+=1
            print(endindex)
            testVar = dfList[endindex][0]
            print(testVar)
            if(isinstance(testVar,str) and testVar not in titleTypes):
                break
        #parse title to find concentration (gets the number after '-' in the title) and add to conc_array and sort
        conc = dfList[i][1][dfList[i][1].find('-')+1:len(dfList[i][1])]
        conc_array.append(int(conc))
        conc_array.sort()
        #for each chain, add the concentration and area data to the Master dictionary
        for j in range(endindex-i):
            if(isinstance(dfList[i+j][0], int)):
                conc_and_area = [conc, dfList[i+j][0]]
                Master_dict[dfList[i+j][1]].append(conc_and_area)

    #create master matrix holding arrays of area values
    num_conc, num_chain = len(conc_array), len(Master_dict);
    Master = [[0 for x in range(num_conc*2+1)] for y in range(num_chain+1)]
    
    #populate column titles
    Master[0][0] = 'Chain Length'
    for i in range(1, num_conc*2+1):
        if i%2==0:
            Master[0][i] = 'Peak Area'
        else:
            Master[0][i] = 'Conc (mg/L)'
    
    #create matrix with desired layout as seen in Cell 2
    chain_iterator = 1
    for chain in Master_dict:
        conc_iterator = 1
        #label first column with chain
        Master[chain_iterator][0] = chain
        for conc in conc_array:
            area = []
            #search for correct area data
            for pair in Master_dict[chain]:
                if int(pair[0]) == conc:
                    area.append(pair[1])
                    break
            #this accounts for double-counting areas (only picks the largest one)
            area.sort(reverse = True)
            #this accounts for no area data
            if len(area) == 0:
                area = ['']
            #add to the master matrix
            Master[chain_iterator][conc_iterator] = conc
            conc_iterator+=1
            Master[chain_iterator][conc_iterator] = area[0]
            conc_iterator+=1
        chain_iterator+=1
        
    excelData = pd.DataFrame(data=Master, index=None)
    #set first row as header for aesthetics
    new_header = excelData.iloc[0]
    excelData = excelData[1:]
    excelData.columns = new_header
    #write to the excel file
    writer = pd.ExcelWriter(file, engine = 'openpyxl')
    writer.book = wb
    excelData.to_excel(writer, sheet_name = "EXT_STD", index = False)
    wb.save(filename = saveAs)
    return excelData

In [8]:
#7/27/2020 - Mike, I changed all 'sheet_name' to 'sheetname' because 'sheet_name' was causing problems on my end,
# so you will need to change it back later if it's causing issues on your end.

file = 'GCData-JGI_ACRs.xlsx' #file name
sheet = 'Peak_ID' #name of sheet containing area and chain identification data
area_col = 4 #column number of area data indexed starting at 0 i.e. column A is 0, column B is 1, etc.
chain_col = 5 #column number of chain identification indexed starting at 0 i.e. column A is 0, column B is 1, etc.
standards_used = {'C3':[],'C4':[],'C5':[],'C6':[],'C7':[],
                  'C8':[], 'C9':[], 'C10':[], 'C11':[], 'C12':[],
                  'C13':[], 'C14':[], 'C15':[], 'C16':[], 'C17':[]} #external standards used. Add or remove chains.
alc_acid_ID = 'FAOH' #FAOH or FAME, assumes the format: <FAME/FAOH>-<Concentration> e.g. FAOH-100 or FAME-2000
saveAs = 'GCData-JGI_ACRs.xlsx' #save as new/old file name
getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs)

           4        5
0      A10-1    A10-1
1       Area  Peak_ID
2        431       C3
3    4235909       C5
4    4351572       C5
..       ...      ...
708    59901      C13
709    63535      C14
710    64730      C15
711    64672      C16
712    75290      C17

[713 rows x 2 columns]
317
Area
318
3062
319
2126
320
3155
321
4485
322
5172
323
5359
324
5781
325
5943
326
6832
327
6733
328
7215
329
7555
330
8420
331
8487
332
9268
333
FAOH-100
334
Area
335
32427
336
20414
337
29701
338
42948
339
49388
340
51426
341
55544
342
55959
343
58710
344
61907
345
66346
346
67376
347
72272
348
67762
349
435
350
67421
351
FAOH-1000
352
Area
353
469104
354
241543
355
337997
356
502347
357
588810
358
618352
359
670818
360
658831
361
682233
362
707998
363
729820
364
723283
365
742848
366
677910
367
594
368
644150
369
FAOH-2000
370
Area
371
740509
372
531610
373
804287
374
954886
375
1019142
376
1099821
377
1089808
378
1119563
379
1149985
380
1179169
381
1173910
382
1178984
383
1083152
384
1024810
385
F

Unnamed: 0,Chain Length,Conc (mg/L),Peak Area,Conc (mg/L).1,Peak Area.1,Conc (mg/L).2,Peak Area.2,Conc (mg/L).3,Peak Area.3,Conc (mg/L).4,Peak Area.4,Conc (mg/L).5,Peak Area.5
1,C3,10,3062,50,18877,100,32427,500,231392,1000,469104,2000,740509.0
2,C4,10,2126,50,12053,100,20414,500,126804,1000,241543,2000,
3,C5,10,3155,50,17503,100,29701,500,178598,1000,337997,2000,531610.0
4,C6,10,4485,50,25285,100,42948,500,261472,1000,502347,2000,804287.0
5,C7,10,5172,50,28764,100,49388,500,297295,1000,588810,2000,954886.0
6,C8,10,5359,50,30207,100,51426,500,313020,1000,618352,2000,1019142.0
7,C9,10,5781,50,32428,100,55544,500,336594,1000,670818,2000,1099821.0
8,C10,10,5943,50,32775,100,55959,500,336066,1000,658831,2000,1089808.0
9,C11,10,6832,50,34519,100,58710,500,350711,1000,682233,2000,1119563.0
10,C12,10,6733,50,36394,100,61907,500,366312,1000,707998,2000,1149985.0
