In [13]:
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import math
import openpyxl
from statistics import * 

In [14]:
#This code will parse through the Peak_ID worksheet and generate another worksheet in the same file called "External_STD"
#with concentration and peak area data for C7, C8, ..., C17 arranged like so:
#
# Chain Length | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area
# _____________________________________________________________________________________________________________________
#      C7      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C8      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      ...     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C17     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#


###Important note: The standard data entries need to be included in the dataframe such that they are not the first nor the last entry. The loop throws an index out of range error otherwise.

In [15]:
def getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs):
    col = [area_col, chain_col]
    wb = openpyxl.load_workbook(file)
    sheet_name = wb[sheet]
    #create dataframe from relevant columns only
    df = pd.read_excel(file, sheet_name=sheet, usecols = col, header = None)
    print(df)
    #convert dataframe to array to allow array traversals
    dfList = df.values.tolist()
    print(dfList)
    #Master_dict will hold area and concentration data 
    Master_dict = standards_used

    # creates list of external standard title indicies (location of titles)
    alc_acid_indicies = []
    for i in range(len(dfList)):
        name = dfList[i][1]
        if name[0:name.find('-')] == alc_acid_ID:
            alc_acid_indicies.append(i)
    
    conc_array = []
    titleTypes = ['Peak#', 'R.Time', 'I.Time', 'F.Time', 'Area', 'Height']
    # start adding to master matrix here using indicies of titles
    for i in alc_acid_indicies:
        #find the position right before the start of a new set of data as indicated by a string
        endindex = i
        for k in range(len(dfList[i:len(dfList)])):
            endindex+=1
            print(endindex)
            testVar = dfList[endindex][0]
            print(testVar)
            if(isinstance(testVar,str) and testVar not in titleTypes):
                break
        #parse title to find concentration (gets the number after '-' in the title) and add to conc_array and sort
        conc = dfList[i][1][dfList[i][1].find('-')+1:len(dfList[i][1])]
        conc_array.append(int(conc))
        conc_array.sort()
        #for each chain, add the concentration and area data to the Master dictionary
        for j in range(endindex-i):
            if(isinstance(dfList[i+j][0], int)):
                conc_and_area = [conc, dfList[i+j][0]]
                Master_dict[dfList[i+j][1]].append(conc_and_area)

    #create master matrix holding arrays of area values
    num_conc, num_chain = len(conc_array), len(Master_dict);
    Master = [[0 for x in range(num_conc*2+1)] for y in range(num_chain+1)]
    
    #populate column titles
    Master[0][0] = 'Chain Length'
    for i in range(1, num_conc*2+1):
        if i%2==0:
            Master[0][i] = 'Peak Area'
        else:
            Master[0][i] = 'Conc (mg/L)'
    
    #create matrix with desired layout as seen in Cell 2
    chain_iterator = 1
    for chain in Master_dict:
        conc_iterator = 1
        #label first column with chain
        Master[chain_iterator][0] = chain
        for conc in conc_array:
            area = []
            #search for correct area data
            for pair in Master_dict[chain]:
                if int(pair[0]) == conc:
                    area.append(pair[1])
                    break
            #this accounts for double-counting areas (only picks the largest one)
            area.sort(reverse = True)
            #this accounts for no area data
            if len(area) == 0:
                area = ['']
            #add to the master matrix
            Master[chain_iterator][conc_iterator] = conc
            conc_iterator+=1
            Master[chain_iterator][conc_iterator] = area[0]
            conc_iterator+=1
        chain_iterator+=1
        
    excelData = pd.DataFrame(data=Master, index=None)
    #set first row as header for aesthetics
    new_header = excelData.iloc[0]
    excelData = excelData[1:]
    excelData.columns = new_header
    #write to the excel file
    writer = pd.ExcelWriter(file, engine = 'openpyxl')
    writer.book = wb
    excelData.to_excel(writer, sheet_name = "EXT_STD", index = False)
    wb.save(filename = saveAs)
    return excelData

In [16]:
#7/27/2020 - Mike, I changed all 'sheet_name' to 'sheetname' because 'sheet_name' was causing problems on my end,
# so you will need to change it back later if it's causing issues on your end.

file = 'GCData_C10TE_FAME.xlsx' #file name

# 11-19-2020 had to manually mess with the external standard data in the Quantification w IS,ES tab. Needed to delete lines with 12_1, 14_1, 16_1

sheet = 'Quantification w IS,ES' #name of sheet containing area and chain identification data
area_col = 3 #column number of area data indexed starting at 0 i.e. column A is 0, column B is 1, etc.
chain_col = 5 #column number of chain identification indexed starting at 0 i.e. column A is 0, column B is 1, etc.
standards_used = {'C7':[],'C8':[], 'C9':[], 'C10':[], 'C11':[], 'C12':[],
                  'C13':[], 'C14':[], 'C15':[], 'C16':[], 'C17':[], 'C18_1':[], 'C18':[]} #external standards used. Add or remove chains.
alc_acid_ID = 'FAME' #FAOH or FAME, assumes the format: <FAME/FAOH>-<Concentration> e.g. FAOH-100 or FAME-2000
saveAs = 'GCData_C10TE_FAME.xlsx' #save as new/old file name
getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs)

             3          5
0    ChFatB2-1  ChFatB2-1
1       F.Time    Peak_ID
2        67401         C7
3       185926         C8
4       238545         C9
..         ...        ...
287      27043      C16_1
288      37736        C16
289      64212        C17
290      69550      C18_1
291       1533        C18

[292 rows x 2 columns]
[['ChFatB2-1', 'ChFatB2-1'], ['F.Time', 'Peak_ID'], [67401, 'C7'], [185926, 'C8'], [238545, 'C9'], [91782, 'C10'], [309440, 'C11'], [4813, 'C12_1'], [5077, 'C12'], [349098, 'C13'], [2180, 'C14_1'], [11714, 'C14'], [58812, 'C15'], [21594, 'C16_1'], [99189, 'C16'], [62927, 'C17'], [31499, 'C18_1'], [3538, 'C18'], ['ChFatB2-2', 'ChFatB2-2'], ['F.Time', 'Peak_ID'], [91168, 'C7'], [197289, 'C8'], [279760, 'C9'], [95450, 'C10'], [367492, 'C11'], [5161, 'C12_1'], [5107, 'C12'], [415416, 'C13'], [2383, 'C14_1'], [12809, 'C14'], [69850, 'C15'], [24043, 'C16_1'], [113565, 'C16'], [74539, 'C17'], [38089, 'C18_1'], [4143, 'C18'], ['ChFatB2-3', 'ChFatB2-3'], ['F.Time',

Unnamed: 0,Chain Length,Conc (mg/L),Peak Area,Conc (mg/L).1,Peak Area.1,Conc (mg/L).2,Peak Area.2,Conc (mg/L).3,Peak Area.3,Conc (mg/L).4,Peak Area.4
1,C7,25,13404,50,35344,100,51809,500,371446,1000,710225
2,C8,25,13540,50,35831,100,52785,500,379696,1000,720618
3,C9,25,14982,50,40111,100,58931,500,425129,1000,802022
4,C10,25,15422,50,41177,100,60824,500,438411,1000,823777
5,C11,25,15719,50,42206,100,62513,500,452843,1000,846202
6,C12,25,15792,50,42769,100,63446,500,461495,1000,857547
7,C13,25,17259,50,46322,100,68818,500,503722,1000,929902
8,C14,25,21031,50,44228,100,94286,500,520167,1000,1079785
9,C15,25,22018,50,46724,100,99450,500,546896,1000,1135468
10,C16,25,22308,50,47587,100,101563,500,558591,1000,1160369
