In [5]:
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import math
import openpyxl
from statistics import * 

In [6]:
#This code will parse through the Peak_ID worksheet and generate another worksheet in the same file called "External_STD"
#with concentration and peak area data for C7, C8, ..., C17 arranged like so:
#
# Chain Length | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area | Conc (mg/L) | Peak Area
# _____________________________________________________________________________________________________________________
#      C7      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C8      |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      ...     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#      C17     |    1000     |     #     |     500     |     #     |     100     |     #     |     25      |     #
#


###Important note: The standard data entries need to be included in the dataframe such that they are not the first nor the last entry. The loop throws an index out of range error otherwise.

In [7]:
def getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs):
    col = [area_col, chain_col]
    wb = openpyxl.load_workbook(file)
    sheet_name = wb[sheet]
    #create dataframe from relevant columns only
    df = pd.read_excel(file, sheet_name=sheet, usecols = col, header = None)
    print(df)
    #convert dataframe to array to allow array traversals
    dfList = df.values.tolist()
    print(dfList)
    #Master_dict will hold area and concentration data 
    Master_dict = standards_used

    # creates list of external standard title indicies (location of titles)
    alc_acid_indicies = []
    for i in range(len(dfList)):
        name = dfList[i][1]
        if name[0:name.find('-')] == alc_acid_ID:
            alc_acid_indicies.append(i)
    
    conc_array = []
    titleTypes = ['Peak#', 'R.Time', 'I.Time', 'F.Time', 'Area', 'Height']
    # start adding to master matrix here using indicies of titles
    for i in alc_acid_indicies:
        #find the position right before the start of a new set of data as indicated by a string
        endindex = i
        for k in range(len(dfList[i:len(dfList)])):
            endindex+=1
            print(endindex)
            testVar = dfList[endindex][0]
            print(testVar)
            if(isinstance(testVar,str) and testVar not in titleTypes):
                break
        #parse title to find concentration (gets the number after '-' in the title) and add to conc_array and sort
        conc = dfList[i][1][dfList[i][1].find('-')+1:len(dfList[i][1])]
        conc_array.append(int(conc))
        conc_array.sort()
        #for each chain, add the concentration and area data to the Master dictionary
        for j in range(endindex-i):
            if(isinstance(dfList[i+j][0], int)):
                conc_and_area = [conc, dfList[i+j][0]]
                Master_dict[dfList[i+j][1]].append(conc_and_area)

    #create master matrix holding arrays of area values
    num_conc, num_chain = len(conc_array), len(Master_dict);
    Master = [[0 for x in range(num_conc*2+1)] for y in range(num_chain+1)]
    
    #populate column titles
    Master[0][0] = 'Chain Length'
    for i in range(1, num_conc*2+1):
        if i%2==0:
            Master[0][i] = 'Peak Area'
        else:
            Master[0][i] = 'Conc (mg/L)'
    
    #create matrix with desired layout as seen in Cell 2
    chain_iterator = 1
    for chain in Master_dict:
        conc_iterator = 1
        #label first column with chain
        Master[chain_iterator][0] = chain
        for conc in conc_array:
            area = []
            #search for correct area data
            for pair in Master_dict[chain]:
                if int(pair[0]) == conc:
                    area.append(pair[1])
                    break
            #this accounts for double-counting areas (only picks the largest one)
            area.sort(reverse = True)
            #this accounts for no area data
            if len(area) == 0:
                area = ['']
            #add to the master matrix
            Master[chain_iterator][conc_iterator] = conc
            conc_iterator+=1
            Master[chain_iterator][conc_iterator] = area[0]
            conc_iterator+=1
        chain_iterator+=1
        
    excelData = pd.DataFrame(data=Master, index=None)
    #set first row as header for aesthetics
    new_header = excelData.iloc[0]
    excelData = excelData[1:]
    excelData.columns = new_header
    #write to the excel file
    writer = pd.ExcelWriter(file, engine = 'openpyxl')
    writer.book = wb
    excelData.to_excel(writer, sheet_name = "EXT_STD", index = False)
    wb.save(filename = saveAs)
    return excelData

In [13]:
#7/27/2020 - Mike, I changed all 'sheet_name' to 'sheetname' because 'sheet_name' was causing problems on my end,
# so you will need to change it back later if it's causing issues on your end.

file = 'GCData_C10TE_FAME.xlsx' #file name
file = 'GCData_TE_ML.xlsx'
# 11-19-2020 had to manually mess with the external standard data in the Quantification w IS,ES tab. Needed to delete lines with 12_1, 14_1, 16_1

sheet = 'Quantification w IS,ES' #name of sheet containing area and chain identification data
area_col = 3 #column number of area data indexed starting at 0 i.e. column A is 0, column B is 1, etc.
chain_col = 5 #column number of chain identification indexed starting at 0 i.e. column A is 0, column B is 1, etc.
standards_used = {'C7':[],'C8':[], 'C9':[], 'C10':[], 'C11':[], 'C12':[],
                  'C13':[], 'C14':[], 'C15':[], 'C16':[], 'C17':[], 'C18_1':[], 'C18':[]} #external standards used. Add or remove chains.
alc_acid_ID = 'FAME' #FAOH or FAME, assumes the format: <FAME/FAOH>-<Concentration> e.g. FAOH-100 or FAME-2000
saveAs = 'GCData_C10TE_FAME.xlsx' #save as new/old file name


file = 'GCData_for_Ryan_w_FAME_STD_Data.xlsx'
sheet = 'Peak_ID'
area_col = 4
chain_col = 5
standards_used = {'C7':[], 'C8':[], 'C9':[], 'C10':[], 'C11':[], 'C12':[], 
                  'C13':[], 'C14':[], 'C15':[], 'C16':[], 'C17':[], 'C18':[]}
alc_acid_ID = 'FAME'
saveAs = 'GCData_for_Ryan_w_FAME_STD_Data.xlsx'


getExternalStdData(file, sheet, area_col, chain_col, standards_used, alc_acid_ID, saveAs)

                 4              5
0    133_222_227-1  133_222_227-1
1             Area        Peak_ID
2           117671             C7
3              717             C8
4           213338             C9
..             ...            ...
703         276457            C14
704          47646            C15
705          58214            C16
706          47608            C17
707          33447            C18

[708 rows x 2 columns]
[['133_222_227-1', '133_222_227-1'], ['Area', 'Peak_ID'], [117671, 'C7'], [717, 'C8'], [213338, 'C9'], [915, 'C10'], [6567, 'C10'], [256043, 'C11'], [1286, 'C12'], [340389, 'C12'], [275255, 'C13'], [1505, 'C13'], [22882, 'C14'], [55907, 'C15'], [245, 'C15'], [54438, 'C16'], [55584, 'C17'], [61119, 'C18'], [2378, 'C18'], [529, 'C18'], ['133_222_227-2', '133_222_227-2'], ['Area', 'Peak_ID'], [91992, 'C7'], [598, 'C8'], [159628, 'C9'], [685, 'C10'], [4215, 'C10'], [188016, 'C11'], [877, 'C12'], [255908, 'C12'], [203644, 'C13'], [1322, 'C13'], [17027, 'C14'], [41175

Unnamed: 0,Chain Length,Conc (mg/L),Peak Area,Conc (mg/L).1,Peak Area.1,Conc (mg/L).2,Peak Area.2,Conc (mg/L).3,Peak Area.3,Conc (mg/L).4,Peak Area.4
1,C7,25,6568,50,18037,100,27234,500,203623,1000,395678
2,C8,25,7016,50,19403,100,29426,500,225251,1000,428353
3,C9,25,7944,50,22259,100,33552,500,256269,1000,474564
4,C10,25,8581,50,24201,100,37098,500,279590,1000,512577
5,C11,25,9002,50,25586,100,39172,500,298924,1000,534338
6,C12,25,9144,50,26091,100,40434,500,303818,1000,550499
7,C13,25,9713,50,27660,100,43053,500,333426,1000,585422
8,C14,25,12475,50,27049,100,75839,500,320971,1000,588166
9,C15,25,13494,50,28845,100,68495,500,330058,1000,606821
10,C16,25,13904,50,29688,100,72643,500,337531,1000,611575


In [9]:
name[0:name.find('-')] == alc_acid_ID:

SyntaxError: invalid syntax (Temp/ipykernel_43356/262582928.py, line 1)

In [11]:
string = 'FAME1000'
string[0:string.find('-')]

'FAME100'

In [12]:
string.find('-')

-1