In [2]:
import pandas as pd
import string

In [3]:
def parse_orig_metadata(metadata_path='../data/barcodes_BMC_orig_ds1.xlsx',
                        outdir='../data/metadata_and_manifest/Blactam_metadata_ds1.tsv',
                        save=True):
    """
    reshape the original metadata spreadsheet provided by BioMicro Center
    (https://openwetware.org/wiki/BioMicroCenter) into a table compatible
    with the qiime2 metadata format.
    the table can be exported as a tsv, which can then be uploaded to
    Google Sheets for additional validation via Keemei (https://keemei.qiime2.org/)

    parameters
        metadata_path: string
            path to original metadata file (read with pd.read_excel)
        outdir: string
            directory and filename in which to output metadata file
        save: boolean
            indicate whether to save tsv file to specified directory
    returns:
        metadata_df: pandas dataframe
            dataframe containing metadata in qiime2-compatible format;
            its first row is the '#q2:types' row, followed by one row per sample
    """

    # keep only index labels 0 through 179, i.e. the 180 samples
    source_rows = pd.read_excel(metadata_path).loc[0:179, :]

    # output columns, in the order they appear in the table
    column_names = (
        'sample-id',
        'barcode-sequence',
        'treatment-group',
        'time-point-day',
        'animal-number',
        'treatment-group-time-mouse',
        'time-treatment-group-mouse',
        'treatment-group-time',
    )

    # leading row that specifies variable types: every column is categorical
    types_row = dict.fromkeys(column_names, 'categorical')
    types_row['sample-id'] = '#q2:types'
    records = [types_row]

    # letter prefixes for treatment days and animal numbers -- these help organize taxa barplots
    # 'J', 'I', ..., 'A' (first ten capital letters, descending)
    descending_letters = list(string.ascii_uppercase[:10][::-1])

    # zip stops at the shorter sequence, so only 11 of these 13 letters are used:
    # 0->J, 1->I, 2->H, 3->G, 5->F, 7->E, 9->D, 12->C, 'L1'->B, 'L2'->A, 'L3'->J
    # NOTE(review): 'L3' shares the letter 'J' with day 0, and the trailing 'I', 'H' go unused -- confirm intended
    day_letters = descending_letters + ['J', 'I', 'H']
    treatment_days_to_letter_dict = dict(
        zip([0, 1, 2, 3, 5, 7, 9, 12, 'L1', 'L2', 'L3'], day_letters)
    )

    # animals 1-10 map to J..A, and animals 11-20 reuse the same ten letters
    animal_numbers_to_letter_dict = {
        number: descending_letters[(number - 1) % 10] for number in range(1, 21)
    }

    # characters stripped from the sample label before reading the barcode
    drop_parentheses = {ord('('): None, ord(')'): None}

    for _, row in source_rows.iterrows():

        # first column holds the sample-id and barcode after a ':';
        # underscores become dashes and non-breaking spaces are removed
        cleaned_label = row['Unnamed: 0'].replace('_', '-').replace('\xa0', '')
        after_colon = cleaned_label.split(':')[1]
        sample_id = after_colon.split(' ')[1]
        sample_barcode = after_colon.translate(drop_parentheses).split(' ')[-1]

        # remaining columns are used as-is
        treatment_group = row['Treatment Group']
        time_point_day = row['Time Point (Day)']
        animal_number = row['Animal Number']

        # letter-prefixed tags, e.g. 'Jday0' and 'Janimal1'
        day_tag = treatment_days_to_letter_dict[time_point_day] + 'day' + str(time_point_day)
        animal_tag = animal_numbers_to_letter_dict[animal_number] + 'animal' + str(animal_number)

        treatment_group_time_mouse = '-'.join((treatment_group, day_tag, animal_tag))
        time_treatment_group_mouse = '-'.join((day_tag, treatment_group, animal_tag))
        treatment_group_time = '-'.join((treatment_group, 'day' + str(time_point_day)))

        records.append(dict(zip(column_names, (
            sample_id,
            sample_barcode,
            treatment_group,
            time_point_day,
            animal_number,
            treatment_group_time_mouse,
            time_treatment_group_mouse,
            treatment_group_time,
        ))))

    metadata_df = pd.DataFrame(records)

    if save:
        metadata_df.to_csv(outdir, sep='\t', index=False)
    return metadata_df
    

In [4]:
def create_manifest_file(metadata_df, 
                         outdir='../data/metadata_and_manifest/Blactam_manifest_ds1.tsv', 
                         save=True):
    
    """
    create a manifest file detailing the location of all sequencing reads.
    the manifest file will be exported as a tsv file
    
    sample-ids are paired in order with sample numbers 001 through 180, so at
    most 180 samples are listed; any further rows of metadata_df are ignored.
    
    parameters
        metadata_df: pandas dataframe
            metadata dataframe that contains sample-ids in its first column.
            its first row is skipped (parse_orig_metadata stores the
            '#q2:types' row there)
        outdir: string
            directory and filename in which to output manifest file
        save: boolean
            indicate whether to save tsv file to specified directory
    returns
        manifest_df: pandas dataframe
            manifest file detailing the directory path of sequencing data,
            with columns 'sample-id', 'forward-absolute-filepath' and
            'reverse-absolute-filepath'. if metadata_df has no sample rows
            the dataframe is empty but still carries these columns
        
    """
    
    manifest_columns = ['sample-id', 'forward-absolute-filepath', 'reverse-absolute-filepath']
    
    # path where sequencing data will be located 
    seq_path_prefix = '$PWD/../data/Blactam_16s_data/210616ChiA/'
    
    # skip the first row, then read sample-ids from the first column
    sample_ids = list(metadata_df.iloc[1:, 0])
    manifest_file_list = []

    for sample_number, sample_id in zip(range(1, 181), sample_ids):

        # zero-padded to three digits, e.g. 'D21-217001'
        samp_num_full = 'D21-217' + '%.3d' % sample_number
        
        # data run on two separate lanes, so need to consider differing file suffix
        # sample 001 through 090 were run on 5164T
        # sample 091 through 180 were run on 5163T
        if sample_number <= 90:
            lane_code = '-5164T/'
        else:
            lane_code = '-5163T/'
        
        # forward and reverse reads differ only in the trailing 1_/2_ part of the filename
        read_path_stem = seq_path_prefix + samp_num_full + lane_code + '210616ChiA_' + samp_num_full + '_'

        manifest_file_list.append({
            'sample-id': sample_id,
            'forward-absolute-filepath': read_path_stem + '1_sequence.fastq',
            'reverse-absolute-filepath': read_path_stem + '2_sequence.fastq'
        })

    # build the dataframe once, after the loop; explicit columns keep the
    # header intact even when no samples were found
    manifest_df = pd.DataFrame(manifest_file_list, columns=manifest_columns)

    if save:
        manifest_df.to_csv(outdir, sep='\t', index=False)
    return manifest_df


In [5]:
# original spreadsheet to be parsed into the metadata table
metadata_path = '../data/barcodes_BMC_orig_ds1.xlsx'

# build and save the metadata table
# the saved file will be updated manually (e.g., create a v2 and v3) as needed
metadata_df = parse_orig_metadata(
    metadata_path=metadata_path,
    outdir='../data/metadata_and_manifest/Blactam_metadata_ds1.tsv',
    save=True,
)

# build and save the manifest, which takes its sample-ids from metadata_df
manifest_df = create_manifest_file(
    metadata_df=metadata_df,
    outdir='../data/metadata_and_manifest/Blactam_manifest_ds1.tsv',
    save=True,
)
