In [1]:
    '''
    
    
    Parameters
    ----------
    
    Returns
    -------
    '''

'\n\n\nParameters\n----------\n\nReturns\n-------\n'

In [2]:
'''
Created on Jun 24, 2018

@author: Mark Holton
'''

import sys
import pandas as pd
import numpy as np
import qiime2

from qiime2 import Metadata


In [3]:
def get_user_input_query_lines(user_input_file_of_queries):
    '''
    converts user_input_file_of_queries file into a list of strings that represent the lines of the file

    Parameters
    ----------
    user_input_file_of_queries : string
        file that contains lines of stings  
    
    Returns
    -------
    lines : list 
        list of strings that are the lines of the file
    '''
    
    file = open('./%s.txt'%(user_input_file_of_queries),'r') 
    lines_of_queries = file.readlines()
    return lines_of_queries

In [4]:
def keep_samples(original_MD, keep_query_lines):
    '''
    Filters out unwanted rows based on values in chosen columns.
    
    Parameters
    ----------
    original_MD : Metadata object
        Metadata object with all samples 
    
    keep_query_lines : array of strings
        list of strings that are the lines of the file
        each string is a sqlite query that determines what ids to keep

    Returns
    -------
    original_MD : Metadata object
        original_MD input except that desired exclution has been applied so only the samples that match the input querys
        are kept
    '''
    ids = {}
    for query in keep_query_lines:
        ids = original_MD.get_ids( query )
        original_MD = original_MD.filter_ids(ids)
    return original_MD
    

In [5]:
def get_case_controlDF(afterExclution_MD, query_line_array, case_controlDF):
    '''
    Determines what samples are cases or controls using the queries in query_line_array. The labels of each sample are 
    stored in case_controlDF
    
    Parameters
    ----------
    afterExclution_MD : Metadata object
        Metadata object with unwanted samples filtered out
        
    query_line_array : array of 2 arrays of strings
        the arrays of strings are arrays of queries
        the first array are make of queries to determine controls
        the second array are make of queries to determine cases

        
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexs are the same as the indexs of afterExclution_MD
        all values are Undefined
        
    Returns
    -------
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexs are the same as the indexs of afterExclution_MD  
        values reflect if the index is a case, control, or Undefined
    '''
    afterExclution_MD_full = afterExclution_MD
    case_or_control = 'control'
    for query_lines in query_line_array:
        ids = {}
        afterExclution_MD = afterExclution_MD_full
        for query in query_lines:
            ids = afterExclution_MD.get_ids( query )
            afterExclution_MD = afterExclution_MD.filter_ids(ids)

        #replaces the true values created by the loop above to case or control
        case_controlDF.loc[ids,'case_control'] = case_or_control
        case_or_control = 'case'
    
    return case_controlDF

In [6]:
def merge_case_controlDF_and_afterExclutionMD(afterExclution_MD, case_controlDF):
    '''
    Combines case_controlDF with afterExclution_MD and returns it as a metadata object
    
    Parameters
    ----------
    afterExclution_MD : Metadata object
        Metadata object with unwanted samples filtered out
        
    case_controlDF : dataframe
        dataframe with one column named case_control. The indexs are the same as the indexs of afterExclution_MD  
        values reflect if the index is a case, control, or Undefined
        
    Returns
    -------
    Metadata(returnedMD) : Metadata object
        Metadata object with unwanted samples filtered out and a case_control column that reflects if the index is 
        a case, control, or Undefined    
    '''
    case_controlMD = Metadata( case_controlDF)
    mergedMD = Metadata.merge(afterExclution_MD, case_controlMD)
    returnedMD = mergedMD.to_dataframe()
    returnedMD.index.name = '#SampleID'
    return Metadata(returnedMD)


In [7]:
def filter_prep_for_matchMD(merged_MD, match_condition_lines):
    '''
    filters out samples that do not have valid entries for columns that determine matching
    
    Parameters
    ----------
    merged_MD : Metadata object
        has case_control with correct labels but some samples might not have all matching information
    
    match_condition_lines : array of strings
        contains conditons to match samples on. In this function it is used only to get the columns for matching.

    Returns
    -------
    merged_MD : Metadata object
        Samples that do not have valid entries for columns that determine matching are removed. Everything else is the
        same as merged_MD.
    '''
    for condition in match_condition_lines:
        column = condition.split('\t')[1]
        ids = merged_MD.get_ids(column +" NOT IN ('Unspecified', 'NaN')")
        merged_MD = merged_MD.filter_ids(ids)
    # metadataMD = non_filteredDF.dropna()



    return merged_MD


In [8]:
def match_labeling(match_frame, control_frame, case_index, case_row):
    '''
    Labels control samples with what case sample they match to in the matched to column
    
    Parameters
    ----------
    match_frame : dataframe
        what will have its 'matched to' column changed
    control_frame : dataframe
        contains the samples that match to case_row
    case_index : integer
        index of case_row in match_frame
    case_row : series
        case sample that matches to samples in control_frame

    Returns
    -------
    match_frame
        output is the updated match_frame with 'matched to' column reflecting the case id number it matches to for 
        controls. Cases are labeled 1 in 'matched to' column.
    '''
    for control_index, control_row in control_frame.iterrows():
        match_frame.at[control_index, 'matched to'] = case_index #str(case_row['#SampleID']) 
        #sets the case sample matched to value to 1 in master_frame
        match_frame.at[case_index, 'matched to'] = '1'#best match place holder'
    
    
    return match_frame

In [9]:
def match_samples(prepped_for_match_MD, conditions_for_match_lines):
    '''
    matches case samples to controls and puts the case's id in column matched to on the control sample's row
    
    Parameters
    ----------
    prepped_for_match_MD : Metadata object
        Samples that do not have valid entries for columns that determine matching are removed. Everything else is the
        same as merged_MD.
    
    conditions_for_match_lines : dataframe
        contains information on what conditions must be met to constitue a match
    
    Returns
    -------
    masterDF : dataframe
        masterDF with matches represented by a column called matched to. Values in matched to are sample id of the case
        sample the control sample matches to or 1 if it is a case sample
    
    '''

    matchDF = prepped_for_match_MD.to_dataframe()
    case_for_matchDF = matchDF[matchDF['case_control'].isin(['case'])]
    # creates column to show matches. since it will contain the sample number it was matched too the null value will be 0
    matchDF['matched to'] = '0'

    

    # loops though case samples and matches them to controls
    for index, row in case_for_matchDF.iterrows():
        
        # set matchDF to be only the samples of masterDF that are control samples
        controlDF = matchDF.copy()
        controlDF = controlDF[controlDF['case_control'].isin(['control'])]

        # loop though input columns to determine matches
        for conditions in conditions_for_match_lines:
            
            column_name = conditions.split('\t')[1]
            
            # get the type of data for the given column. This determine how a match is determined
            if conditions.split('\t')[0] == 'range':
                num = conditions.split('\t')[2]
                # filters controls based on if the value in the control is not within a given distance form the case
                controlDF = controlDF[
                                    ( pd.to_numeric(controlDF[column_name]) >= ( int(float(row[column_name])) - int(num) ) ) 
                                    &
                                    ( pd.to_numeric(controlDF[column_name]) <= ( int(float(row[column_name])) + int(num) ) )
                                   ] 
            else:
                # filters controls if the strings for the control and case don't match
                controlDF = controlDF[controlDF[column_name].isin([ row[column_name] ]) ]
            

        # sets the matched to column of masterDF to the case sample ID for the control samples still left in matchDF
        matchDF = match_labeling(matchDF, controlDF, index, row)

    
    return Metadata(matchDF)

In [10]:
#metadata file
file_of_metadata = 'qiime2-sample-metadata.csv' 
user_input_file_name_exclude = 'exclude-Copy1'
user_input_file_name_control = 'control-Copy1'
user_input_file_name_experiment = 'experiment-Copy1'
user_input_file_name_match = 'match-Copy1' 

In [11]:
#each line is a sqlite query to determine what samples to keep
exclude_query_lines_input = get_user_input_query_lines(user_input_file_name_exclude)
#each line is a sqlite query to determine what samples to label control
control_query_lines_input = get_user_input_query_lines(user_input_file_name_control)
#each line is a sqlite query to determine what samples to label case
case_query_lines_input = get_user_input_query_lines(user_input_file_name_experiment)
'''
each line is tab seperated
the first element is the type of match: range or exact
    range matches samples if the numerical values compared are with in some other number of eachother
        this is only to be used with numerical values
    exact matches samples if the values compared are exactly the same
        this can be used for strings and numbers
the second element is the column to compare values of for the case and control samples
the third element is the range number or = (if the match type is exact) 
    this determines how far away a sample can be from another sample for the given column to be matched
'''
match_condition_lines_input = get_user_input_query_lines(user_input_file_name_match)


MD stands for Metadata. Metadata objects will have MD at the end of it. DF stands for dataframe. Dataframe objects end with DF.

In [12]:
#read metadata file into metadata object
originalMD = Metadata.load( file_of_metadata )

originalMD.to_dataframe().iloc[0:5, :]

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,BodySite,Year,Month,Day,Subject,ReportedAntibioticUsage,DaysSinceExperimentStart,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
L1S8,AGCTGACTAGTC,GTGCCAGCMGCCGCGGTAA,gut,2008.0,10.0,28.0,subject-1,Yes,0.0,subject-1.gut.2008-10-28
L1S57,ACACACTATGGC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.gut.2009-1-20
L1S76,ACTACGTGTGGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,2.0,17.0,subject-1,No,112.0,subject-1.gut.2009-2-17
L1S105,AGTGCGATGCGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-1,No,140.0,subject-1.gut.2009-3-17
L2S155,ACGATGCGACCA,GTGCCAGCMGCCGCGGTAA,left palm,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.left-palm.2009-1-20


In [13]:
afterExclutionMD = keep_samples(originalMD, exclude_query_lines_input)
afterExclutionMD.to_dataframe()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,BodySite,Year,Month,Day,Subject,ReportedAntibioticUsage,DaysSinceExperimentStart,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
L1S57,ACACACTATGGC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.gut.2009-1-20
L1S76,ACTACGTGTGGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,2.0,17.0,subject-1,No,112.0,subject-1.gut.2009-2-17
L1S105,AGTGCGATGCGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-1,No,140.0,subject-1.gut.2009-3-17
L1S208,CTGAGATACGCG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-2,No,84.0,subject-2.gut.2009-1-20
L1S257,CCGACTGAGATG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-2,No,140.0,subject-2.gut.2009-3-17
L1S281,CCTCTCGTGATC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,4.0,14.0,subject-2,No,168.0,subject-2.gut.2009-4-14


Lables the samples based on if they are control or not

In [14]:
ids = afterExclutionMD.get_ids()
case_control_Series = pd.Series( ['Unspecified'] * len(ids), ids)
'''
['Unspecified'] * len(ids) creates a list of elements. The list is the 
same length as ids. All the elements are 'Unspecified'
'''
case_control_Series.index.name='#SampleID'
case_controlDF = case_control_Series.to_frame('case_control') 
case_controlDF

Unnamed: 0_level_0,case_control
#SampleID,Unnamed: 1_level_1
L1S76,Unspecified
L1S57,Unspecified
L1S105,Unspecified
L1S208,Unspecified
L1S281,Unspecified
L1S257,Unspecified


In [15]:
case_controlDF = get_case_controlDF(afterExclutionMD, [control_query_lines_input, case_query_lines_input], case_controlDF)
case_controlDF

Unnamed: 0_level_0,case_control
#SampleID,Unnamed: 1_level_1
L1S76,control
L1S57,control
L1S105,control
L1S208,case
L1S281,case
L1S257,case


In [16]:
case_controlMD = merge_case_controlDF_and_afterExclutionMD(afterExclutionMD, case_controlDF)
case_controlMD.to_dataframe()


Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,BodySite,Year,Month,Day,Subject,ReportedAntibioticUsage,DaysSinceExperimentStart,Description,case_control
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L1S57,ACACACTATGGC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.gut.2009-1-20,control
L1S76,ACTACGTGTGGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,2.0,17.0,subject-1,No,112.0,subject-1.gut.2009-2-17,control
L1S105,AGTGCGATGCGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-1,No,140.0,subject-1.gut.2009-3-17,control
L1S208,CTGAGATACGCG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-2,No,84.0,subject-2.gut.2009-1-20,case
L1S257,CCGACTGAGATG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-2,No,140.0,subject-2.gut.2009-3-17,case
L1S281,CCTCTCGTGATC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,4.0,14.0,subject-2,No,168.0,subject-2.gut.2009-4-14,case


Filters out data with match columns left unspecified

In [17]:
prepped_for_matchMD= filter_prep_for_matchMD(case_controlMD, match_condition_lines_input )
prepped_for_matchMD.to_dataframe()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,BodySite,Year,Month,Day,Subject,ReportedAntibioticUsage,DaysSinceExperimentStart,Description,case_control
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
L1S57,ACACACTATGGC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.gut.2009-1-20,control
L1S76,ACTACGTGTGGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,2.0,17.0,subject-1,No,112.0,subject-1.gut.2009-2-17,control
L1S105,AGTGCGATGCGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-1,No,140.0,subject-1.gut.2009-3-17,control
L1S208,CTGAGATACGCG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-2,No,84.0,subject-2.gut.2009-1-20,case
L1S257,CCGACTGAGATG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-2,No,140.0,subject-2.gut.2009-3-17,case
L1S281,CCTCTCGTGATC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,4.0,14.0,subject-2,No,168.0,subject-2.gut.2009-4-14,case


Matches samples labled experiment to control samples

In [18]:
matchedMD = match_samples( prepped_for_matchMD, match_condition_lines_input )
matchedDF = matchedMD.to_dataframe()
matchedDF

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,BodySite,Year,Month,Day,Subject,ReportedAntibioticUsage,DaysSinceExperimentStart,Description,case_control,matched to
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
L1S57,ACACACTATGGC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-1,No,84.0,subject-1.gut.2009-1-20,control,L1S257
L1S76,ACTACGTGTGGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,2.0,17.0,subject-1,No,112.0,subject-1.gut.2009-2-17,control,L1S281
L1S105,AGTGCGATGCGT,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-1,No,140.0,subject-1.gut.2009-3-17,control,L1S281
L1S208,CTGAGATACGCG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,1.0,20.0,subject-2,No,84.0,subject-2.gut.2009-1-20,case,1
L1S257,CCGACTGAGATG,GTGCCAGCMGCCGCGGTAA,gut,2009.0,3.0,17.0,subject-2,No,140.0,subject-2.gut.2009-3-17,case,1
L1S281,CCTCTCGTGATC,GTGCCAGCMGCCGCGGTAA,gut,2009.0,4.0,14.0,subject-2,No,168.0,subject-2.gut.2009-4-14,case,1


In [19]:
matchedDF['matched to'].unique()

array(['L1S257', 'L1S281', '1'], dtype=object)