In [11]:
from collections import Counter
import numpy as np
import pandas as pd

In [12]:
def _read_sample_tissues(meta_path):
    """Return the tissue label (second tab-separated field) of every sample line
    in a donor metadata file, in file order, after skipping its 4 header lines."""
    with open(meta_path) as metafile:
        # skip header lines
        for _ in range(4):
            next(metafile)
        # strip the line terminator first so a label in the last column stays clean
        return [line.rstrip('\r\n').split('\t')[1] for line in metafile]


def tissue_average_dataframe(spec_donors=[],reject_donors=[],donor_sex=None,donor_age=[],spec_tissues=[],reject_tissues=[],
                       top_tissues=None,min_tissue_count=None,output_name=None,
                       data_dir='../data',n_rows=10000):
    """Build a DataFrame with one row per transcript and one column per tissue,
    where each column is the average of all selected sample columns of that tissue.

    Most of the code is copied from functions in PCA_and_plot, so look there for
    details. As there, one can select which donors, sexes, ages, and tissues are
    included.

    Parameters
    ----------
    spec_donors : list of str
        Donor IDs to use. If empty, the IDs are read from
        ``<data_dir>/donors_list.txt`` (one per line).
    reject_donors : list of str
        Donor IDs to drop. Only applied when ``spec_donors`` is empty.
    donor_sex : str or None
        If set, keep only donors whose sex field in ``donor_info.txt`` equals it.
    donor_age : list of str
        If non-empty, keep only donors whose age field, cut at the first '-',
        is in this list (presumably the lower bound of an age range such as
        '60-69' -- confirm against the data).
    spec_tissues : list of str
        If non-empty, only these tissues are considered.
    reject_tissues : list of str
        Tissues to exclude. Only applied when ``spec_tissues`` is empty.
    top_tissues : int or None
        Keep the N most frequently sampled tissues (None keeps all).
        Ignored when ``min_tissue_count`` is set.
    min_tissue_count : int or None
        If set, keep tissues with at least this many samples.
    output_name : str or None
        If set, the frame is also written there as a tab-separated file.
    data_dir : str
        Directory holding ``donors_list.txt``, ``donor_info.txt``,
        ``Donor_Metadata_Enhanced/`` and ``donor_matrices_fixed/``.
    n_rows : int
        Number of rows (transcripts) in the result. Donor files with fewer
        lines leave the remaining rows without a contribution.

    Returns
    -------
    pandas.DataFrame
        Shape ``(n_rows, number of selected tissues)``, columns named by tissue.

    Raises
    ------
    IndexError
        If a donor matrix file has more than ``n_rows`` lines.

    Notes
    -----
    Donors that have no line in ``donor_info.txt`` are not filtered by sex or age.
    The list-valued defaults are only read, never mutated.
    """

    # DONORS BY ID
    if spec_donors:
        donor_list = list(spec_donors)
    else:
        with open(data_dir + '/donors_list.txt') as all_donors:
            # strip the line terminator rather than slicing off the last character,
            # so a final line without a trailing newline keeps its full ID
            listed = [line.rstrip('\r\n') for line in all_donors]
        donor_list = [ID for ID in listed if ID and ID not in reject_donors]
    # donors still in the running after the sex/age filters below
    selected = set(donor_list)

    # DONORS BY AGE AND SEX
    with open(data_dir + '/donor_info.txt') as donor_info:
        for line in donor_info:
            # info is [ID,sex,age,death]
            info = line.rstrip('\r\n').split('\t')
            ID = info[0]
            # also skips donors already rejected by an earlier line
            if ID not in selected:
                continue
            # check sex
            if donor_sex and info[1] != donor_sex:
                selected.discard(ID)
                continue
            # check age
            age = info[2].split('-')[0]
            if donor_age and age not in donor_age:
                selected.discard(ID)

    # surviving donors, de-duplicated, in their original order
    donors = []
    seen = set()
    for ID in donor_list:
        if ID in selected and ID not in seen:
            seen.add(ID)
            donors.append(ID)

    # TISSUES BY TYPE AND SAMPLE COUNT
    tissues = Counter()
    # tissue label of every sample column, per donor (kept so each file is read once)
    sample_tissues = {}
    for ID in donors:
        labels = _read_sample_tissues(data_dir + '/Donor_Metadata_Enhanced/donor_meta_' + ID + '.txt')
        sample_tissues[ID] = labels
        for tissue in labels:
            if spec_tissues:
                wanted = tissue in spec_tissues
            else:
                wanted = tissue not in reject_tissues
            if wanted:
                tissues[tissue] += 1
    if min_tissue_count:
        # pick tissues with minimum number of samples
        tissue_list = [key for key,value in tissues.items() if value >= min_tissue_count]
    else:
        # pick top most commonly sampled tissues
        tissue_list = [key for key,value in tissues.most_common(top_tissues)]
    tissue_index = dict((tissue,ind) for ind,tissue in enumerate(tissue_list))

    # CONSTRUCT MATRIX
    avg_matrix = np.zeros((n_rows,len(tissue_list)))

    for ID in donors:
        # file column indices of the relevant samples, and the output column each feeds
        columns = []
        targets = []
        for column,tissue in enumerate(sample_tissues[ID]):
            if tissue in tissue_index:
                columns.append(column)
                targets.append(tissue_index[tissue])
        if not columns:
            # this donor contributes no sample to any selected tissue
            continue
        # get data: from each line, take the desired sample columns
        with open(data_dir + '/donor_matrices_fixed/donor_' + ID + '.txt') as donorfile:
            rows = [[float(fields[ind]) for ind in columns]
                    for fields in (line.split('\t') for line in donorfile)]
        if len(rows) > n_rows:
            raise IndexError('donor ' + ID + ' has ' + str(len(rows)) +
                             ' rows, more than n_rows=' + str(n_rows))
        if rows:
            donor_values = np.array(rows,dtype=float)
            # Add one sample column at a time: a donor may have several samples of the
            # same tissue, and a single fancy-indexed assignment with repeated target
            # columns would keep only the last of them.
            for sample,target in enumerate(targets):
                avg_matrix[:len(rows),target] += donor_values[:,sample]

    # divide by number of samples to get average
    counts = np.array([tissues[tissue] for tissue in tissue_list],dtype=float)
    avg_matrix = avg_matrix/counts

    df = pd.DataFrame(avg_matrix,columns=tissue_list)
    # save to file
    if output_name:
        df.to_csv(output_name,sep='\t')
        print('Dataframe saved as ' + output_name)
    return df

In [25]:
# Per-transcript average over the 'Whole Blood' samples only. spec_donors is left
# empty, so the donor IDs are read from the donors list file under ../data.
df = tissue_average_dataframe(spec_tissues=['Whole Blood'])

In [26]:
# Report the frame's dimensions (rows, columns) instead of printing every row.
# The parenthesised form runs on both Python 2 and Python 3.
print(df.shape)

(10000, 1)