In [1]:
import os
import glob

In [2]:
import pandas as pd

In [3]:
data_dir = '/data/hts/2018/foot'

In [4]:
fs = glob.glob(os.path.join(data_dir, '*.tab'))

In [5]:
from functools import reduce

In [6]:
d = {}

for f in fs[0:]:
    path, filename = os.path.split(f)
    name, ext = os.path.splitext(filename)
    sid, method, person, _, lane, _, _ = name.split('_')
    d.setdefault((sid, method, person), []).append(filename)

In [7]:
dfs = []
for key in d:
    dfs_ = []
    for filename in sorted(d[key]):
        df_ = pd.read_table(os.path.join(data_dir, filename), header=None, skiprows=4)
        df_.columns = ['gene', 'a', 'b', 'c']
        df_ = df_.iloc[:, [0,3]]
        dfs_.append(df_)
    sid, method, person = key
    df = reduce(lambda left,right: pd.merge(left, right, on='gene'), dfs_)
    df['count'] = df.sum(axis=1)
    df['sid'] = sid
    df = df[['sid', 'gene', 'count']]
    df = df.pivot('sid', 'gene', 'count')
    df['person'] = person
    df['method'] = method
    df.reset_index(level=0, inplace=True)
    dfs.append(df)
df_all = pd.concat(dfs)

In [8]:
df_all.iloc[:5,:5]

gene,sid,gene0,gene1,gene10,gene100
0,9,0,0,11,186
0,27,0,0,1,72
0,36,1,0,9,192
0,3,0,0,7,168
0,45,3,0,21,162


In [9]:
df_all = df_all[df_all.columns[0:1].tolist() + df_all.columns[-2:] .tolist() + df_all.columns[1:-2].tolist()]

In [10]:
df_all['sid'] = df_all['sid'].astype('int')

In [11]:
df_all = df_all.sort_values(['sid', 'person', 'method'])

In [12]:
df_all['Label'] = df_all['sid'].astype('str') + '_' + df_all['method'] + '_' + df_all['person'] 

In [13]:
meta = pd.read_table('../josh/info/2018_pilot_metadata.tsv')

In [14]:
meta.head()

Unnamed: 0,Label,RNA_sample_num,Media,Strain,Replicate,experiment_person,libprep_person,enrichment_method,RIN,concentration_fold_difference,i7 index,i5 index,i5 primer,i7 primer,library#
0,2_MA_C,2,YPD,H99,2,expA,prepA,MA,10.0,1.34,ATTACTCG,AGGCTATA,i501,i701,1
1,9_MA_C,9,YPD,mar1d,3,expA,prepA,MA,10.0,2.23,ATTACTCG,GCCTCTAT,i502,i701,2
2,10_MA_C,10,YPD,mar1d,4,expA,prepA,MA,9.9,4.37,ATTACTCG,AGGATAGG,i503,i701,3
3,14_MA_C,14,TC,H99,2,expA,prepA,MA,10.0,1.57,ATTACTCG,TCAGAGCC,i504,i701,4
4,15_MA_C,15,TC,H99,3,expA,prepA,MA,9.9,2.85,ATTACTCG,CTTCGCCT,i505,i701,5


In [15]:
df_all = pd.merge(df_all, meta[['Label', 'Media', 'Strain']], on='Label', how='inner')

In [16]:
cols = df_all.columns[:3].tolist() + df_all.columns[-3:].tolist() + df_all.columns[3:-3].tolist()

In [17]:
df_all = df_all[cols]

In [18]:
df_all.to_csv('data/gene_counts.txt', sep='\t', index=False)