In [None]:
import os
from collections import Counter

import pandas as pd

In [None]:
# Path to the reference transposon sequence FASTA, obtained from
# https://github.com/bergmanlab/drosophila-transposons/blob/master/current/D_mel_transposon_sequence_set.fa
teTELR = '/D_mel_transposon_sequence_set.fa'

def load_ref_te(teTELR):
    """Parse the reference TE FASTA into several lookup tables.

    Each record id is split as ``subfamily#order/superfamily``; an id
    without the ``#`` or ``/`` separator raises ``IndexError``.

    Parameters
    ----------
    teTELR : str
        Path passed to ``SeqIO.parse`` and read as FASTA.

    Returns
    -------
    tuple
        ``(tes, tes_by_subfamily, ref_te_len, subfamily_order)``:

        * ``tes`` -- nested dict ``order -> superfamily -> subfamily -> sequence``
        * ``tes_by_subfamily`` -- ``subfamily -> sequence``
        * ``ref_te_len`` -- ``subfamily -> sequence length``
        * ``subfamily_order`` -- ``subfamily -> order``
    """
    # NOTE(review): SeqIO is not imported in the visible cells; presumably
    # `from Bio import SeqIO` is executed elsewhere -- confirm.
    nested = {}
    sequence_of = {}
    order_of = {}
    for record in SeqIO.parse(teTELR, 'fasta'):
        id_parts = record.id.split('#')
        subfamily = id_parts[0]
        classification = id_parts[1].split('/')
        order = classification[0]
        superfamily = classification[1]

        nested.setdefault(order, {}).setdefault(superfamily, {})[subfamily] = record.seq
        sequence_of[subfamily] = record.seq
        order_of[subfamily] = order

    # Walk the nested table (rather than the flat one) so the length table
    # is keyed in order -> superfamily -> subfamily traversal order.
    lengths = {
        subfamily: len(str(sequence))
        for superfamilies in nested.values()
        for members in superfamilies.values()
        for subfamily, sequence in members.items()
    }

    return nested, sequence_of, lengths, order_of

# Build the reference TE lookup tables once for the rest of the notebook;
# ref_te_len (subfamily -> length) feeds the normalization step below.
tes, tes_by_subfamily, ref_te_len, subfamily_to_order = load_ref_te(teTELR)

In [None]:
# Every sample analysed in this notebook. The names encode an age
# (5d / 25d / 50d), a tissue (guts / heads) and a replicate label.
samples = [
    'PGFP_5d_guts_P2', 'PGFP_5d_heads_P2',
    'PGFP_5d_guts_P3', 'PGFP_5d_heads_P3',
    'PGFP_25d_guts_P1', 'PGFP_25d_guts_P2',
    'PGFP_25d_heads_P1', 'PGFP_25d_heads_P2',
    'PGFP_50d_guts_P1', 'PGFP_50d_heads_P1',
    'PGFP_50d_guts_P2', 'PGFP_50d_heads_P2',
]

# Tissue-specific subsets, kept in the same relative order as `samples`.
samples_heads = [name for name in samples if '_heads_' in name]

samples_guts = [name for name in samples if '_guts_' in name]

In [None]:
# Load the tab-separated final table, using its first column as the index.
# `sep` must be given only once: repeating a keyword argument is a
# SyntaxError, so the cell could not run at all.
result = pd.read_csv('~/final_table.csv', sep='\t', index_col=0)

In [None]:
# Count 'Singleton' rows per subfamily and sample. Each view (all samples,
# guts only, heads only) is stored twice: as a dict of per-sample count
# lists keyed by subfamily, and as long-format [subfamily, sample, count] rows.
# NOTE(review): `singletons` is not defined in the visible cells -- confirm
# it is created before this cell runs.
tmp = result[result['Genotype'] == 'Singleton']

counts, counts_guts, counts_heads = {}, {}, {}
countslong, counts_guts_long, counts_heads_long = [], [], []

# (per-subfamily table, long-format rows, sample names) for each view
views = (
    (counts, countslong, samples),
    (counts_guts, counts_guts_long, samples_guts),
    (counts_heads, counts_heads_long, samples_heads),
)

for subfamily in singletons:
    in_subfamily = tmp['Subfamily'] == subfamily
    for per_subfamily, long_rows, sample_names in views:
        per_subfamily[subfamily] = []
        for sample in sample_names:
            # A row is attributed to a sample when its Sample field
            # contains the sample name.
            length = len(tmp[in_subfamily & tmp.Sample.str.contains(sample)])
            per_subfamily[subfamily].append(length)
            long_rows.append([subfamily, sample, length])

# Wide table: one row per subfamily, one column per sample.
countsdf = pd.DataFrame.from_dict(counts, orient='index')
countsdf.columns = samples

In [None]:
# Long-format gut counts as a DataFrame. The column names are passed to the
# constructor so that an empty row list still produces a correctly labelled
# (empty) frame; assigning `.columns` afterwards raises a length-mismatch
# ValueError in that case.
counts_guts_long_df = pd.DataFrame(counts_guts_long, columns=['Subfamily', 'Sample', 'Count'])

In [None]:
# Write the raw per-sample gut counts of selected subfamilies, one
# tab-separated file per subfamily.
subfamilies = ['rover', 'copia', 'springer', 'I-element', 'Bari1']
tissue = 'guts'
# Gut samples listed by age; not used within this cell.
order = [
    'PGFP_5d_guts_P2', 'PGFP_5d_guts_P3',
    'PGFP_25d_guts_P1', 'PGFP_25d_guts_P2',
    'PGFP_50d_guts_P1', 'PGFP_50d_guts_P2',
]

for subfamily in subfamilies:
    is_subfamily = counts_guts_long_df['Subfamily'] == subfamily
    # Renumber the rows from 0 so every exported file has a clean index.
    df = counts_guts_long_df[is_subfamily].reset_index(drop=True)
    out_path = '~/' + subfamily + '_' + tissue + '_raw_counts.csv'
    df.to_csv(out_path, sep='\t')

In [None]:
def write_read_cout_per_bin_from_awk(awk_file, outfile):
    """Histogram read lengths into 1000-base bins and write the table.

    Parameters
    ----------
    awk_file : str
        Tab-separated input file; the second field of every line is parsed
        as an integer read length.
    outfile : str
        Output file; receives one ``bin<TAB>count`` line per occupied bin in
        ascending bin order, where ``bin = read_length // 1000``.

    Notes
    -----
    * A leading ``~`` in either path is expanded, because the builtin
      ``open`` does not do so and the notebook passes ``~/...`` paths.
    * Blank lines (e.g. a trailing newline) are skipped instead of raising
      ``IndexError``. Other malformed lines still raise.
    * The output file is opened only after the input was read completely,
      so a parse error no longer leaves an empty output file behind.
    """
    awk_file = os.path.expanduser(awk_file)
    outfile = os.path.expanduser(outfile)

    reads_per_bin = Counter()
    with open(awk_file) as indata:
        for record in indata:
            if not record.strip():
                continue
            reads_per_bin[int(record.split('\t')[1]) // 1000] += 1

    with open(outfile, 'w') as outdata:
        for bn in sorted(reads_per_bin):
            outdata.write('\t'.join([str(bn), str(reads_per_bin[bn])]) + '\n')

In [None]:
# Bin the read lengths of every sample and load the binned tables back.
#   read_length_data:      sample -> {bin -> number of reads}
#   read_length_data_long: [sample, bin, count] rows
read_length_data = {}
read_length_data_long = []

for sample in samples:
    # The builtin open() does not expand "~", so the home directory is
    # resolved explicitly before the paths are used.
    prefix = os.path.expanduser('~/' + sample + '.porechop.sorted.flt.primary.mapped')
    awk_file = prefix + '.awk.readlength'  # line format: read_id \t read_length
    outfile = prefix + '.readlength'
    write_read_cout_per_bin_from_awk(awk_file, outfile)
    read_length_data[sample] = {}
    with open(outfile) as indata:
        for l in indata:
            ln = l.rstrip().split('\t')
            bn = int(ln[0])
            count = int(ln[1])
            read_length_data[sample][bn] = count
            read_length_data_long.append([sample, bn, count])

In [None]:
def normalize_cpm(counts_long, ref_te_length, read_length_data, subfamilies=None):
    """Normalize raw counts by the number of sufficiently long reads.

    For every ``[subfamily, sample, count]`` entry the count is multiplied by
    1000 and divided by the number of reads of that sample whose length bin
    is at least ``(TE length * 3.5) // 1000``.

    Parameters
    ----------
    counts_long : iterable
        Entries ``[subfamily, sample, count]``.
    ref_te_length : dict
        ``subfamily -> reference length``. This argument is now actually
        used; previously the function silently read the notebook-level
        ``ref_te_len`` instead.
    read_length_data : dict
        ``sample -> {length bin -> number of reads}``.
    subfamilies : container, optional
        Subfamilies to keep; other entries are dropped. Defaults to the
        notebook-level ``somatic`` collection, as before.

    Returns
    -------
    list
        Rows ``[subfamily, sample, count, normalized_count]``.
        ``normalized_count`` is NaN when the sample has no read in a
        qualifying bin (this used to raise ``ZeroDivisionError``).
    """
    if subfamilies is None:
        # Defined outside this cell; resolved only when no explicit filter
        # is passed, which keeps existing three-argument calls working.
        subfamilies = somatic

    counts_long_norm = []
    for entry in counts_long:
        subfamily = entry[0]
        if subfamily not in subfamilies:
            continue
        sample = entry[1]
        count = int(entry[2])
        # Smallest read-length bin that still counts towards the denominator.
        te_length_bin = (ref_te_length[subfamily] * 3.5) // 1000
        sum_counts = sum(
            n_reads
            for bn, n_reads in read_length_data[sample].items()
            if bn >= te_length_bin
        )
        if sum_counts:
            normalized_count = count * 1000 / sum_counts
        else:
            normalized_count = float('nan')
        counts_long_norm.append([subfamily, sample, count, normalized_count])
    return counts_long_norm

In [None]:
# Normalize the gut counts and export them indexed by sample.
counts_guts_long_norm = normalize_cpm(counts_guts_long, ref_te_len, read_length_data)
# Column names are passed to the constructor so that an empty result still
# yields a labelled frame instead of a length-mismatch ValueError, and
# set_index is chained rather than applied in place.
counts_guts_long_norm_df = pd.DataFrame(
    counts_guts_long_norm,
    columns=['Subfamily', 'Sample', 'Count', 'Normalized_count'],
).set_index('Sample')

counts_guts_long_norm_df.to_csv('~/counts_guts_raw_norm.csv', sep='\t')