In [None]:
%run ../config/init.py
import pysam

In [None]:
# Load the sample table. The CSV is read without a header row, so its columns
# are addressed by position; later cells take the sample names from column 0.
sra_df = pandas.read_csv(os.path.join(DATA, DATASET, 'sample_table.csv'), header=None)
# Bare last expression so the notebook renders the table.
sra_df

### Changing FASTA IDs to use the new GenBank IDs

From the TSA submission, download the accession list into the submission directory.
Then set the `ACCESSION_FILE` variable below to the name of that file.

In [None]:
# File name of the accession list downloaded from the TSA submission.
ACCESSION_FILE = 'GISG03_accs'
# The accession list is expected inside <RESULTS>/<DATASET>/submission.
submission_dir = os.path.join(RESULTS, DATASET, 'submission')
# Tab-separated, no header row. The quantification cell matches FASTA record ids
# against column 1 and takes the corresponding name from column 0.
accs = pandas.read_csv(os.path.join(submission_dir, ACCESSION_FILE), sep='\t', header=None)
accs.head()

In [None]:
bucket_list = !gsutil ls
bucket = None

prefix = 'gs://{}-align-'.format(DATASET.lower())
for l in bucket_list:
    if prefix in l:
        bucket = l.replace('gs://{}-align-'.format(DATASET.lower()),'').replace('/','')
        break

inbucket = '{}-align-{}'.format(DATASET.lower(), bucket)
print('bucket: {}'.format(inbucket))

In [None]:
# Directory holding the alignment results for this dataset.
result_dir = os.path.join(RESULTS, DATASET, 'alignments')
# Make it the working directory: the cells below use paths relative to it
# (BAM files and the Trinity_Genes_*.tsv tables).
os.chdir(result_dir)

### Quantifying Trinity genes for all samples.

In [None]:
def worker(sample):
    file = 'transcriptome-align-{0}/{1}/{1}_clean_noCont_sorted.bam'.format(bucket, sample)
    reads = {}
    with pysam.AlignmentFile(file, "rb") as samfile, open(os.path.join(result_dir, 'fasta_genbank_ids', 'transcriptome.fsa'), "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            gene_name = accs[accs[1] == record.id][0].iloc[0]
            gene_name = '_'.join(gene_name.split('_')[:4])
            iter = samfile.fetch(record.id)
            for x in iter:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
                if x.is_proper_pair:
                    v = reads.setdefault(x.qname, set())
                    v.add(gene_name)
{% else %}
                v = reads.setdefault(x.qname, set())
                v.add(gene_name)
{% endif %}

    genes = {}
    for r in reads:
        if len(reads[r]) == 1:
            gene = genes.setdefault(list(reads[r])[0], 0)
            genes[list(reads[r])[0]] += 1
    print('\n{}: Genes with unique reads {}'.format(sample, len(genes)))
    return (sample, genes, reads)

# Build the per-gene read-count and TPM tables once and cache them as TSV in
# the working directory. The else branch reads both files, so recompute unless
# both are present (previously only the reads table was checked).
if not (os.path.exists('Trinity_Genes_reads.tsv') and os.path.exists('Trinity_Genes_TPM.tsv')):
    # One worker call per sample. The pool is closed and joined afterwards so
    # its processes do not outlive this cell.
    # NOTE(review): assumes `Pool` is multiprocessing.Pool (imported by init.py).
    p = Pool(processes=12)
    try:
        data = p.map(worker, sra_df[0].unique())
    finally:
        p.close()
        p.join()

    # id -> name lookup built once; the first row wins for a repeated id,
    # matching the former `accs[accs[1] == id][0].iloc[0]`.
    acc_to_name = {}
    for name, acc in zip(accs[0], accs[1]):
        acc_to_name.setdefault(acc, name)

    # Gene length = length of the longest FASTA record mapped to that gene.
    trans = {'len': {}}
    with open(os.path.join(result_dir, 'fasta_genbank_ids', 'transcriptome.fsa'), "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            gene_name = '_'.join(acc_to_name[record.id].split('_')[:4])
            trans['len'][gene_name] = max(trans['len'].get(gene_name, 0), len(record.seq))

    # One row per gene: 'len' plus one read-count column per sample.
    # Genes without uniquely assigned reads in a sample get 0.
    df_reads = pandas.DataFrame.from_dict(trans, orient='columns')
    for d in data:
        dd = pandas.DataFrame.from_dict(d[1], orient='index')
        dd = dd.rename(columns={0: d[0]})
        df_reads = df_reads.join(dd, how='outer')
    df_reads = df_reads.fillna(0)

    df_reads.to_csv('Trinity_Genes_reads.tsv', index_label='transcript', sep='\t')

    # TPM: length-normalised counts rescaled so each sample sums to one million.
    df = df_reads.copy()

    for s in sra_df[0].unique():
        # df['len'] rather than df.len: attribute access breaks if pandas
        # ever grows an attribute with that name.
        sum_const = sum(df[s] / df['len'])
        df[s] = (df[s] * 1000000) / (df['len'] * sum_const)

    df.to_csv('Trinity_Genes_TPM.tsv', index_label='transcript', sep='\t')
else:
    # NOTE(review): loaded this way 'transcript' is a regular column, whereas
    # the freshly computed frames above carry it as the index -- confirm that
    # downstream cells do not depend on either layout.
    df_reads = pandas.read_csv('Trinity_Genes_reads.tsv', sep='\t')
    df = pandas.read_csv('Trinity_Genes_TPM.tsv', sep='\t')

In [None]:
# Reload the cached read-count table from disk. No index column is requested,
# so 'transcript' comes back as an ordinary column.
df = pandas.read_csv('Trinity_Genes_reads.tsv', sep='\t')
df.head(20)

In [None]:
# Reshape to long format: one [read count, sample] row per table row and
# sample, then draw one horizontal box per sample.
toPlot = []
for s in sra_df[0].unique():
    toPlot.extend([r, s] for r in df[s])
d = pandas.DataFrame(data=toPlot, columns=["Reads", 'Sample'])
ax = sns.boxplot(data=d, x="Reads", y='Sample', orient="h", palette="Set2")

In [None]:
# Reload the cached TPM table from disk. No index column is requested,
# so 'transcript' comes back as an ordinary column.
df = pandas.read_csv('Trinity_Genes_TPM.tsv', sep='\t')
df.head(20)

In [None]:
# Reshape to long format: one [TPM, sample] row per table row and sample,
# then draw one horizontal box per sample.
toPlot = []
for s in sra_df[0].unique():
    toPlot.extend([r, s] for r in df[s])
d = pandas.DataFrame(data=toPlot, columns=["TPM", 'Sample'])
ax = sns.boxplot(data=d, x="TPM", y='Sample', orient="h", palette="Set2")
