In [None]:
%load_ext autoreload

%autoreload 2

In [None]:
from environment import *

from setting import *

In [None]:
# Paired-end FASTQ inputs: one (read 1 path, read 2 path) tuple per sample.
# The pairs are generated from the sample IDs instead of being typed out, so
# adding or removing a sample is a one-token change and the two mates of a
# sample cannot drift apart through a copy-paste typo.
fastq_directory_path = '../data/fastq'

fastq_sample_ids = (
    'NC1',
    'NC2',
    'NC3',
    'NM1',
    'NM2',
    'NM3',
    'WC1',
    'WC2',
    'WC3',
    'WM1',
    'WM2',
    'WM3',
)

# Each file is named <sample ID>_<mate number>.fq.gz inside the FASTQ directory
fastqs = tuple(
    tuple(
        '{}/{}_{}.fq.gz'.format(
            fastq_directory_path,
            fastq_sample_id,
            mate,
        ) for mate in (
            1,
            2,
        )
    ) for fastq_sample_id in fastq_sample_ids
)

In [None]:
# Directory under which every sample gets its own kallisto output sub-directory
kallisto_output_directory_path = '../output/kallisto'

# Presumably creates the directory (and its parents) when missing — confirm against ccal
ccal.establish_path(kallisto_output_directory_path, 'directory')

In [None]:
# Quantify every sample with kallisto and collect the 'tpm' column of each
# sample's abundance.tsv as one column of a single table.
tpms = []

for fastq in fastqs:

    # An entry is either a single FASTQ path (str) or a pair of FASTQ paths;
    # the sample ID is the file name, or the prefix the two file names share.
    if isinstance(fastq, str):

        sample_id = fastq.split(sep='/')[-1]

    else:

        file_names = tuple(fastq[i].split(sep='/')[-1] for i in (0, 1))

        sample_id = os.path.commonprefix(file_names)

    # Remove leading/trailing '_' and '.' characters (e.g. 'NC1_' -> 'NC1')
    sample_id = sample_id.strip('_.')

    print(sample_id)

    sample_output_directory_path = '{}/{}'.format(
        kallisto_output_directory_path,
        sample_id,
    )

    abundance_file_path = '{}/abundance.tsv'.format(sample_output_directory_path)

    # Quantification is skipped when its output file is already on disk
    if not os.path.isfile(abundance_file_path):

        ccal.count_transcripts_using_kallisto_quant(
            fastq,
            '../data/Homo_sapiens.GRCh38.cdna.all.fa.gz',
            sample_output_directory_path,
            n_job=N_JOB,
        )

    abundance = pd.read_table(
        abundance_file_path,
        index_col=0,
    )

    # Label the TPM series with the sample ID so it becomes the column label
    tpm = abundance['tpm'].rename(sample_id)

    tpms.append(tpm)

# One column per sample, aligned on the index read from the abundance files
enst_x_sample = pd.concat(
    tpms,
    axis=1,
)

enst_x_sample.index.name = 'ENST'

enst_x_sample.columns.name = 'Sample'

enst_x_sample

In [None]:
# Optionally relabel the sample columns using a sample-ID-to-name table.
# Assumes the TSV holds the sample IDs in its first column and the names in
# its second column — TODO confirm against the actual file.
sample_id_sample_name_file_path = '../data/sample_id_sample_name.tsv'

if os.path.isfile(sample_id_sample_name_file_path):

    # The `squeeze=True` keyword of the pandas readers was deprecated in
    # pandas 1.4 and removed in 2.0; squeezing the loaded frame afterwards
    # yields the same Series and works on both old and new pandas.
    id_name = pd.read_table(
        sample_id_sample_name_file_path,
        index_col=0,
    ).squeeze('columns')

    # `rename` leaves labels that have no entry in the mapping untouched, so
    # an incomplete file, or running this cell a second time, no longer turns
    # column labels into NaN the way `Index.map(dict)` does.
    enst_x_sample = enst_x_sample.rename(columns=id_name.to_dict())

enst_x_sample

In [None]:
# Re-index the transcript-level table by gene name.
# The annotation table is indexed by its second column, which is looked up
# with the transcript IDs below — presumably it holds ENST IDs; verify.
enst = pd.read_table(
    '../data/enst.tsv',
    index_col=1,
)

enst_to_gene_name = enst['Gene name'].to_dict()

gene_x_sample = enst_x_sample.copy()

# dict.get yields None for transcripts missing from the annotation table
gene_x_sample.index = gene_x_sample.index.map(enst_to_gene_name.get)

gene_x_sample.index.name = 'Gene'

# Keep only the rows whose transcript was given a gene name
has_gene_name = gene_x_sample.index.notna()

gene_x_sample = gene_x_sample.loc[has_gene_name]

gene_x_sample

In [None]:
# Collapse the rows that share a gene name into one row per gene, keeping
# the largest value in each sample column, with genes in sorted order.
gene_x_sample__max = gene_x_sample.groupby(level=0).max().sort_index()

gene_x_sample__max

In [None]:
# Process the gene-level table with ccal, plot the value distribution of
# every column, and save the processed table as a TSV file.
# NOTE(review): the keyword arguments are passed to ccal unchanged; their
# exact meaning (e.g. that log_base is the string '2') is defined by ccal —
# confirm against its documentation.
gene_x_sample__processed = ccal.read_and_process_feature_x_sample(
    gene_x_sample__max,
    nanize=0,
    drop_na_axis=1,
    min_n_not_na_unique_value=1,
    log_base='2',
    plot=False,
)

# One entry per column, taken by position so the order matches `.columns`
n_column = gene_x_sample__processed.shape[1]

columns = tuple(
    gene_x_sample__processed.iloc[:, column_index]
    for column_index in range(n_column)
)

ccal.plot_distributions(
    columns,
    names=gene_x_sample__processed.columns,
    title='Column Value Distribution',
    xaxis_title='Column Value',
)

gene_x_sample__processed.to_csv('../output/gene_x_sample__processed.tsv', sep='\t')

gene_x_sample__processed