In [None]:
%load_ext autoreload

%autoreload 2

In [None]:
from environment import *

from setting import *

In [None]:
# Root directory under which the kallisto results are written.
kallisto_output_directory_path = '../output/kallisto'

# NOTE(review): presumably creates the directory (and parents) when it is
# missing; confirm against ccal.establish_path.
ccal.establish_path(kallisto_output_directory_path, 'directory')

In [None]:
# Run kallisto on every FASTQ input (skipping inputs whose abundance.tsv
# already exists) and gather the 'tpm' column of each result into one table
# with transcripts as rows and samples as columns.
tpms = []

# Sample IDs seen so far; used to reject collisions (see below).
seen_sample_ids = set()

for fastq_input in FASTQ_INPUTS:

    # A str input is treated as a single path; for anything else the first
    # two items are treated as paths and their shared file-name prefix is
    # used as the sample ID.
    if isinstance(fastq_input, str):

        sample_id = os.path.basename(fastq_input)

    else:

        sample_id = os.path.commonprefix((
            os.path.basename(fastq_input[0]),
            os.path.basename(fastq_input[1]),
        ))

    # Drop leading/trailing separator characters left over from the prefix.
    sample_id = sample_id.strip('_.')

    # commonprefix compares character by character and can return '' when
    # the two file names share nothing; an empty ID would point the output
    # at the root kallisto directory itself.
    if not sample_id:

        raise ValueError(
            'Could not derive a sample ID from {!r}.'.format(fastq_input)
        )

    # Two inputs with the same ID would share one output directory, so the
    # second would silently reuse the first one's abundance.tsv.
    if sample_id in seen_sample_ids:

        raise ValueError(
            'Sample ID {!r} (derived from {!r}) is already used by another '
            'FASTQ input.'.format(
                sample_id,
                fastq_input,
            )
        )

    seen_sample_ids.add(sample_id)

    print(sample_id)

    sample_output_directory_path = '{}/{}'.format(
        kallisto_output_directory_path,
        sample_id,
    )

    abundance_file_path = '{}/abundance.tsv'.format(
        sample_output_directory_path,
    )

    # The existing abundance file acts as a cache: quantification only runs
    # when it is missing.
    if not os.path.isfile(abundance_file_path):

        # NOTE(review): second argument is presumably the transcriptome
        # FASTA kallisto indexes against; confirm in ccal.
        ccal.count_transcripts_using_kallisto_quant(
            fastq_input,
            '../data/Homo_sapiens.GRCh38.cdna.all.fa.gz',
            sample_output_directory_path,
            n_job=N_JOB,
        )

    tpm = pd.read_table(
        abundance_file_path,
        index_col=0,
    )['tpm']

    # The series name becomes the column label after concatenation.
    tpm.name = sample_id

    tpms.append(tpm)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list.
if not tpms:

    raise ValueError('FASTQ_INPUTS is empty; there is nothing to quantify.')

enst_x_sample = pd.concat(
    tpms,
    axis=1,
)

enst_x_sample.index.name = 'ENST'

enst_x_sample.columns.name = 'Sample'

enst_x_sample

In [None]:
# Optionally relabel the sample columns using an ID -> name table. The step
# is skipped entirely when the table file does not exist.
sample_id_sample_name_file_path = '../data/sample_id_sample_name.tsv'

if os.path.isfile(sample_id_sample_name_file_path):

    # The first column is the index (sample ID). DataFrame.squeeze collapses
    # a single remaining column into a Series; it replaces the read_table
    # squeeze=True keyword, which is deprecated and rejected by pandas 2.x.
    # Assumes the file has exactly two columns; TODO confirm.
    id_name = pd.read_table(
        sample_id_sample_name_file_path,
        index_col=0,
    ).squeeze(axis='columns')

    # rename leaves labels that are absent from the mapping untouched,
    # whereas Index.map would turn them into NaN. This also makes the cell
    # safe to re-run after the columns have already been relabelled.
    enst_x_sample = enst_x_sample.rename(columns=id_name.to_dict())

enst_x_sample

In [None]:
# Relabel the transcript rows with gene names and discard the rows that have
# no gene name in the lookup table.

# Lookup table indexed by its second column; only 'Gene name' is used here.
enst = pd.read_table(
    '../data/enst.tsv',
    index_col=1,
)

enst_to_gene_name = enst['Gene name'].to_dict()

# Work on a copy so enst_x_sample keeps its transcript-level index.
gene_x_sample = enst_x_sample.copy()

# Index labels missing from the dict become NaN.
gene_x_sample.index = enst_x_sample.index.map(enst_to_gene_name)

gene_x_sample.index.name = 'Gene'

has_gene_name = gene_x_sample.index.notna()

gene_x_sample = gene_x_sample.loc[has_gene_name]

gene_x_sample

In [None]:
# Collapse rows that share a gene label into one row holding the per-sample
# maximum, then order the rows by gene label.
gene_x_sample__max = gene_x_sample.groupby(level=0).max().sort_index()

gene_x_sample__max

In [None]:
# Process the gene-level table with ccal, save it as a TSV, and plot the
# value distribution of every sample column.

# NOTE(review): the keyword names suggest zeros are turned into NaN, columns
# with NaN are dropped, and values are log2-transformed; confirm against
# ccal.read_and_process_feature_x_sample.
gene_x_sample__processed = ccal.read_and_process_feature_x_sample(
    gene_x_sample__max,
    nanize=0,
    drop_na_axis=1,
    min_n_not_na_unique_value=1,
    log_base='2',
    plot=False,
)

gene_x_sample__processed.to_csv('../output/gene_x_sample__processed.tsv', sep='\t')

n_column = gene_x_sample__processed.shape[1]

# One Series per column, selected by position.
columns = tuple(
    gene_x_sample__processed.iloc[:, column_index]
    for column_index in range(n_column)
)

colors = ccal.plot.plot.style.CATEGORICAL_COLORS

# Fall back to one random color per column when the palette is too short.
if n_column > len(colors):

    colors = tuple(ccal.make_random_color('hex') for _ in range(n_column))

ccal.plot_distributions(
    columns,
    names=gene_x_sample__processed.columns,
    colors=colors,
    plot_rug=False,
    title='Column Value Distribution',
    xaxis_title='Column Value',
)

gene_x_sample__processed