In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits
# made to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from environment import *
from setting import *

In [None]:
def get_sample_id_from_fastq_input(fastq_input):
    """
    Derive a sample ID from a FASTQ input.

    Arguments:
        fastq_input (str | sequence of str): either a single "/"-separated
            path, or a sequence of such paths (for example the two files of a
            read pair).

    Returns:
        str: for a single path, its last path component; for a sequence, the
            longest common prefix of the last path components of its first two
            paths (or the only one, if there is just one). Leading and trailing
            "_" and "." characters are removed in both cases.

    Raises:
        IndexError: if fastq_input is an empty sequence.
    """

    def _last_component(path):
        # Drop trailing separators first; otherwise "a/b/" would produce "".
        return path.rstrip("/").split(sep="/")[-1]

    if isinstance(fastq_input, str):

        sample_id = _last_component(fastq_input)

    else:

        # Only the first two paths are considered, as before; a one-element
        # sequence is now accepted instead of failing on the missing second path.
        paths = tuple(fastq_input[:2])

        if not paths:

            raise IndexError("fastq_input must contain at least one path.")

        sample_id = os.path.commonprefix(
            [_last_component(path) for path in paths]
        )

    # The common prefix of names such as "s_1.fq" and "s_2.fq" ends with the
    # separator character, so trim separators from both ends.
    return sample_id.strip("_.")

In [None]:
# Run kallisto quantification once per FASTQ input. The third argument is a
# per-sample path inside the kallisto directory, built from the sample ID.
for fastq_input in FASTQ_INPUTS:

    sample_id = get_sample_id_from_fastq_input(fastq_input)

    sample_directory_path = "{}/{}".format(
        PATH_DICT["kallisto_directory_path"], sample_id
    )

    ccal.count_transcripts_using_kallisto_quant(
        fastq_input,
        REFERENCE_CDNA_FASTA_FILE_PATH,
        sample_directory_path,
        n_job=N_JOB,
    )

In [None]:
# Collect the "tpm" column of every sample's abundance.tsv, labelled with the
# sample ID, then put the columns side by side in one table.
tpms = []

for fastq_input in FASTQ_INPUTS:

    sample_id = get_sample_id_from_fastq_input(fastq_input)

    abundance_file_path = "{}/{}/abundance.tsv".format(
        PATH_DICT["kallisto_directory_path"], sample_id
    )

    tpm = pd.read_table(abundance_file_path, index_col=0)["tpm"].rename(sample_id)

    tpms.append(tpm)

enst_x_sample = pd.concat(tpms, axis=1)

# Both the row labels and the sample columns must be unique.
assert enst_x_sample.index.is_unique

assert enst_x_sample.columns.is_unique

enst_x_sample.index.name = "ENST"

enst_x_sample.columns.name = "Sample"

enst_x_sample

In [None]:
# If a sample-ID-to-sample-name table exists, relabel the columns with it
# (first column of the file = sample ID, remaining column = sample name).
# NOTE: the relabelling happens in place, so this cell is meant to be run once
# after the cell that builds enst_x_sample.
if os.path.isfile(PATH_DICT["sample_id_sample_name_file_path"]):

    # `read_table(..., squeeze=True)` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the column axis after reading gives the same Series.
    sample_id_to_sample_name = pd.read_table(
        PATH_DICT["sample_id_sample_name_file_path"], index_col=0
    ).squeeze("columns")

    enst_x_sample.columns = enst_x_sample.columns.map(sample_id_to_sample_name)

enst_x_sample.to_csv("../output/enst_x_sample.tsv", sep="\t")

enst_x_sample

In [None]:
# Translate transcript IDs into upper-cased gene names using the lookup table.
enst_gene_name = pd.read_table(PATH_DICT["enst_gene_name_file_path"])

enst_to_gene_name = dict(
    zip(
        enst_gene_name["Transcript stable ID version"],
        enst_gene_name["Gene name"].str.upper(),
    )
)

gene_x_sample = enst_x_sample.copy()

gene_x_sample.index = enst_x_sample.index.map(enst_to_gene_name)

# Transcripts without a gene name end up with a missing label; drop those rows
# and order the remaining rows by gene name.
gene_x_sample = gene_x_sample.loc[gene_x_sample.index.notna()].sort_index()

gene_x_sample.index.name = "Gene"

gene_x_sample

In [None]:
# Rows that share the same index label are collapsed into one row holding,
# for each column, the largest value among them.
gene_x_sample__max = gene_x_sample.groupby(level=0).agg("max")

gene_x_sample__max

In [None]:
# Process the collapsed matrix with ccal and write the result to disk.
gene_x_sample__processed = ccal.read_and_process_feature_x_sample(
    gene_x_sample__max,
    nanize=0,
    drop_na_axis=1,
    # max_na=0,
    min_n_not_na_unique_value=1,
    log_base="2",
    plot=False,
)

gene_x_sample__processed.to_csv(PATH_DICT["gene_x_sample_file_path"], sep="\t")

# One Series per column, selected by position.
n_column = gene_x_sample__processed.shape[1]

column_values = tuple(
    gene_x_sample__processed.iloc[:, position] for position in range(n_column)
)

# Use the categorical palette when it has enough colors; otherwise draw one
# random hex color per column.
colors = ccal.plot.plot.style.COLOR_CATEGORICAL

if len(colors) < n_column:

    colors = tuple(ccal.make_random_color("hex") for _ in range(n_column))

ccal.plot_distributions(
    column_values,
    names=gene_x_sample__processed.columns,
    colors=colors,
    plot_rug=False,
    title="Column Value Distribution",
    xaxis_title="Column Value",
)

gene_x_sample__processed