In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules are re-imported before each cell runs
%load_ext autoreload
%autoreload 2

In [None]:
from environment import *

# Load the notebook settings from setting.yaml.
# `yaml.safe_load` is used because `yaml.load` without an explicit `Loader` is
# deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6; it also
# refuses to construct arbitrary Python objects from the file.
with open("setting.yaml") as yaml_file:

    SETTING = yaml.safe_load(yaml_file)

# PATH is indexed by path names (e.g. "kallisto/") in the following cells.
# NOTE(review): `make_path_dict` arrives via the star import of `environment`;
# its exact behavior is not visible here.
PATH = make_path_dict(SETTING)

In [None]:
# Run kallisto quantification once per sample listed under SETTING["fastqs"];
# each sample is given its own location under PATH["kallisto/"]
for sample_id, fastqs in SETTING["fastqs"].items():

    sample_kallisto_directory_path = "{}/{}".format(PATH["kallisto/"], sample_id)

    ccal.count_transcripts_using_kallisto_quant(
        fastqs,
        SETTING["reference_cdna_fasta_file_path"],
        sample_kallisto_directory_path,
        n_job=SETTING["n_job"],
    )

In [None]:
# Gather the "tpm" column of every sample's abundance.tsv, named after the sample
tpms = []

for sample_id in SETTING["fastqs"]:

    abundance_file_path = "{}/{}/abundance.tsv".format(PATH["kallisto/"], sample_id)

    abundance = pd.read_csv(abundance_file_path, sep="\t", index_col=0)

    tpms.append(abundance["tpm"].rename(sample_id))

# Place the per-sample columns side by side, then hand the table to the ccal
# helper together with the index name "ENST" and the output path
enst_x_sample__concatenated = pd.concat(tpms, axis=1)

enst_x_sample = ccal.clean_and_write_df_to_tsv(
    enst_x_sample__concatenated, "ENST", PATH["enst_x_sample.tsv"]
)

enst_x_sample

In [None]:
# Build a lookup from versioned transcript ID to upper-cased gene name
enst_gene_name = pd.read_csv(SETTING["enst_gene_name_file_path"], sep="\t")

enst_to_gene_name = dict(
    zip(
        enst_gene_name["Transcript stable ID version"],
        enst_gene_name["Gene name"].str.upper(),
    )
)

# Relabel the rows of a copy of the transcript table with gene names
gene_x_sample = enst_x_sample.copy()

gene_x_sample.index = enst_x_sample.index.map(enst_to_gene_name)

# Keep only the rows whose new label is not missing
has_gene_name = ~gene_x_sample.index.isna()

gene_x_sample = gene_x_sample.loc[has_gene_name]

# Collapse rows that share a gene name by taking their mean.
# NOTE(review): this averages transcripts per gene rather than summing them — confirm intended.
gene_x_sample__mean = gene_x_sample.groupby(level=0).mean()

# Every processing option is read from SETTING under the same name as its keyword
process_feature_x_sample_keyword_arguments = {
    setting_key: SETTING[setting_key]
    for setting_key in (
        "features_to_drop",
        "samples_to_drop",
        "nanize",
        "drop_axis",
        "max_na",
        "min_n_not_na_unique_value",
        "shift_as_necessary_to_achieve_min_before_logging",
        "log_base",
        "normalization_axis",
        "normalization_method",
        "clip_min",
        "clip_max",
        "plot_heat_map_max_size",
        "plot_distributions_max_size",
        "plot_rug_max_size",
    )
}

gene_x_sample__processed = ccal.clean_and_write_df_to_tsv(
    ccal.process_feature_x_sample(
        gene_x_sample__mean, **process_feature_x_sample_keyword_arguments
    ),
    "Gene",
    PATH["gene_x_sample.processed.tsv"],
)

gene_x_sample__processed

In [None]:
# One Series per column of the processed table, taken by position
n_column = gene_x_sample__processed.shape[1]

column_values = tuple(
    gene_x_sample__processed.iloc[:, column_index]
    for column_index in range(n_column)
)

# Use the categorical palette when it has enough colors; otherwise make one
# random hex color per column
if n_column <= len(ccal.COLOR_CATEGORICAL):

    colors = ccal.COLOR_CATEGORICAL

else:

    colors = tuple(ccal.make_random_color("hex") for _ in range(n_column))

ccal.plot_distributions(
    column_values,
    names=gene_x_sample__processed.columns,
    colors=colors,
    plot_rug=False,
    title="Column Value Distribution",
    xaxis_title="Column Value",
)