In [1]:
import numpy as np
import pandas as pd
import sys

sys.path.append('../../')
from utils import expr_data_utils, file_utils

In [2]:
# Thread count interpolated after `pigz -p` in the compression commands
# that this notebook prints.
num_cores: int = 8

In [3]:
# Per-gene normalisation schemes to run. Each value is forwarded as
# `norm_type` to expr_data_utils.normalize_expression_per_gene and is also
# embedded in the generated file and directory names.
norm_types = [
    'min_max',
    'z_score',
]

In [4]:
# Expression tables to process: dataset label -> CSV path (relative to this
# notebook). The label selects the normalisation offset and is used as a
# prefix for every generated file and directory.
datasets = {
    'microarray': '../../active_files/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv',
    'rna_seq': '../../active_files/rna_seq.csv',
}

In [5]:
# Generate one R Markdown job per (normalisation, dataset, k) combination and
# print the shell commands needed to render each job and compress its output.
#
# For every norm_type/dataset pair the expression table is normalised per gene,
# duplicates are averaged, and the result is written to
# ./norm_avg_<dataset>_<norm_type>.csv. For every k in NN_VALUES an .Rmd file
# is written that loads that CSV, runs parmigene's knnmi.all and clr, and
# writes the CLR network to <dataset>_clr_networks/<norm_type>/.

# Value forwarded as `add_scalar` to normalize_expression_per_gene, per dataset.
# NOTE(review): presumably an offset added to the expression values before
# normalisation (1 for RNA-seq, 0 for microarray) -- confirm in expr_data_utils.
ADD_SCALAR_BY_DATASET = {
    'microarray': 0,
    'rna_seq': 1,
}

# Values of k (nearest neighbours) passed to knnmi.all: 2 through 12 inclusive.
NN_VALUES = range(2, 13)

# R Markdown template, filled in with str.format (so the literal braces of the
# knitr chunk headers are doubled). Every line starts at column 0 on purpose:
# the YAML front matter is only recognised when its closing `---` is
# unindented, and indented prose would be rendered as a Markdown code block.
# NOTE(review): the setwd() path is specific to one machine's checkout
# location; it is left unchanged here.
RMD_TEMPLATE = '''---
title: "clr_for_distances"
author: "Michael Bertagna"
date: "2024-02-02"
output: html_document
---

```{{r setup, include=FALSE}}
knitr::opts_chunk$set(echo = TRUE)
```

```{{r}}
setwd("~/git/TGNE-2022/TGNE/clustering_optimization")
library(parmigene)
Sys.getenv("OMP_NUM_THREADS")
```

IMPORT THE FILTERED DATA AS A DATAFRAME AND TAKE THE TRANSPOSE

```{{r}}
data <- read.csv("{norm_avg_dataset_path}", header = TRUE, row.names = 1)

nn <- {nn}

data_knn <- knnmi.all(data, k=nn)
```

BUILD THE NETWORK

```{{r}}
clr_network <- clr(data_knn)
```

```{{r}}
write.csv(as.data.frame(clr_network), file = paste('./{dataset}_clr_networks/{norm_type}/clr_network_for_distances_', nn, '.csv', sep = ''), row.names = TRUE)
```
'''

print('conda activate tgne.env')

for norm_type in norm_types:
    for dataset, dataset_path in datasets.items():

        # Fail loudly on a dataset with no configured offset instead of
        # reusing the frame left over from the previous iteration.
        if dataset not in ADD_SCALAR_BY_DATASET:
            raise ValueError(
                f'No add_scalar configured for dataset {dataset!r}; '
                f'expected one of {sorted(ADD_SCALAR_BY_DATASET)}'
            )

        # Re-read on every pass: the helpers below are opaque from here, so a
        # fresh frame avoids any dependence on whether they mutate their input.
        df = pd.read_csv(dataset_path)

        file_utils.create_directories(f'{dataset}_clr_networks/{norm_type}/')

        full_filtered_norm_df = expr_data_utils.normalize_expression_per_gene(
            df, norm_type=norm_type, add_scalar=ADD_SCALAR_BY_DATASET[dataset])
        full_filtered_norm_df = expr_data_utils.mean_df_of_duplicates(
            full_filtered_norm_df, mean_type='arithmetic')

        norm_avg_dataset_path = f'./norm_avg_{dataset}_{norm_type}.csv'

        # index=False: the R side uses the first CSV column as row names.
        full_filtered_norm_df.to_csv(norm_avg_dataset_path, index=False)

        for nn in NN_VALUES:
            rmd_file_str = RMD_TEMPLATE.format(
                norm_avg_dataset_path=norm_avg_dataset_path,
                nn=nn,
                dataset=dataset,
                norm_type=norm_type,
            )

            # k is zero-padded in the .Rmd name only; the CSV written by R and
            # the pigz command below use the unpadded value.
            rmd_name = f'{dataset}_{norm_type}_clr_for_distances_{str(nn).zfill(2)}nn.Rmd'
            file_name = f'./{rmd_name}'
            with open(file_name, 'w') as f:
                f.write(rmd_file_str)

            # Commands are printed relative to the repository root.
            print(f"Rscript -e \"rmarkdown::render('TGNE/clustering_optimization/{rmd_name}')\"")
            print(f'pigz -p {num_cores} TGNE/clustering_optimization/{dataset}_clr_networks/{norm_type}/clr_network_for_distances_{nn}.csv')
            print()

print('conda deactivate')

conda activate tgne.env
Rscript -e "rmarkdown::render('TGNE/clustering_optimization/microarray_min_max_clr_for_distances_02nn.Rmd')"
pigz -p 8 TGNE/clustering_optimization/microarray_clr_networks/min_max/clr_network_for_distances_2.csv

Rscript -e "rmarkdown::render('TGNE/clustering_optimization/microarray_min_max_clr_for_distances_03nn.Rmd')"
pigz -p 8 TGNE/clustering_optimization/microarray_clr_networks/min_max/clr_network_for_distances_3.csv

Rscript -e "rmarkdown::render('TGNE/clustering_optimization/microarray_min_max_clr_for_distances_04nn.Rmd')"
pigz -p 8 TGNE/clustering_optimization/microarray_clr_networks/min_max/clr_network_for_distances_4.csv

Rscript -e "rmarkdown::render('TGNE/clustering_optimization/microarray_min_max_clr_for_distances_05nn.Rmd')"
pigz -p 8 TGNE/clustering_optimization/microarray_clr_networks/min_max/clr_network_for_distances_5.csv

Rscript -e "rmarkdown::render('TGNE/clustering_optimization/microarray_min_max_clr_for_distances_06nn.Rmd')"
pigz -p 8 TGNE/