SET PARAMETERS

In [None]:
# ---- general run parameters ----
DEBUG = True            # enables DEBUG-level logging further down
saveReport = False
toPrint = True
reportName = 'notebook'
txt_label = "Classification of integrated c1 and c2 CNVs samples"

# column of the sample info table holding the class, with the label/value
# pairs used for the two classes
sample_class_column = "Relapsed"
class_labels = ["relapsed", "NOTrelapsed"]
class_values = [1, 0]

# ---- classification parameters ----
split_train_size = 20
split_random_state = 0

# ---- plotting parameters ----
plot_kwargs = {}
function_dict = None
with_swarm = False
highRes = False
# high resolution figures are written as PDF, otherwise as PNG
img_ext = '.pdf' if highRes else '.png'
cmap_custom = None
# value range used for the colour scale
vmin = -2
vmax = +2

In [None]:
# ---- input / output locations ----
# Paths are written with '/' separators; a later cell re-joins them with the
# platform separator when they are not found as given.

# input tables
data_fpath = "output/headneck/integrate_cohorts/c1c2/CNV_mapped_filt/integrated_data.csv"
sample_info_fpath = "input/headneck/integrate_cohorts/integrated_sample_info.csv"
gene_info_fpath = "input/headneck/integrate_cohorts/gene_info.csv"

# feature selection results (one entry per feature set)
cnv_feature_fpaths = [
    "output/feature_selection/",
    "output/feature_selection/",
    "output/feature_selection/",
]

# where results are written
output_directory = "output/headneck/classification"

# keyword arguments forwarded to load_clinical() for the sample info table
sample_info_read_csv_kwargs = dict(
    sep="\t",
    header=0,
    index_col=0,
    col_as_index="cnvID",
)

SET ENVIRONMENT

In [None]:
# custom imports
from omics_processing.io import (
    set_directory, load_clinical
)
from omics_processing.remove_duplicates import (
    remove_andSave_duplicates
)
from gene_signatures.core import (
    custom_div_cmap,
    get_chr_ticks,
    choose_samples,
    parse_arg_type,
    boxplot,
    set_heatmap_size,
    set_cbar_ticks,
    edit_names_with_duplicates
)

# basic imports
import os, sys
import numpy as np
import pandas as pd
import json
from scipy.spatial.distance import pdist, squareform
from natsort import natsorted, index_natsorted
import math
import logging
from sklearn import linear_model
from sklearn import svm
from distutils.util import strtobool
from scipy.stats import binom_test
from sklearn.externals import joblib
from sklearn.model_selection import StratifiedKFold, train_test_split

# plotting imports
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# module-level logger and the directory the notebook was started from
logger = logging.getLogger(__name__)
script_path = os.getcwd()

# seaborn look: white background, 'poster' scaling
sns.set_style('white')
sns.set_context('poster')

START ANALYSIS

In [None]:
# Evaluate MainDataDir so the notebook shows its current value as the cell
# output. The bare name raises NameError when nothing has defined it yet.
MainDataDir

In [None]:
# In debug mode, send log records of level DEBUG and above to stdout.
if DEBUG:
    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

In [None]:
# properly set file paths
try:
    os.path.exists(MainDataDir)
except:
    MainDataDir = os.path.join(script_path, '..','..', 'data')
    logger.debug("set MainDataDir:\n"+MainDataDir)

# data input
if not os.path.exists(data_fpath):
    data_fpath = os.path.join(*data_fpath.rsplit('/'))
    data_fpath = os.path.join(MainDataDir, data_fpath)
    logger.debug("set data_fpath:\n"+data_fpath)

# sample info input
if not os.path.exists(sample_info_fpath):
    sample_info_fpath = os.path.join(*sample_info_fpath.rsplit('/'))
    sample_info_fpath = os.path.join(MainDataDir, sample_info_fpath)
    logger.debug("set sample_info_fpath:\n"+sample_info_fpath)

# gene info input
if not os.path.exists(gene_info_fpath):
    gene_info_fpath = os.path.join(*gene_info_fpath.rsplit('/'))
    gene_info_fpath = os.path.join(MainDataDir, gene_info_fpath)
    logger.debug("set gene_info_fpath:\n"+gene_info_fpath)

# data output
if not os.path.exists(output_directory):
    output_directory = os.path.join(*output_directory.rsplit('/'))
    output_directory = set_directory(
        os.path.join(MainDataDir, output_directory, reportName)
    )
    logger.debug("set output_directory:\n"+output_directory)

In [None]:
# Build a diverging colormap when none was supplied and both limits are set.
if cmap_custom is None and vmin is not None and vmax is not None:
    # number of colours: |vmin| + |vmax|, plus one more when the range
    # includes zero
    custom_div_cmap_arg = abs(vmin) + abs(vmax)
    if vmin <= 0 <= vmax:
        custom_div_cmap_arg += 1

    # optional colour overrides taken from the plotting kwargs
    mincol = plot_kwargs.get('mincol', None)
    midcol = plot_kwargs.get('midcol', None)
    maxcol = plot_kwargs.get('maxcol', None)
    all_colors_given = all(
        col is not None for col in (mincol, midcol, maxcol)
    )

    if not all_colors_given:
        # at least one colour missing: use custom_div_cmap's own colours
        cmap_custom = custom_div_cmap(numcolors=custom_div_cmap_arg)
    else:
        cmap_custom = custom_div_cmap(
            numcolors=custom_div_cmap_arg,
            mincol=mincol, midcol=midcol, maxcol=maxcol)

In [None]:
# load data
data = pd.read_csv(data_fpath, sep='\t', header=0, index_col=0)
logger.info('loaded data file with shape: '+str(data.shape))

In [None]:
# load info table of samples
sample_info = load_clinical(
    sample_info_fpath, **sample_info_read_csv_kwargs)
logger.info('loaded sample_info file with shape: '+str(sample_info.shape))

In [None]:
# load info table of genes
gene_info = pd.read_csv(gene_info_fpath, sep='\t', header=0, index_col=0)
logger.info('loaded gene_info file with shape: '+str(gene_info.shape))

In [None]:
# set the ground truth
ground_truth = sample_info.loc[data.index, sample_class_column]