In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy.stats import norm
from sklearn.model_selection import train_test_split

import tensorflow as tf 
from tensorflow.python.keras.layers import Input, Dense, Lambda, Flatten, Reshape, Concatenate
from tensorflow.python.keras.layers import Conv2D, Conv2DTranspose, BatchNormalization, Activation, LeakyReLU
from tensorflow.python.keras.models import Model
import tensorflow.keras.backend as K
from tensorflow.python.keras import metrics
from tensorflow.python.keras.callbacks import ModelCheckpoint, LearningRateScheduler
from tensorflow.python.keras.datasets import mnist
from tensorflow.python.keras.models import Sequential, Model
from tensorflow.python.keras.optimizers import Adam

import pdb
import os

from train_model_template.helpers_vizualisation import eval_knn_proportions
from train_model_template.helpers_vizualisation import plot_tsne
from train_model_template.helpers_vizualisation import plot_umap


In [None]:
# note: after discussion last time the data has one concatenated index with all the information
def load_data_basic(path, patient='sample1', batch_names = ['batch1', 'batch2'], seed=42,
                   n_cells_to_select = 500):
    """
    Function to load data and split two batches of one patient into train and test sets
    inputs:
        path: path to the parquet data file
        patient: name of the patient to consider; index labels that contain
            patient + '0' (e.g. 'sample10' when patient is 'sample1') are ignored
        batch_names: a list of the two batch names used to split the data
        seed: seed used for the random cell subsampling and for the train/test split
        n_cells_to_select: number of cells to select per batch for quicker runs
            (capped at the number of available cells), if 0 then all cells are selected
    outputs:
        x1_train, x1_test: train and test sets from the first batch
        x2_train, x2_test: train and test sets from the second batch
    raises:
        IndexError: if no index label matches the patient together with one of the batches
    """
    df = pd.read_parquet(path, engine='pyarrow')
    # drop every column whose name contains "metadata"
    selected_cols = [col for col in df.columns if "metadata" not in col]
    df = df.loc[:, selected_cols]

    x1 = _select_patient_batch(df, patient, batch_names[0])
    x2 = _select_patient_batch(df, patient, batch_names[1])

    if n_cells_to_select > 0:
        # one seeded generator for both batches keeps the subsampling reproducible
        rng = np.random.RandomState(seed)
        x1 = _subsample_cells(x1, n_cells_to_select, rng)
        x2 = _subsample_cells(x2, n_cells_to_select, rng)

    # honour the caller's seed (the default of 42 matches the previously hard-coded value)
    x1_train, x1_test = train_test_split(x1, test_size=0.2, random_state=seed)
    x2_train, x2_test = train_test_split(x2, test_size=0.2, random_state=seed)
    return (x1_train, x1_test, x2_train, x2_test)


def _select_patient_batch(df, patient, batch_name):
    """Return a copy of all rows carrying the first index label that matches patient and batch."""
    # iterate over the index directly: Index.get_values() no longer exists in pandas >= 1.0
    matches = [label for label in df.index
               if patient in label and batch_name in label and patient + '0' not in label]
    if not matches:
        raise IndexError("no index label contains both %r and %r" % (patient, batch_name))
    # NOTE(review): only the FIRST matching label is used, as before; this assumes all
    # cells of one patient/batch share a single index label -- confirm against the data.
    # The list selector guarantees a DataFrame even when that label occurs only once.
    return df.loc[[matches[0]], :].copy()


def _subsample_cells(cells, n_cells, rng):
    """Return n_cells distinct, randomly chosen rows of cells (all rows if fewer are available)."""
    n_available = cells.shape[0]
    # integer positions are required by .iloc; sample without replacement so no cell repeats
    picked = rng.choice(n_available, size=min(n_cells, n_available), replace=False)
    return cells.iloc[picked, :]


In [None]:
############# DATA LOADING ############# 

In [None]:
# the data file is expected in the current working directory
# (alternative data set: '/toy_data_gamma_large.parquet')
path = os.getcwd() + '/toy_data_gamma_small.parquet'
# n_cells_to_select=0 disables the subsampling, so every cell is kept
x1_train, x1_test, x2_train, x2_test = load_data_basic(
    path,
    patient='sample1',
    batch_names=['batch1', 'batch2'],
    seed=42,
    n_cells_to_select=0,
)

In [None]:
# Show the dimensions of the first batch's training split as the cell output.
x1_train.shape

In [None]:
#############    MODEL     #############

In [None]:
############# VISUALIZATIONS ############# 

In [None]:
# note: x1_test should be replaced with the model output
# (until then the raw test split of the first batch is passed as a placeholder)
plot_tsne(x1_test, do_pca = True, n_plots = 2, iter_ = 500, pca_components = 20)

In [None]:
# Passes the raw test split of the first batch to plot_umap.
# NOTE(review): presumably this input should also become the model output later -- confirm.
plot_umap(x1_test)

In [None]:
#eval_knn_proportions(x1_test)