# Run validation experiment

In [1]:
%load_ext rpy2.ipython

### Simulate the data

In [None]:
%%bash
# Run the simulation script once per JSON configuration found in ./Exp_conf.
# The script is invoked from inside $WD; presumably it writes its output
# relative to the current directory (the R cells read ./Simulation_result)
# -- confirm against simulation.py.
export WD=./Simulation_result
export CONF="$(pwd)/Exp_conf"
export SCRIPT="$(pwd)/../Simulation_Script/simulation.py"

# -p keeps the cell re-runnable: no error when the directories already exist.
mkdir -p ./Experiment_out "$WD"
# Stop instead of running the simulations in the wrong directory.
cd "$WD" || exit 1

for CONFIG in "$CONF"/*.json
do
    # With no matching file the glob stays literal; skip it rather than
    # handing a non-existent "*.json" path to the script.
    [ -e "$CONFIG" ] || continue
    python3 "$SCRIPT" "$CONFIG"
done

### Traverse the simulation directory and enumerate all the data

In [None]:
%%R

# Walk ./Simulation_result, which is traversed four directory levels deep
# (<group>/<experiment>/<pattern>/<peak>/), and build `data_df` with one row
# per peak directory: the four directory names plus the path of the first
# file found inside the peak directory.

groups <- list.dirs(path = "./Simulation_result", full.names = TRUE, recursive = FALSE)

x <- c("Group", "Experiment", "Pattern", "Peak", "Path")

# Collect the rows in a list and bind them once at the end; calling rbind()
# on a growing matrix inside the loops copies the whole matrix every time.
rows <- list()
for (group in groups) {
    group_name <- basename(group)
    experiments <- list.dirs(path = group, full.names = TRUE, recursive = FALSE)
    for (experiment in experiments) {
        exp_name <- basename(experiment)
        patterns <- list.dirs(path = experiment, full.names = TRUE, recursive = FALSE)
        for (pattern in patterns) {
            pattern_name <- basename(pattern)
            peaks <- list.dirs(path = pattern, full.names = TRUE, recursive = FALSE)
            for (peak in peaks) {
                peak_name <- basename(peak)
                # Only the first file of the peak directory is recorded.
                peak_file <- list.files(path = peak, full.names = TRUE, recursive = FALSE)[1]
                if (is.na(peak_file)) {
                    # The row is kept (Path = NA) so the table shape is
                    # unchanged, but the problem is reported where it occurs.
                    warning("No file found in peak directory: ", peak, call. = FALSE)
                }
                rows[[length(rows) + 1L]] <- c(group_name, exp_name, pattern_name, peak_name, peak_file)
            }
        }
    }
}

if (length(rows) > 0) {
    datalist <- do.call(rbind, rows)
} else {
    # Without this, data.frame(NULL) has zero columns and assigning the five
    # column names below fails.
    warning("No simulation output found under ./Simulation_result", call. = FALSE)
    datalist <- matrix(character(0), nrow = 0, ncol = length(x))
}

data_df <- data.frame(datalist)
colnames(data_df) <- x

### Load the necessary libraries

In [None]:
%%R
library(MMDiff2)
library(reshape2)
library(ggplot2)
library(GenomicRanges)
library(DiffBind)
library(devtools)

### Run MMD analysis on all peaks 

In [None]:
%%R

# Run the MMD2 distance computation for every group / replica listed in
# `data_df` (columns Group, Experiment, Pattern, Peak, Path) and write one
# MMD_dists.csv per pattern to
#   ./Experiment_out/<group>/<replica>/<pattern>/MMD_dists.csv
#
# Each read file is loaded with read.csv() and must provide the columns
# `modification` (matched against 0 .. n_mods - 1) and `position`.
# Peak directories are expected to be named peak_0 .. peak_<n-1>.

experiment_groups <- as.character(unique(data_df[["Group"]]))

# Number of modification states; each one is treated as a sample "mod_<i>".
n_mods <- 10
# Every peak is registered as the region [region_start, region_end] on its
# own sequence name "<pattern>_<peak index>".
region_start <- 0
region_end <- 4100
# Stored as AnaData$PeakBoundary and also added to every read position.
# NOTE(review): the extra "+ 1" below looks like a 0- to 1-based shift -- confirm.
peak_boundary <- 200

modnames <- paste0("mod_", seq_len(n_mods) - 1L)

# Row name used for a region in the count matrix, the read lists and the
# distance table, e.g. "chr<pattern>_0:0-4100".
region_id <- function(chr) {
    paste0("chr", chr, ":", region_start, "-", region_end)
}

# The sample sheet is identical for every group; only its location differs.
samplesheet <- data.frame(
    SampleID = modnames,
    Tissue = modnames,
    Factor = 1,
    Condition = "Ctr",
    Replicate = 1,
    PeakCaller = "macs"
)

for (exp_group in experiment_groups) {

    group_df <- subset(data_df, Group == exp_group)
    group_dir <- paste0("./Experiment_out", "/", exp_group)
    # recursive/showWarnings: works on a fresh checkout and on re-runs.
    dir.create(group_dir, showWarnings = FALSE, recursive = TRUE)

    sheet_path <- paste0(group_dir, "/", "SampleSheet.csv")
    write.csv(samplesheet, sheet_path, row.names = FALSE)
    replicas <- as.character(unique(group_df[["Experiment"]]))

    for (replica in replicas) {
        replica_df <- subset(group_df, Experiment == replica)
        peaks <- as.character(unique(replica_df[["Peak"]]))
        patterns <- as.character(unique(replica_df[["Pattern"]]))
        # Zero-based peak indices; seq_along() stays empty for zero peaks,
        # unlike 0:(length(peaks) - 1), which would yield c(0, -1).
        peak_idx <- seq_along(peaks) - 1L

        ExperimentData <- list(genome = "none",
                               dataDir = ".",
                               sampleSheet = sheet_path)
        MetaData <- list("ExpData" = ExperimentData)
        MMD <- DBAmmd(MetaData)

        # One sequence name per (pattern, peak), pattern-major order.
        chroms <- unlist(
            lapply(patterns, function(p) paste0(p, "_", peak_idx)),
            use.names = FALSE
        )

        peak_df <- data.frame(chr = chroms, start = region_start, end = region_end)
        regions <- makeGRangesFromDataFrame(peak_df)
        MMD <- setRegions(MMD, regions)

        Meta <- metaData(MMD)
        Meta$AnaData$pairedEnd <- rep(FALSE, n_mods)
        Meta$AnaData$PeakBoundary <- peak_boundary
        MMD@MetaData <- Meta

        dir.create(paste0(group_dir, "/", replica), showWarnings = FALSE, recursive = TRUE)

        # Load every peak's read table, tag it with its sequence name and
        # bind all tables once (instead of rbind() inside the loops).
        read_parts <- vector("list", length(patterns) * length(peaks))
        k <- 0L
        for (pattern in patterns) {
            pattern_df <- subset(replica_df, Pattern == pattern)
            dir.create(paste0(group_dir, "/", replica, "/", pattern),
                       showWarnings = FALSE, recursive = TRUE)

            for (i in peak_idx) {
                peak_n <- paste0("peak_", i)
                # as.character() keeps this correct when Path is a factor.
                peak_path <- as.character(pattern_df$Path[pattern_df$Peak == peak_n])
                if (length(peak_path) != 1L || is.na(peak_path)) {
                    stop("Expected exactly one read file for ",
                         exp_group, "/", replica, "/", pattern, "/", peak_n,
                         call. = FALSE)
                }
                rdf <- read.csv(file = peak_path, header = TRUE, sep = ",")
                # rep() so that a read file without rows does not error.
                rdf$peak <- rep(paste0(pattern, "_", i), nrow(rdf))
                k <- k + 1L
                read_parts[[k]] <- rdf
            }
        }
        read_df <- do.call(rbind, read_parts)

        rnames <- region_id(chroms)
        cnames <- modnames
        rawCounts <- matrix(NA_integer_,
                            nrow = length(chroms), ncol = n_mods,
                            dimnames = list(rnames, cnames))

        # Split the reads by region once; the original re-filtered the full
        # table for every (modification, region) pair.
        reads_by_region <- lapply(chroms, function(chr_n) {
            read_df[read_df$peak == chr_n, , drop = FALSE]
        })
        names(reads_by_region) <- rnames

        # readList[[sample]][[region]] holds the shifted read positions,
        # rawCounts[region, sample] the number of reads.
        readList <- list()
        for (i in seq_len(n_mods) - 1L) {
            mod_col <- paste0("mod_", i)
            tempList <- list()
            for (region in rnames) {
                mod_reads <- subset(reads_by_region[[region]], modification == i)
                tempList[[region]] <- as.numeric(mod_reads$position) + peak_boundary + 1
                rawCounts[region, mod_col] <- nrow(mod_reads)
            }
            readList[[mod_col]] <- tempList
        }

        MMD@Reads <- list("Center" = readList)
        MMD@RawTotalCounts <- rawCounts
        print("starting mmd")
        MMD <- compDists(MMD, dist.method = "MMD2", background_intensity = 0.25, bootstrap_n = 0)
        # Alternative kept from the original notebook:
        # MMD <- compDists(MMD, dist.method = "MMD")
        # dists <- MMD@DISTs$MMD
        dists2 <- MMD@DISTs$MMD2
        print("writing mmd")
        for (pattern in patterns) {
            pattern_dir <- paste0(group_dir, "/", replica, "/", pattern)
            p_rows <- region_id(paste0(pattern, "_", peak_idx))
            # drop = FALSE: with a single peak the row must stay a one-row
            # matrix, otherwise the CSV is written transposed.
            pdists2 <- dists2[p_rows, , drop = FALSE]
            write.csv(pdists2, file = paste0(pattern_dir, "/MMD_dists.csv"))
        }
    }
}