### Installation

`conda create -n ATACseq_SnapATAC python r-essentials jupyter pip pysam pybedtools -y`  
`pip install snaptools`  
`conda install -c bioconda bioconductor-rhdf5 bioconductor-rhdf5lib -y`  
`conda install -c vtraag leidenalg -y`  
`conda install python-igraph -y`

### Import packages

In [1]:
library(SnapATAC);
library(GenomicRanges);

Loading required package: Matrix
Loading required package: rhdf5
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: ‘BiocGenerics’

The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from ‘package:SnapATAC’:

    colMeans, colSums, rowMeans, rowSums

The following objects are masked from ‘package:Matrix’:

    colMeans, colSums, rowMeans, rowSums, which

The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs

The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colMeans,
    colnames, colSums, dirname, do.call, duplicated, eval, evalq,
    Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply,
    le

In [2]:
packageVersion("SnapATAC")

[1] ‘1.0.0’

### Import leiden

In [3]:
library('leiden')

### Load Data

In [4]:
# Restore the workspace saved by the SnapATAC run. Later cells read `x.sp` and
# `metadata`, which are presumably provided by this file -- TODO confirm.
load(file = '../../run_methods/SnapATAC/SnapATAC_cusanovich2018subset.RData')

#### Clustering (SnapATAC)

In [5]:
set.seed(2019) # this function is stochastic even after setting seed, so the number of clusters cannot be guaranteed for a specific resolution

In [6]:
# Bisect the clustering resolution until runCluster() yields `n_cluster` clusters.
#
# Arguments:
#   adata      object handed to runCluster() as `obj`; its `cluster` slot is
#              read after every run.
#   n_cluster  target number of clusters.
#   range_min  lower bound of the resolution search interval.
#   range_max  upper bound of the resolution search interval.
#   max_steps  maximum number of bisection steps.
#
# Returns the clustered object. If no resolution produces exactly `n_cluster`
# clusters within `max_steps` steps, the clustering from the last iteration is
# returned (and a message is printed), so the caller always gets an object back
# rather than the printed message string.
getNClusters = function(adata,n_cluster,range_min=0,range_max=2,max_steps=10)
{
    this_step = 0
    this_min = range_min
    this_max = range_max
    # Defined up front so the fallback message also works when max_steps <= 0.
    this_clusters = NA
    this_resolution = NA
    while (this_step < max_steps)
    {
        print(paste('step ',this_step))
        # Probe the midpoint of the current search interval.
        this_resolution = this_min + ((this_max-this_min)/2)
        adata = runCluster(
            obj=adata,
            tmp.folder=tempdir(),
            louvain.lib="leiden",
            seed.use=10,
            resolution=this_resolution
            );
        this_clusters = length(levels(adata@cluster))

        print(paste('got ', this_clusters , ' at resolution ' , this_resolution))

        # The search treats the cluster count as increasing with resolution:
        # too many clusters -> search lower half, too few -> search upper half.
        if (this_clusters > n_cluster)
        {
            this_max = this_resolution
        } else if (this_clusters < n_cluster)
        {
            this_min = this_resolution
        } else
        {
            return(adata)
        }
        this_step = this_step + 1
    }

    print('Cannot find the number of clusters')
    print(paste('Clustering solution from last iteration is used:' , this_clusters , ' at resolution ' , this_resolution))
    # Hand back the last clustering, as the message above promises.
    adata
}

In [7]:
# Target cluster count: the number of distinct labels in the metadata.
nClusters = nlevels(as.factor(metadata$label))

# Search for a resolution whose clustering has that many clusters.
x.sp = getNClusters(adata=x.sp, n_cluster=nClusters)


#resolution    clusters
#      0.1     8
#     0.18     10
#      0.2     11
#      0.3     13
#      0.5     16
#      0.8     20
#        1     23
#        6     50

[1] "step  0"
[1] "got  22  at resolution  1"
[1] "step  1"
[1] "got  19  at resolution  0.5"
[1] "step  2"
[1] "got  15  at resolution  0.25"
[1] "step  3"
[1] "got  12  at resolution  0.125"
[1] "step  4"
[1] "got  13  at resolution  0.1875"


In [8]:
# Export one cluster label per cell, with rows ordered like the rows of `metadata`.
df_pre = data.frame('SnapATAC'=x.sp@cluster)
rownames(df_pre) = as.character(x.sp@metaData$barcode)
# seq_len() stays valid for an empty table, where 1:nrow() would yield c(1, 0).
df_pre$ord = seq_len(nrow(df_pre))

# Looking up an unknown row name produces an all-NA row whose name is replaced
# by "NA", so report such cells instead of letting them pass silently.
cell_ids = as.character(rownames(metadata))
missing_ids = setdiff(cell_ids, rownames(df_pre))
if (length(missing_ids) > 0)
{
    warning(length(missing_ids), ' cell(s) in metadata have no cluster assignment and will be written as NA', call. = FALSE)
}

# Reorder to the metadata row order, then keep only the cluster column.
df_pre = df_pre[cell_ids,]
df_out = data.frame('SnapATAC'=df_pre[,'SnapATAC'])
rownames(df_out) = rownames(df_pre)
# col.names = NA writes an empty header field above the row-name column.
write.table(df_out,file="clusteringSolution.tsv", quote=FALSE, sep='\t', col.names = NA)