In [1]:
# Notebook setup: module auto-reloading and Altair configuration.
# Load the IPython autoreload extension.
%load_ext autoreload

# Mode 2 reloads imported modules before running a cell, so edits to local
# modules are picked up without restarting the kernel.
%autoreload 2

import altair
# Turn off Altair's maximum-rows check so large dataframes can be plotted.
altair.data_transformers.disable_max_rows()

def custom_theme():
    """Build the Altair theme dictionary used throughout this notebook.

    Returns:
        dict: a theme with a single ``config`` entry that sets the default
        view size (height 300, width 400) and the ``dark2`` scheme for the
        ``category`` range.
    """
    view_settings = dict(height=300, width=400)
    range_settings = dict(category=dict(scheme='dark2'))
    return dict(config=dict(view=view_settings, range=range_settings))

# Register the theme factory defined above under the name 'custom_theme'.
altair.themes.register('custom_theme', custom_theme)

# Enable the newly registered theme (registration alone does not activate it).
altair.themes.enable('custom_theme')

ThemeRegistry.enable('custom_theme')

# Code name: Cocktail

A k-mer toolbox for working with small k-mers on long reads.

- count kmer: pcon
- filter reads: kmerf
- correct reads: br
- polish assemblies: br

## Parameter

Some global variables used throughout this notebook; don't hesitate to adjust them.

In [None]:
# Value passed to `snakemake --cores` by every pipeline cell of this notebook.
SNAKEMAKE_CORE = 8

## Dataset

### References

| dataset         | species            | info                          | genome size | 
|:----------------|:-------------------|:------------------------------|------------:|
| bacteria        | E. coli str. K-12 substr. MG1655 | NC_000913.3                   |      4.6 Mb |
| yeast           | S. cerevisiae            | NC_001133                     |     12.1 Mb |
| metagenome      | Mock community           | [PacBio assembly](https://lomanlab.github.io/mockcommunity/)       |       42.1 Mb |



### Reads

| dataset         | info           | # bases (Mb) | coverage    | error rate |
|:----------------|:---------------|-------------:|------------:|-----------:|
| bacteria        | SRR10950300    |       588    |        126x |       14 % |
| bacteria5       | TODO           |       250    |         54x |        6 % |
| bacteria7       | TODO           |       591    |        127x |        7 % |
| yeast           | Genoscope      |       34,474 |        283x |        8 % |
| metagenome      | [R10 Native “3 Peaks” Data Release (2019-05-24)](https://lomanlab.github.io/mockcommunity/r10.html) |   30,909 | 733x | 10 %|


## Prepare dataset

In [None]:
# Run the `data_all` snakemake target with SNAKEMAKE_CORE cores.
# -n requests a dry run: remove it to actually execute the jobs.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p data_all -n

In [None]:
import cocktail
import cocktail.kmer_spectrum

# Load the k-mer spectrum of the "bacteria5" dataset.
# NOTE(review): the second argument (15) is presumably the k-mer size — confirm
# against cocktail.kmer_spectrum.get_kmer_spectrum.
df = cocktail.kmer_spectrum.get_kmer_spectrum("bacteria5", 15)

In [None]:
# Prepare the k-mer spectrum for plotting: drop rows with a zero count and the
# aggregated "all" type, then relabel "true"/"false" for the figure.
#
# The columns are selected with brackets on purpose: `df.count` resolves to
# the DataFrame.count *method*, not to the "count" column, so the former
# `df2[df2.count != 0]` evaluated `df2[True]` instead of filtering rows.
df2 = df[(df["count"] != 0) & (df["type"] != "all")]
df2 = df2.replace("true", "reference").replace("false", "erroneous")

cocktail.kmer_spectrum.figure(df2)

## Count

Compare pcon against kmc (3.1.1) and jellyfish (2.2.10), with k in range(13, 19, 2), on all datasets.

In [None]:
# Run the `count_all` snakemake target with SNAKEMAKE_CORE cores.
# -n requests a dry run: remove it to actually execute the jobs.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p count_all -n

### Benchmark

In [11]:
import functools
import operator

import cocktail
import cocktail.count
import cocktail.utils

df = cocktail.count.dataframe()

# One "time" / "memory" scatter plot per dataset. Datasets are sorted because
# the iteration order of a set of strings changes between kernel sessions,
# which would otherwise shuffle the panels from one run to the next.
all_fig = list()
for dataset in sorted(set(df.dataset)):
    all_fig.append(cocktail.utils.group_scatter(df[df.dataset == dataset], "time", "memory", "counter", "kmer_size", dataset))

# Two panels per row, rows stacked vertically. For five datasets this builds
# exactly ((f0 | f1) & (f2 | f3)) & f4, but it no longer raises IndexError
# when the number of datasets differs.
rows = [functools.reduce(operator.or_, all_fig[start:start + 2]) for start in range(0, len(all_fig), 2)]
fig = functools.reduce(operator.and_, rows)

fig

## Filter

Evaluate the effect of kmerf on the datasets and on the resulting assemblies, with different values of k and different ratios.

In [None]:
# Run the `filter_all` snakemake target with SNAKEMAKE_CORE cores.
# -n requests a dry run: remove it to actually execute the jobs.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p filter_all -n

### Benchmark

In [13]:
import functools
import operator

import cocktail
import cocktail.filter
import cocktail.utils

df = cocktail.filter.dataframe_bench()

# One "time" / "memory" scatter plot per dataset. Datasets are sorted because
# the iteration order of a set of strings changes between kernel sessions,
# which would otherwise shuffle the panels from one run to the next.
all_fig = list()
for dataset in sorted(set(df.dataset)):
    all_fig.append(cocktail.utils.group_scatter(df[df.dataset == dataset], "time", "memory", "filter", None, dataset))

# Two panels per row, rows stacked vertically. For five datasets this builds
# exactly ((f0 | f1) & (f2 | f3)) & f4, but it no longer raises IndexError
# when the number of datasets differs.
rows = [functools.reduce(operator.or_, all_fig[start:start + 2]) for start in range(0, len(all_fig), 2)]
fig = functools.reduce(operator.and_, rows)

fig

### Read quality

In [None]:
import pandas
import cocktail
import cocktail.filter

# Load the filter results. The following cells read the dataset, tool, name,
# length and identity columns of this frame.
df = cocktail.filter.dataframe()

In [None]:
import functools
import operator

import cocktail

# Summarise every kmrf run by total length and mean identity. The k-mer size
# and ratio are parsed from the tool name, which is expected to split into
# exactly three "_"-separated parts ("kmrf_<kmer_size>_<ratio>").
data_kmrf = list()
for (group, data) in df.groupby(["dataset", "tool"]):
    if group[1].startswith("kmrf_"):
        (kmer_size, ratio) = group[1].split("_")[1:]
        data_kmrf.append((group[0], kmer_size, ratio, data["length"].sum(), data["identity"].mean()))

df_kmrf = pandas.DataFrame(data_kmrf, columns=["dataset", "kmer_size", "ratio", "length", "identity"])

# One "length" / "identity" scatter plot per dataset. Datasets are sorted
# because the iteration order of a set of strings changes between kernel
# sessions, which would otherwise shuffle the panels from run to run.
all_fig = list()
for dataset in sorted(set(df_kmrf.dataset)):
    all_fig.append(cocktail.utils.group_scatter(df_kmrf[df_kmrf["dataset"] == dataset],
                                                "length", "identity", "kmer_size", "ratio", dataset))

# Two panels per row, rows stacked vertically. For five datasets this builds
# exactly ((f0 | f1) & (f2 | f3)) & f4, but it no longer raises IndexError
# when the number of datasets differs.
rows = [functools.reduce(operator.or_, all_fig[start:start + 2]) for start in range(0, len(all_fig), 2)]
fig = functools.reduce(operator.and_, rows)

fig

In [None]:
import functools
import operator

import cocktail

# Summarise every filtlong run by total length and mean identity. The quality
# setting is the second "_"-separated part of the tool name.
data_filtlong = list()
for (group, data) in df.groupby(["dataset", "tool"]):
    if group[1].startswith("filtlong_"):
        quality = group[1].split("_")[1]
        data_filtlong.append((group[0], quality, data["length"].sum(), data["identity"].mean()))

df_filtlong = pandas.DataFrame(data_filtlong, columns=["dataset", "quality", "length", "identity"])

# One "length" / "identity" scatter plot per dataset. Datasets are sorted
# because the iteration order of a set of strings changes between kernel
# sessions, which would otherwise shuffle the panels from run to run.
all_fig = list()
for dataset in sorted(set(df_filtlong.dataset)):
    all_fig.append(cocktail.utils.group_scatter(df_filtlong[df_filtlong["dataset"] == dataset],
                                                "length", "identity", None, "quality", dataset))

# Two panels per row, rows stacked vertically. With two datasets this is
# f0 | f1 as before; any further dataset is now displayed instead of being
# silently dropped from the figure.
rows = [functools.reduce(operator.or_, all_fig[start:start + 2]) for start in range(0, len(all_fig), 2)]
fig = functools.reduce(operator.and_, rows)

fig

In [None]:
import collections

# figs[dataset][tool] holds the figure built for that dataset/tool pair.
figs = collections.defaultdict(dict)

for dataset in set(df.dataset):
    for tool in set(df.tool):
        sub_df = df[(df.dataset == dataset) & (df.tool == tool)]
        # Keep only rows below the 99.8th percentile of length. The quantile
        # is taken on the "length" column alone (DataFrame.quantile on a frame
        # holding string columns fails on recent pandas), the mask is built
        # from sub_df itself, and the threshold is not called `max` so the
        # builtin stays usable in the rest of the notebook.
        length_cap = sub_df["length"].quantile(0.998)
        sub_df = sub_df[sub_df["length"] < length_cap]

        figs[dataset][tool] = cocktail.filter.figure(sub_df)

In [None]:
# Display the figure stored for the "yeast" dataset and the "kmrf_k15_r70" tool.
figs["yeast"]["kmrf_k15_r70"]

In [None]:
import pandas
import cocktail
import cocktail.filter

raw = cocktail.filter.get_data("data/yeast/reads.len_id.tsv", "yeast", "raw")

df_raw = pandas.DataFrame(raw, columns=["dataset", "filter", "name", "length", "identity"])

# Names found in the yeast / kmrf_k15_r70 rows of the filter dataframe.
# (Not stored under the name `filter`, which would shadow the builtin.)
kept_names = set(df[(df.dataset == "yeast") & (df.tool == "kmrf_k15_r70")]["name"])

# Flag the raw reads whose name is absent from the kmrf output. A new column
# must be created with bracket assignment: `df_raw.filtered = ...` only sets a
# Python attribute on the object, which is not a column and is lost by the
# `.loc` selection below, leaving nothing for `color="filtered"` to use.
df_raw["filtered"] = ~df_raw["name"].isin(kept_names)

# Keep only rows below the 99.8th percentile of length. The quantile is taken
# on the "length" column alone because the frame also holds string columns,
# and the threshold is not called `max` so the builtin stays usable.
length_cap = df_raw["length"].quantile(0.998)
df_raw = df_raw.loc[df_raw["length"] < length_cap]

cocktail.filter.figure(df_raw, color="filtered")

### Assembly quality

## Correction

Compare br (with k in range(13, 19, 2)) against the canu correction module (2.0), consent and necat on all datasets.

In [None]:
# Run the `correct_all` snakemake target with SNAKEMAKE_CORE cores.
# -n requests a dry run: remove it to actually execute the jobs.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p correct_all -n

### Benchmark

In [9]:
import pandas
import cocktail
import cocktail.correct
import cocktail.utils

df = cocktail.correct.dataframe_bench()

# Optional: uncomment to hide the br_k13 / br_k15 / br_k17 rows and relabel
# br_k19 as plain "br".
#df = df[df.corrector != "br_k13"]
#df = df[df.corrector != "br_k15"]
#df = df[df.corrector != "br_k17"]

#df = df.replace({"br_k19": "br"})

# Both panels plot against the "size" column, one line per corrector.
shared_options = dict(color="corrector", point=True, xtitle="Reads file size in Mb")

fig_time = cocktail.utils.group_line(df, "size", "time", ytitle="Wall time in second", **shared_options)
fig_mem = cocktail.utils.group_line(df, "size", "memory", ytitle="Memory peak in Mb", **shared_options)

# Wall time on the left, memory peak on the right.
fig_time | fig_mem

### Error rate

In [10]:
import pandas
import cocktail
import cocktail.correct
import cocktail.utils

df = cocktail.correct.dataframe_stats()

# Optional: uncomment to hide the br_k13 / br_k15 / br_k17 rows and relabel
# br_k19 as plain "br".
#df = df[df.corrector != "br_k13"]
#df = df[df.corrector != "br_k15"]
#df = df[df.corrector != "br_k17"]

#df = df.replace({"br_k19": "br"})

# Multiply both rate columns by 100 so they match the "%" axis titles below.
for rate_column in ("raw", "corrected"):
    df[rate_column] = df[rate_column] * 100

fig_error = cocktail.utils.group_line(
    df, "raw", "corrected",
    color="corrector", point=True,
    xtitle="Original error rate in %",
    ytitle="Corrected error rate in %",
)

fig_error

### Recall & Precision

In [None]:
import cocktail

# Precision/recall rows restricted to the "bacteria" dataset.
df = cocktail.correct.dataframe_pr()
df = df.loc[lambda frame: frame["dataset"] == "bacteria"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

In [None]:
import cocktail

# Precision/recall rows restricted to the "yeast" dataset.
df = cocktail.correct.dataframe_pr()
df = df.loc[lambda frame: frame["dataset"] == "yeast"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

In [None]:
import cocktail

# Precision/recall rows restricted to the "metagenome" dataset.
df = cocktail.correct.dataframe_pr()
df = df.loc[lambda frame: frame["dataset"] == "metagenome"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

### Precision

#### Simulate

#### Real

### Effect of coverage

One dataset? br only?

### Diploid dataset

## Polish

Effect of polishing on miniasm assemblies, on all datasets (with k in range(13, 19, 2) and abundance in range(10, 30)).

In [None]:
# Run the `polish_all` snakemake target with SNAKEMAKE_CORE cores.
# -n requests a dry run: remove it to actually execute the jobs.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p polish_all -n

### Quast result