In [10]:
# Notebook setup: enable live module reloading and configure Altair.
# autoreload mode 2 re-imports modules before each cell executes, so edits to
# imported packages are picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

import altair
# Lift Altair's default limit on the number of rows a chart's dataset may hold.
altair.data_transformers.disable_max_rows()

def custom_theme():
    """Return the Altair theme configuration used throughout this notebook.

    The theme sets the default chart view to 400 wide by 300 high and selects
    the ``dark2`` scheme for the categorical colour range.
    """
    view_config = dict(height=300, width=400)
    range_config = dict(category=dict(scheme='dark2'))
    return dict(config=dict(view=view_config, range=range_config))

# register the custom_theme factory under the name 'custom_theme'
altair.themes.register('custom_theme', custom_theme)

# enable the newly registered theme
altair.themes.enable('custom_theme')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ThemeRegistry.enable('custom_theme')

# Code name: Cocktail

A k-mer toolbox for working with small k-mers on long reads.

- count kmer: pcon
- filter reads: kmerf
- correct reads: br
- polish assemblies: br

## Parameter

Global variables used throughout this notebook; feel free to adjust them.

In [None]:
# Value passed to `snakemake --cores` by the workflow cells of this notebook.
SNAKEMAKE_CORE = 8

## Dataset

### References

| dataset         | species            | info                          | genome size | 
|:----------------|:-------------------|:------------------------------|------------:|
| bacteria        | E. coli str. K-12 substr. MG1655 | NC_000913.3                   |      4.6 Mb |
| yeast           | S. cerevisiae            | NC_001133                     |     12.1 Mb |
| metagenome      | Mock community           | [pacbio assembly](https://lomanlab.github.io/mockcommunity/)       |      42.1 Mb |



### Reads

| dataset         | info           | # bases (Mb) | coverage    | error rate |
|:----------------|:---------------|-------------:|------------:|-----------:|
| bacteria        | SRR10950300    |       588    |        126x |       14 % |
| bacteria5       | TODO           |       250    |         54x |        6 % |
| bacteria7       | TODO           |       591    |        127x |        7 % |
| yeast           | Genoscope      |       34,474 |        283x |        8 % |
| metagenome      | [R10 Native “3 Peaks” Data Release (2019-05-24)](https://lomanlab.github.io/mockcommunity/r10.html) |   30,909 | 733x | 10 %|


## Prepare dataset

In [None]:
# Dry run (-n) of the data_all target; -p prints the shell commands. Remove -n to actually execute.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p data_all -n

In [None]:
import cocktail
import cocktail.kmer_spectrum

# Load the k-mer spectrum of the "bacteria5" dataset.
# NOTE(review): 15 is presumably the k-mer size — confirm against get_kmer_spectrum.
df = cocktail.kmer_spectrum.get_kmer_spectrum("bacteria5", 15)

In [None]:
# Keep only rows with a non-zero count and drop the aggregated "all" type.
# The columns must be selected with brackets: `df.count` resolves to the
# DataFrame.count *method*, so `df.count != 0` evaluates to a plain True
# instead of a per-row mask and the filter never applies.
df2 = df[(df["count"] != 0) & (df["type"] != "all")]

# Relabel the remaining values for the figure legend.
df2 = df2.replace({"true": "reference", "false": "erroneous"})

cocktail.kmer_spectrum.figure(df2)

## Count

Compare pcon against kmc (3.1.1) and jellyfish (2.2.10) for k in range(13, 21, 2), i.e. k = 13, 15, 17 and 19, on all datasets.

In [None]:
# Dry run (-n) of the count_all target; -p prints the shell commands. Remove -n to actually execute.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p count_all -n

### Benchmark

In [22]:
import cocktail
import cocktail.count
import cocktail.utils

# Load the counting benchmark and discard rows with missing measurements.
df = cocktail.count.dataframe()
df = df.dropna()
print(df)

# One time-vs-memory scatter per dataset. Iterate over the sorted unique
# dataset names: a bare set() of strings has no stable order between kernel
# sessions, which would shuffle the panels from one run to the next.
all_fig = [
    cocktail.utils.group_scatter(df[df.dataset == dataset], "time", "memory",
                                 color="counter", shape="kmer_size", title=dataset)
    for dataset in sorted(df.dataset.unique())
]

figs = cocktail.utils.fig_layout(all_fig, 4)

# Disabled exploration: size-vs-time regression per counter.
#df = df[df.counter != "jellyfish"]
#df = df[df.kmer_size == "19"]
#fig = cocktail.utils.group_scatter(df, "size", "time", color="counter")

#print(df.to_csv("tmp.csv"))
#line = fig.transform_regression('size', 'time', groupby=["counter"])
#reg = fig.transform_regression('size', 'time', groupby=["counter:N"], params=True)

#print(reg.encoding.to_dict())

#(fig + line.mark_line()) + reg
figs

          counter     dataset kmer_size      time     memory         size
0            pcon    bacteria        13   23.2677     173.10    591394705
1            pcon    bacteria        17   27.8702    8360.38    591394705
2            pcon    bacteria        19  150.7886  128931.77    591394705
3            pcon  metagenome        13  437.4146     835.82  32125836484
4            pcon  metagenome        15  164.6621    1288.88  32125836484
..            ...         ...       ...       ...        ...          ...
131  jellyfish_cd   bacteria7        19  163.3012   39333.71    606228083
132  jellyfish_cd   bacteria5        13   37.9216     125.75    253770734
133  jellyfish_cd   bacteria5        15   40.2693    1966.72    253770734
134  jellyfish_cd   bacteria5        17   96.7943   31220.48    253770734
135  jellyfish_cd   bacteria5        19   72.1206   39334.44    253770734

[136 rows x 6 columns]
0 4
start line alt.Chart(...)
continue line alt.HConcatChart(...)
continue line alt.HCon

## Filter

Evaluate the effect of kmerf on the datasets and on the resulting assemblies, for different values of k and of the ratio.

In [None]:
# Dry run (-n) of the filter_all target; -p prints the shell commands. Remove -n to actually execute.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p filter_all -n

### Benchmark

In [None]:
import cocktail
import cocktail.utils
import cocktail.filter

df = cocktail.filter.dataframe_bench()

# One time-vs-memory scatter per dataset, in sorted order so each dataset
# lands in the same panel on every run (set() iteration order is not stable
# between kernel sessions).
all_fig = [
    cocktail.utils.group_scatter(df[df.dataset == dataset], "time", "memory", "filter", None, dataset)
    for dataset in sorted(df.dataset.dropna().unique())
]

# Lay the panels out two per row. Building the grid from the list, instead of
# indexing all_fig[0]..all_fig[4], keeps the cell working when the benchmark
# contains fewer (or more) than five datasets.
# `altair` is imported in the setup cell at the top of the notebook.
rows = [altair.hconcat(*all_fig[start:start + 2]) for start in range(0, len(all_fig), 2)]
fig = altair.vconcat(*rows)

fig

### Error rate

In [None]:
import cocktail
import cocktail.utils
import cocktail.filter

df = cocktail.filter.dataframe_stats()

# Scale both error-rate columns by 100 so they match the "in %" axis titles.
for rate_column in ("raw", "corrected"):
    df[rate_column] = df[rate_column] * 100

fig_error = cocktail.utils.group_line(
    df, "raw", "corrected",
    color="filter", point=True,
    xtitle="Original error rate in %",
    ytitle="Corrected error rate in %",
)

fig_error

### Read quality

In [None]:
import pandas
import cocktail
import cocktail.filter

# Length/identity figure built from the yeast reads table, labelled "raw".
# NOTE(review): the meaning of the trailing "" argument is not visible here — confirm against get_id_data.
df = cocktail.filter.get_id_data("data/yeast/reads.len_id.tsv", "yeast", "raw", "")
cocktail.filter.figure_len_identity(df)

In [None]:
import pandas
import cocktail
import cocktail.filter

# Same length/identity figure, for the yeast reads with parameters "k15.r70".
# NOTE(review): "kmrf" differs from the tool name "kmerf" used in the prose — confirm it is the identifier identity_data expects.
df = cocktail.filter.identity_data("yeast", "kmrf", "k15.r70")
cocktail.filter.figure_len_identity(df)

In [None]:
import pandas
import cocktail
import cocktail.filter

# Figure for the yeast dataset with parameters "k15.r70".
# NOTE(review): "kmrf" differs from the tool name "kmerf" used in the prose — confirm it is the identifier figure_filtred expects.
cocktail.filter.figure_filtred("yeast", "kmrf", "k15.r70")

### Assembly quality

## Correction

Compare br (with k in range(13, 21, 2), i.e. k = 13, 15, 17 and 19) against the canu correction module (2.0), consent and necat on all datasets.

In [None]:
# Dry run (-n) of the correct_all target; -p prints the shell commands. Remove -n to actually execute.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p correct_all -n

### Benchmark

In [None]:
import pandas
import cocktail
import cocktail.correct
import cocktail.utils

df = cocktail.correct.dataframe_bench()

#df = df[df.corrector != "br_k13"]
#df = df[df.corrector != "br_k15"]
#df = df[df.corrector != "br_k17"]

#df = df.replace({"br_k19": "br"})

# Both panels share the x axis (reads file size), the colouring and the point markers.
shared_options = dict(color="corrector", point=True, xtitle="Reads file size in Mb")

fig_time = cocktail.utils.group_line(df, "size", "time", ytitle="Wall time in second", **shared_options)
fig_mem = cocktail.utils.group_line(df, "size", "memory", ytitle="Memory peak in Mb", **shared_options)

# Time on the left, memory on the right.
fig_time | fig_mem

### Error rate

In [None]:
import pandas
import cocktail
import cocktail.correct
import cocktail.utils

df = cocktail.correct.dataframe_stats()

#df = df[df.corrector != "br_k13"]
#df = df[df.corrector != "br_k15"]
#df = df[df.corrector != "br_k17"]

#df = df.replace({"br_k19": "br"})

# Scale both error-rate columns by 100 so they match the "in %" axis titles.
for rate_column in ("raw", "corrected"):
    df[rate_column] = df[rate_column] * 100

fig_error = cocktail.utils.group_line(
    df, "raw", "corrected",
    color="corrector", point=True,
    xtitle="Original error rate in %",
    ytitle="Corrected error rate in %",
)

fig_error

### Recall & Precision

In [None]:
import cocktail
# Import the submodules this cell uses explicitly, as the other cells do,
# instead of relying on an earlier cell having loaded them.
import cocktail.correct
import cocktail.utils

# Precision/recall of each corrector on the bacteria dataset.
df = cocktail.correct.dataframe_pr()
df = df[df.dataset == "bacteria"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

In [None]:
import cocktail
# Import the submodules this cell uses explicitly, as the other cells do,
# instead of relying on an earlier cell having loaded them.
import cocktail.correct
import cocktail.utils

# Precision/recall of each corrector on the yeast dataset.
df = cocktail.correct.dataframe_pr()
df = df[df.dataset == "yeast"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

In [None]:
import cocktail
# Import the submodules this cell uses explicitly, as the other cells do,
# instead of relying on an earlier cell having loaded them.
import cocktail.correct
import cocktail.utils

# Precision/recall of each corrector on the metagenome dataset.
df = cocktail.correct.dataframe_pr()
df = df[df.dataset == "metagenome"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

### Precision

#### Simulate

#### Real

### Effect of coverage

One dataset ? br only ?

### Diploid dataset

## Polish

Effect of polishing on miniasm assemblies for all datasets (with k in range(13, 19, 2) and abundance in range(10, 30)).

In [None]:
# Dry run (-n) of the polish_all target; -p prints the shell commands. Remove -n to actually execute.
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p polish_all -n

### Quast result