In [30]:
# Some setup
%load_ext autoreload

%autoreload 2

#import altair
#altair.data_transformers.disable_max_rows()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


DataTransformerRegistry.enable('default')

# Code name: Cocktail

A k-mer toolbox for using small k-mers on long reads.

- count kmer: pcon
- filter reads: kmerf
- correct reads: br
- polish assemblies: br

## Parameter

Some global variables used throughout this notebook; don't hesitate to adjust them.

In [None]:
# Value passed to `--cores` by every snakemake call in this notebook.
SNAKEMAKE_CORE = 8

## Dataset

### References

| dataset         | species            | info                          | genome size | 
|:----------------|:-------------------|:------------------------------|------------:|
| bacteria        | E. coli str. K-12 substr. MG1655 | NC_000913.3                   |      4.6 Mb |
| yeast           | S. cerevisiae            | NC_001133                     |     12.1Mb |
| metagenome      | Mock community           | [pacbio assembly](https://lomanlab.github.io/mockcommunity/)       |       42.1Mb |



### Reads

| dataset         | info           | # bases (Mb) | coverage    |
|:----------------|:---------------|-------------:|------------:|
| bacteria        | SRR10950300    |       588    |        126x |
| bacteria7       | TODO           |       591    |        127x |
| yeast           | Genoscope      |       34,474 |        283x |
| metagenome      | [R10 Native “3 Peaks” Data Release (2019-05-24)](https://lomanlab.github.io/mockcommunity/r10.html) |   30,909 | 733x |


## Prepare dataset

In [None]:
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p data_all -n

In [93]:
# Importing the submodule also binds the top-level `cocktail` package name.
import cocktail.kmer_spectrum

# k-mer spectrum for the "yeast" dataset (second argument is presumably k = 17).
df = cocktail.kmer_spectrum.get_kmer_spectrum("yeast", 17)

In [98]:
# Keep every row whose "type" is neither "all" nor "false", then plot the rest.
df2 = df[~df["type"].isin(["all", "false"])]
cocktail.kmer_spectrum.figure(df2)

## Count

Compare pcon against kmc (3.1.1) and jellyfish (2.2.10), with k in range(13, 19, 2), on all datasets.

In [None]:
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p count_all -n

### Benchmark

In [13]:
# Importing the submodules also binds the top-level `cocktail` package name.
import cocktail.count
import cocktail.utils

# Bacteria rows only, without the kmer_size "19" entries
# (kmer_size is compared against a string here).
df = cocktail.count.dataframe()
df = df[(df["dataset"] == "bacteria") & (df["kmer_size"] != "19")]
print(df)

cocktail.utils.group_scatter(df, "time", "memory", "counter", "kmer_size")

      counter   dataset kmer_size      time   memory
0        pcon  bacteria        13    3.5688    90.61
1        pcon  bacteria        15    7.5931   568.12
2        pcon  bacteria        17  145.6220  8251.18
12        kmc  bacteria        13   17.3429  4826.88
13        kmc  bacteria        15   19.2527  6063.93
14        kmc  bacteria        17   21.6563  5828.07
21  jellyfish  bacteria        13   22.7720   691.17
22  jellyfish  bacteria        15   11.3421  1970.44


In [5]:
# Importing the submodules also binds the top-level `cocktail` package name.
import cocktail.count
import cocktail.utils

# Counting benchmark restricted to the rows of the "yeast" dataset.
df = cocktail.count.dataframe().loc[lambda bench: bench["dataset"] == "yeast"]
print(df)

cocktail.utils.group_scatter(df, "time", "memory", "counter", "kmer_size")

      counter dataset kmer_size       time     memory
4        pcon   yeast        13    12.1763     128.81
5        pcon   yeast        15    25.2525     611.68
6        pcon   yeast        17   159.7308    8291.47
7        pcon   yeast        19  1303.5221  131165.46
16        kmc   yeast        13    33.0769    7606.12
17        kmc   yeast        15    43.6145    6977.93
18        kmc   yeast        17    79.4757   15930.41
19        kmc   yeast        19    49.8536   11481.88
23  jellyfish   yeast        13   132.5846    1780.30
24  jellyfish   yeast        15   250.9994   22117.92


In [6]:
# Importing the submodules also binds the top-level `cocktail` package name.
import cocktail.count
import cocktail.utils

# Counting benchmark restricted to the rows of the "metagenome" dataset.
df = cocktail.count.dataframe().loc[lambda bench: bench["dataset"] == "metagenome"]
print(df)

cocktail.utils.group_scatter(df, "time", "memory", "counter", "kmer_size")

      counter     dataset kmer_size       time     memory
8        pcon  metagenome        13    83.5108      80.40
9        pcon  metagenome        15   171.9505     560.27
10       pcon  metagenome        17   289.7087    8240.64
11       pcon  metagenome        19  1475.4770  131120.71
20        kmc  metagenome        13   173.3895    7849.75
25  jellyfish  metagenome        13   827.2320    1780.29


## Filter

Evaluate the effect of kmerf on the datasets and on the assemblies, with different values of k and different ratios.

In [None]:
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p filter_all -n

### Memory usage

### CPU time

### Read quality

In [76]:
import pandas
import cocktail
from cocktail.filter import figure

#df = cocktail.filter.dataframe()
#df = df[(df["dataset"] == "bacteria") & (df["tool"] == "kmrf_k15_r90")]

data = cocktail.filter.get_data("bacteria5.tsv", "bacteria", "raw")
df = pandas.DataFrame(data, columns=['dataset', 'tool', 'name', 'length', 'identity'])

# Keep only reads strictly shorter than the 99.5th percentile of read length
# (presumably so a few very long reads do not stretch the figure).
# The quantile is computed on the "length" column alone: DataFrame.quantile
# over the whole frame raises on non-numeric columns with pandas >= 2.0.
# The cutoff also gets its own name instead of shadowing the builtin `max`.
length_cutoff = df['length'].quantile(0.995)
df = df.loc[df['length'] < length_cutoff]
figure(df)

In [25]:
# Relies on `pandas`, `cocktail.filter` and `figure` imported by an earlier cell.
data = cocktail.filter.get_data("bacteria7_60.tsv", "bacteria", "raw")
df = pandas.DataFrame(data, columns=['dataset', 'tool', 'name', 'length', 'identity'])

# Keep only reads strictly shorter than the 99.8th percentile of read length.
# The quantile is computed on the "length" column alone: DataFrame.quantile
# over the whole frame raises on non-numeric columns with pandas >= 2.0.
# The cutoff also gets its own name instead of shadowing the builtin `max`.
length_cutoff = df['length'].quantile(0.998)
df = df.loc[df['length'] < length_cutoff]
figure(df)

### Assembly quality

## Correction

Compare br (with k in range(13, 19, 2)) against the canu correction module (2.0), consent and necat on all datasets.

In [None]:
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p correct_all -n

### Benchmark

In [42]:
import cocktail
# Import the submodules explicitly, as the counting benchmark cells do:
# `import cocktail` alone does not guarantee that `cocktail.correct` and
# `cocktail.utils` are loaded on a fresh kernel.
import cocktail.correct
import cocktail.utils

# Correction benchmark restricted to the rows of the "metagenome" dataset.
df = cocktail.correct.dataframe_bench()
df = df[df["dataset"] == "metagenome"]

# NOTE(review): the last recorded run of this cell failed with
# "ValueError: cannot convert float NaN to integer". The "counter" column name
# looks copied from the counting benchmark (the precision/recall cells group by
# "corrector") -- confirm the columns returned by dataframe_bench().
cocktail.utils.group_scatter(df, "time", "memory", "counter", "kmer_size")

ValueError: cannot convert float NaN to integer

### Error rate

#### Simulate

#### Real

### Recall & Precision

In [39]:
import cocktail
# Import the submodules explicitly, as the counting benchmark cells do:
# `import cocktail` alone does not guarantee that `cocktail.correct` and
# `cocktail.utils` are loaded on a fresh kernel.
import cocktail.correct
import cocktail.utils

# Precision/recall table restricted to the rows of the "bacteria" dataset.
df = cocktail.correct.dataframe_pr()
df = df[df["dataset"] == "bacteria"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

In [59]:
import cocktail
# Import the submodules explicitly, as the counting benchmark cells do:
# `import cocktail` alone does not guarantee that `cocktail.correct` and
# `cocktail.utils` are loaded on a fresh kernel.
import cocktail.correct
import cocktail.utils

# Precision/recall table restricted to the rows of the "yeast" dataset.
df = cocktail.correct.dataframe_pr()
df = df[df["dataset"] == "yeast"]

# NOTE(review): the last recorded run of this cell failed with
# "ValueError: cannot convert float NaN to integer" -- presumably missing
# values in the yeast rows; confirm before relying on this figure.
cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

ValueError: cannot convert float NaN to integer

In [None]:
import cocktail
# Import the submodules explicitly, as the counting benchmark cells do:
# `import cocktail` alone does not guarantee that `cocktail.correct` and
# `cocktail.utils` are loaded on a fresh kernel.
import cocktail.correct
import cocktail.utils

# Precision/recall table restricted to the rows of the "metagenome" dataset.
df = cocktail.correct.dataframe_pr()
df = df[df["dataset"] == "metagenome"]

cocktail.utils.group_scatter(df, "precision", "recall", "corrector")

### Precision

#### Simulate

#### Real

### Effect of coverage

TODO: one dataset only? br only?

### Diploid dataset

## Polish

Effect of polishing on miniasm assemblies for all datasets (with k in range(13, 19, 2) and abundance in range(10, 30)).

In [None]:
!snakemake --cores $SNAKEMAKE_CORE --use-conda -p polish_all -n

### Quast result