# Анализ данных NGS. Домашнее задание № 5

Выполнил: Олег Вавулов

In [1]:
# List the input files for this assignment (same directory as DATA_PATH in the configuration cell).
!ls ../../data/hw_5

ECOLI_IS220_QUAKE_1K_paired_reads.fasta s_6.first100000.fastq.gz
ECOLI_IS220_QUAKE_1K_single_reads.fasta test1.fasta
s_6.first1000.fastq.gz                  test2.fasta
s_6.first10000.fastq.gz


In [63]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

import os
from debruijn import *

DATA_PATH = "../../data/hw_5"
KMER_LEN = 55
COV_CUTOFF = 50
LEN_CUTOFF = 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Что исправить

- (DONE) покрытие по k+1 мерам
- (DONE) сжать после удаления хвостиков
- (DONE) ускорить (словари)
- (DONE) посчитать для больших библиотек

# De Bruijn graph

## Library S (1000 reads)

In [56]:
# Input reads and output directory for library S; the cells of this section read these two names.
READS_PATH = os.path.join(DATA_PATH, "s_6.first1000.fastq.gz")
RES_PATH = "results_S"

Compressed De Bruijn graph building

In [57]:
# Build the de Bruijn graph from the library S reads, passing KMER_LEN as the second argument.
# Judging by the names, the two results are an adjacency list and k-mer coverage — TODO confirm in debruijn.
adjlist_full, kmer_coverage = build_debruijn_graph(READS_PATH, KMER_LEN)

4108it [00:00, 4790.58it/s]


In [58]:
%%time
# Create the edges table from the adjacency list of the uncompressed graph.
edges_full = get_edges(adjlist_full)

CPU times: user 5.35 ms, sys: 71 µs, total: 5.42 ms
Wall time: 5.4 ms


In [117]:
%%time
# Compress the graph described by the full edges table.
# Slowest step of this section in the recorded run (about 7.7 s).
edges = make_graph_compression(edges_full)

CPU times: user 7.69 s, sys: 9.48 ms, total: 7.7 s
Wall time: 7.7 s


In [25]:
# Calculate additional statistics on the edges; the helper receives KMER_LEN and the
# k-mer coverage returned by the graph-building step.
# NOTE(review): this rebinds `edges`, so re-running the cell alone feeds the already
# annotated table back into the helper — confirm the helper tolerates that.
edges = add_edges_statistics(edges, KMER_LEN, kmer_coverage)

In [26]:
# Save the compressed graph under RES_PATH/full: DOT file with nodes JSON,
# a PNG produced from the DOT file, and the edges as FASTA.
res_path = os.path.join(RES_PATH, "full")
save_dot(edges, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges, res_path, "edges.fa")

<img src="./results_S/full/graph.png">

Tips removal

In [27]:
# Remove tips from the compressed graph, using the shared length and coverage cutoffs
# from the configuration cell.
edges_notips = remove_tips(
    edges, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [28]:
# Save the graph after tips removal under RES_PATH/notips (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "notips")
save_dot(edges_notips, res_path, "graph_notips.dot", "nodes_notips.json")
dot2png(res_path, "graph_notips.dot", "graph_notips.png")
edges2fasta(edges_notips, res_path, "edges_notips.fa")

<img src="results_S/notips/graph_notips.png">

Removal of any unreliable edges

In [30]:
# Removal of any unreliable edges (the alternative strategy to tips removal).
# Applied to the same compressed graph `edges`, with the same cutoffs, so the two results can be compared.
edges_noany = remove_any(
    edges, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [31]:
# Save the graph after removal of any unreliable edges under RES_PATH/noany (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "noany")
save_dot(edges_noany, res_path, "graph_noany.dot", "nodes_noany.json")
dot2png(res_path, "graph_noany.dot", "graph_noany.png")
edges2fasta(edges_noany, res_path, "edges_noany.fa")

<img src="results_S/noany/graph_noany.png">

При обеих стратегиях очистки графа получаем один и тот же результат, однако способ с удалением хвостиков более аккуратный, так как при его использовании гарантированно сохраняется число компонент связности в графе.

Final compression

In [32]:
%%time
# Compress the graph again after tips removal.
# Only the first three columns are passed — presumably the ones that existed before
# add_edges_statistics appended its statistics; TODO confirm the column layout.
edges_final = make_graph_compression(edges_notips.iloc[:, :3])

CPU times: user 30.9 ms, sys: 1.75 ms, total: 32.7 ms
Wall time: 31.8 ms


In [33]:
# Calculate additional statistics on the edges of the final graph, with the same
# KMER_LEN and k-mer coverage as for the first compression.
# NOTE(review): rebinds `edges_final`; re-run the final compression cell before re-running this one.
edges_final = add_edges_statistics(edges_final, KMER_LEN, kmer_coverage)

In [34]:
# Save the final graph under RES_PATH/final (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "final")
save_dot(edges_final, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges_final, res_path, "edges.fa")

<img src="results_S/final/graph.png">

## Library M (10,000 reads)

In [119]:
# Input reads and output directory for library M; rebinding these names switches the
# cells of this section to the 10,000-read file.
READS_PATH = os.path.join(DATA_PATH, "s_6.first10000.fastq.gz")
RES_PATH = "results_M"

Compressed De Bruijn graph building

In [120]:
# Build the de Bruijn graph from the library M reads, passing KMER_LEN as the second argument.
# Overwrites adjlist_full and kmer_coverage from the previous library.
adjlist_full, kmer_coverage = build_debruijn_graph(READS_PATH, KMER_LEN)

56320it [00:14, 3994.31it/s]


In [121]:
%%time
# Create the edges table from the adjacency list of the uncompressed library M graph.
edges_full = get_edges(adjlist_full)

CPU times: user 35.1 ms, sys: 1.16 ms, total: 36.3 ms
Wall time: 35.6 ms


In [122]:
%%time
# Compress the graph described by the full edges table.
# Slowest step of this section in the recorded run (about 2 min 49 s).
edges = make_graph_compression(edges_full)

CPU times: user 2min 49s, sys: 246 ms, total: 2min 49s
Wall time: 2min 49s


In [123]:
# Calculate additional statistics on the edges; the helper receives KMER_LEN and the
# k-mer coverage returned by the graph-building step.
# NOTE(review): this rebinds `edges`, so re-running the cell alone feeds the already
# annotated table back into the helper — confirm the helper tolerates that.
edges = add_edges_statistics(edges, KMER_LEN, kmer_coverage)

In [124]:
# Save the compressed graph under RES_PATH/full: DOT file with nodes JSON,
# a PNG produced from the DOT file, and the edges as FASTA.
res_path = os.path.join(RES_PATH, "full")
save_dot(edges, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges, res_path, "edges.fa")

<img src="results_M/full/graph.png">

Tips removal

In [125]:
# Remove tips from the compressed graph, using the shared length and coverage cutoffs
# from the configuration cell.
edges_notips = remove_tips(
    edges, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [126]:
# Save the graph after tips removal under RES_PATH/notips (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "notips")
save_dot(edges_notips, res_path, "graph_notips.dot", "nodes_notips.json")
dot2png(res_path, "graph_notips.dot", "graph_notips.png")
edges2fasta(edges_notips, res_path, "edges_notips.fa")

<img src="results_M/notips/graph_notips.png">

Final compression

In [127]:
%%time
# Compress the graph again after tips removal.
# Only the first three columns are passed — presumably the ones that existed before
# add_edges_statistics appended its statistics; TODO confirm the column layout.
edges_final = make_graph_compression(edges_notips.iloc[:, :3])

CPU times: user 178 ms, sys: 2.83 ms, total: 181 ms
Wall time: 181 ms


In [128]:
# Calculate additional statistics on the edges of the final graph, with the same
# KMER_LEN and k-mer coverage as for the first compression.
# NOTE(review): rebinds `edges_final`; re-run the final compression cell before re-running this one.
edges_final = add_edges_statistics(edges_final, KMER_LEN, kmer_coverage)

In [129]:
# Save the final graph under RES_PATH/final (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "final")
save_dot(edges_final, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges_final, res_path, "edges.fa")

<img src="results_M/final/graph.png">

## Library L (100,000 reads)

In [130]:
# Input reads and output directory for library L; rebinding these names switches the
# cells of this section to the 100,000-read file.
READS_PATH = os.path.join(DATA_PATH, "s_6.first100000.fastq.gz")
RES_PATH = "results_L"

Compressed De Bruijn graph building

In [131]:
# Build the de Bruijn graph from the library L reads, passing KMER_LEN as the second argument.
# Overwrites adjlist_full and kmer_coverage from the previous library.
adjlist_full, kmer_coverage = build_debruijn_graph(READS_PATH, KMER_LEN)

612348it [02:45, 3710.69it/s]


In [132]:
%%time
# Create the edges table from the adjacency list of the uncompressed library L graph.
edges_full = get_edges(adjlist_full)

CPU times: user 566 ms, sys: 9.85 ms, total: 576 ms
Wall time: 574 ms


In [141]:
%%time
# Compress the graph described by the full edges table.
# WARNING: took about 5 hours in the recorded run; avoid re-running unless needed.
edges = make_graph_compression(edges_full)

100%|██████████| 203240/203240 [5:00:29<00:00, 11.27it/s]  


CPU times: user 4h 58min, sys: 2min 40s, total: 5h 40s
Wall time: 5h 30s


In [143]:
# Calculate additional statistics on the edges; the helper receives KMER_LEN and the
# k-mer coverage returned by the graph-building step.
# NOTE(review): this rebinds `edges`, and recomputing its input takes hours, so avoid
# re-running this cell alone unless the helper is known to accept an annotated table.
edges = add_edges_statistics(edges, KMER_LEN, kmer_coverage)

In [144]:
# Save the compressed graph under RES_PATH/full: DOT file with nodes JSON,
# a PNG produced from the DOT file, and the edges as FASTA.
res_path = os.path.join(RES_PATH, "full")
save_dot(edges, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges, res_path, "edges.fa")

Tips removal

In [145]:
# Remove tips from the compressed graph, using the shared length and coverage cutoffs
# from the configuration cell.
edges_notips = remove_tips(
    edges, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [147]:
# Save the graph after tips removal under RES_PATH/notips (DOT + nodes JSON, PNG, FASTA).
res_path = os.path.join(RES_PATH, "notips")
save_dot(edges_notips, res_path, "graph_notips.dot", "nodes_notips.json")
dot2png(res_path, "graph_notips.dot", "graph_notips.png")
edges2fasta(edges_notips, res_path, "edges_notips.fa")

Final compression

In [161]:
%%time
# Compress the graph again after tips removal.
# Only the first three columns are passed — presumably the ones that existed before
# add_edges_statistics appended its statistics; TODO confirm the column layout.
edges_final = make_graph_compression(edges_notips.iloc[:, :3])

0it [00:00, ?it/s]

CPU times: user 5.81 ms, sys: 1.56 ms, total: 7.37 ms
Wall time: 6.49 ms





In [162]:
# Calculate additional statistics on the edges of the final graph, with the same
# KMER_LEN and k-mer coverage as for the first compression.
# NOTE(review): rebinds `edges_final`; re-run the final compression cell before re-running this one.
edges_final = add_edges_statistics(edges_final, KMER_LEN, kmer_coverage)

In [160]:
# Exploratory check: tips removal applied to the final compressed graph.
# The result gets its own name so that `edges_notips` (the input of the
# "Final compression" cell) is not overwritten; overwriting it made that cell
# produce a different graph when re-run.
edges_final_notips = remove_tips(
    edges_final, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [156]:
# Exploratory check: removal of any unreliable edges from the final compressed graph.
# The result gets its own name so that `edges_notips` (the input of the
# "Final compression" cell) is not overwritten by a graph produced with a
# different strategy.
edges_final_noany = remove_any(
    edges_final, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [159]:
# Save the final graph under RES_PATH/final (DOT + nodes JSON, PNG, FASTA).
# NOTE(review): the recorded execution counts show this cell (In [159]) ran before the
# final compression (In [161]) and statistics (In [162]) cells; re-run top to bottom
# so the saved files match the current `edges_final`.
res_path = os.path.join(RES_PATH, "final")
save_dot(edges_final, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges_final, res_path, "edges.fa")

<img src="results_L/final/graph.png">