# Анализ данных NGS. Домашнее задание № 5

Выполнил: Олег Вавулов

In [1]:
# List the homework input files.
# NOTE(review): the directory is hard-coded here and repeated as DATA_PATH in
# the setup cell; keep the two in sync.
!ls ../../data/hw_5

ECOLI_IS220_QUAKE_1K_paired_reads.fasta s_6.first100000.fastq.gz
ECOLI_IS220_QUAKE_1K_single_reads.fasta test1.fasta
s_6.first1000.fastq.gz                  test2.fasta
s_6.first10000.fastq.gz


In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# ones that could point at real problems in the pipeline.
warnings.filterwarnings("ignore")

import os
# NOTE(review): the wildcard import hides which names come from debruijn. The
# cells in this notebook call build_debruijn_graph, get_edges,
# make_graph_compression, add_edges_statistics, remove_tips, remove_any,
# save_dot, dot2png and edges2fasta; none of them is defined in the notebook,
# so presumably all of them come from this import.
from debruijn import *

# Directory that holds the input read libraries.
DATA_PATH = "../../data/hw_5"
# k-mer length; passed to build_debruijn_graph and add_edges_statistics.
KMER_LEN = 55
# Thresholds passed as cov_cutoff / len_cutoff to remove_tips and remove_any.
COV_CUTOFF = 50
LEN_CUTOFF = 50

# Что исправить

- (DONE) покрытие по k+1 мерам
- (DONE) сжать после удаления хвостиков
- (DONE) ускорить (словари)
- (DONE) посчитать для больших библиотек

# De Bruijn graph

## Library S (1000 reads)

In [3]:
# Input reads and output directory for library S. The following cells read
# both names: READS_PATH when building the graph, RES_PATH when saving results.
READS_PATH = os.path.join(DATA_PATH, "s_6.first1000.fastq.gz")
RES_PATH = "results_S"

Compressed De Bruijn graph building

In [4]:
# Build the de Bruijn graph from the library S reads with k = KMER_LEN.
# The helper returns two values; judging by the names they are presumably an
# adjacency structure and per-k-mer coverage — confirm in debruijn.
adj_matrix_full, kmer_coverage = build_debruijn_graph(READS_PATH, KMER_LEN)

4108it [00:00, 4624.56it/s]


In [5]:
%%time
# Turn the graph returned by build_debruijn_graph into an edge table.
# This is still the uncompressed graph; compression is a separate step.
edges_full = get_edges(adj_matrix_full)

CPU times: user 20.5 ms, sys: 816 µs, total: 21.3 ms
Wall time: 20.5 ms


In [15]:
%%time
# Compress the full edge table into the working graph `edges`.
# In the recorded run this step takes seconds, versus tens of milliseconds for
# building the edge table, so it dominates the runtime for this library.
edges = make_graph_compression(edges_full)

CPU times: user 9.39 s, sys: 15.4 ms, total: 9.41 s
Wall time: 9.41 s


In [16]:
# Attach additional per-edge statistics derived from KMER_LEN and kmer_coverage.
# NOTE(review): this rebinds `edges` to its own processed version; re-running
# the cell without re-running the compression cell passes the already
# annotated table back in — confirm that this is safe.
edges = add_edges_statistics(edges, KMER_LEN, kmer_coverage)

In [17]:
# Save the compressed graph for library S under <RES_PATH>/full.
# Presumably save_dot writes the DOT file plus a JSON with node data, dot2png
# renders that DOT file to PNG, and edges2fasta writes the edge sequences as
# FASTA — confirm in debruijn.
res_path = os.path.join(RES_PATH, "full")
save_dot(edges, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges, res_path, "edges.fa")

<img src="results_S/full/graph.png">

Tips removal

In [33]:
# Prune tips from the compressed library S graph, using the notebook-wide
# coverage and length cutoffs.
edges_notips = remove_tips(edges, cov_cutoff=COV_CUTOFF, len_cutoff=LEN_CUTOFF)

In [34]:
# Save the tip-free graph for library S under <RES_PATH>/notips.
# Same three outputs as for the full graph (DOT + node JSON, PNG, FASTA),
# with a "_notips" suffix in every file name.
res_path = os.path.join(RES_PATH, "notips")
save_dot(edges_notips, res_path, "graph_notips.dot", "nodes_notips.json")
dot2png(res_path, "graph_notips.dot", "graph_notips.png")
edges2fasta(edges_notips, res_path, "edges_notips.fa")

<img src="results_S/notips/graph_notips.png">

Removal of any unreliable edges

In [35]:
# Alternative cleanup strategy: remove_any instead of remove_tips.
# It is applied to the same compressed `edges` (not to edges_notips) and with
# the same cutoffs, so both strategies start from identical input.
edges_noany = remove_any(
    edges, len_cutoff=LEN_CUTOFF, cov_cutoff=COV_CUTOFF
)

In [36]:
# Save the graph produced by remove_any for library S under <RES_PATH>/noany.
# Same three outputs as for the other variants (DOT + node JSON, PNG, FASTA),
# with a "_noany" suffix in every file name.
res_path = os.path.join(RES_PATH, "noany")
save_dot(edges_noany, res_path, "graph_noany.dot", "nodes_noany.json")
dot2png(res_path, "graph_noany.dot", "graph_noany.png")
edges2fasta(edges_noany, res_path, "edges_noany.fa")

<img src="results_S/noany/graph_noany.png">

При обеих стратегиях очистки графа получаем один и тот же результат, однако способ с удалением хвостиков более аккуратный, так как при его использовании гарантированно сохраняется число компонент связности в графе.

Final compression

In [37]:
%%time
# Re-compress the graph after tip removal.
# Only the first three columns of edges_notips are passed on. Presumably these
# are the columns make_graph_compression itself works with, and the remaining
# ones were appended by add_edges_statistics — TODO confirm, and prefer
# selecting the columns by name instead of by position.
edges_final = make_graph_compression(edges_notips.iloc[:, :3])

CPU times: user 30.6 ms, sys: 2.32 ms, total: 32.9 ms
Wall time: 31.7 ms


In [38]:
# Recompute the per-edge statistics for the re-compressed graph, using the same
# KMER_LEN and kmer_coverage as for the initial graph.
# NOTE(review): rebinds `edges_final` to its own processed version, so the cell
# is not independent of the preceding compression cell when re-run alone.
edges_final = add_edges_statistics(edges_final, KMER_LEN, kmer_coverage)

In [39]:
# Save the final (tip-free, re-compressed) graph for library S under
# <RES_PATH>/final. The file names match those in the "full" directory; only
# the directory differs.
res_path = os.path.join(RES_PATH, "final")
save_dot(edges_final, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges_final, res_path, "edges.fa")

<img src="results_S/final/graph.png">

## Library M (10,000 reads)

In [40]:
# Input reads and output directory for library M.
# NOTE(review): this rebinds the same READS_PATH / RES_PATH names used for
# library S, so cells of the two sections must not be run out of order.
READS_PATH = os.path.join(DATA_PATH, "s_6.first10000.fastq.gz")
RES_PATH = "results_M"

Compressed De Bruijn graph building

In [41]:
# Build the de Bruijn graph from the library M reads with k = KMER_LEN.
# This overwrites adj_matrix_full and kmer_coverage from the library S section.
adj_matrix_full, kmer_coverage = build_debruijn_graph(READS_PATH, KMER_LEN)

56320it [00:13, 4063.40it/s]


In [42]:
%%time
# Turn the library M graph into an (uncompressed) edge table.
edges_full = get_edges(adj_matrix_full)

CPU times: user 1.44 s, sys: 132 ms, total: 1.57 s
Wall time: 1.57 s


In [43]:
%%time
# Compress the full edge table for library M.
# In the recorded run this is the slowest step of the section, taking minutes
# where the edge-table step takes seconds; consider caching the result.
edges = make_graph_compression(edges_full)

CPU times: user 3min 11s, sys: 428 ms, total: 3min 11s
Wall time: 3min 12s


In [44]:
# Attach additional per-edge statistics derived from KMER_LEN and kmer_coverage.
# NOTE(review): rebinds `edges` to its own processed version; re-running this
# cell alone passes the already annotated table back in — confirm that is safe.
edges = add_edges_statistics(edges, KMER_LEN, kmer_coverage)

In [45]:
# Save the compressed graph for library M under <RES_PATH>/full.
# Presumably save_dot writes the DOT file plus a JSON with node data, dot2png
# renders that DOT file to PNG, and edges2fasta writes the edge sequences as
# FASTA — confirm in debruijn.
res_path = os.path.join(RES_PATH, "full")
save_dot(edges, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges, res_path, "edges.fa")

<img src="results_M/full/graph.png">

Tips removal

In [46]:
# Prune tips from the compressed library M graph, using the notebook-wide
# coverage and length cutoffs.
edges_notips = remove_tips(edges, cov_cutoff=COV_CUTOFF, len_cutoff=LEN_CUTOFF)

In [47]:
# Save the tip-free graph for library M under <RES_PATH>/notips.
# Same three outputs as for the full graph (DOT + node JSON, PNG, FASTA),
# with a "_notips" suffix in every file name.
res_path = os.path.join(RES_PATH, "notips")
save_dot(edges_notips, res_path, "graph_notips.dot", "nodes_notips.json")
dot2png(res_path, "graph_notips.dot", "graph_notips.png")
edges2fasta(edges_notips, res_path, "edges_notips.fa")

<img src="results_M/notips/graph_notips.png">

Final compression

In [49]:
%%time
# Re-compress the library M graph after tip removal.
# Only the first three columns of edges_notips are passed on. Presumably these
# are the columns make_graph_compression itself works with, and the remaining
# ones were appended by add_edges_statistics — TODO confirm, and prefer
# selecting the columns by name instead of by position.
edges_final = make_graph_compression(edges_notips.iloc[:, :3])

CPU times: user 216 ms, sys: 3.11 ms, total: 219 ms
Wall time: 218 ms


In [50]:
# Recompute the per-edge statistics for the re-compressed library M graph,
# using the same KMER_LEN and kmer_coverage as for the initial graph.
edges_final = add_edges_statistics(edges_final, KMER_LEN, kmer_coverage)

In [51]:
# Save the final (tip-free, re-compressed) graph for library M under
# <RES_PATH>/final. The file names match those in the "full" directory; only
# the directory differs.
res_path = os.path.join(RES_PATH, "final")
save_dot(edges_final, res_path, "graph.dot", "nodes.json")
dot2png(res_path, "graph.dot", "graph.png")
edges2fasta(edges_final, res_path, "edges.fa")

<img src="results_M/final/graph.png">