[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mitiau/DNABERT-Z/blob/main/ZDNA-prediction.ipynb)

## Тут поиск вторичных структур и подготовка генов к выравниванию

# Install dependecies and define helper functions

In [1]:
!pip install transformers
!pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [2]:
import torch
from torch import nn
import transformers
from transformers import BertTokenizer, BertForTokenClassification
import numpy as np
from Bio import SeqIO
from io import StringIO, BytesIO
from google.colab import drive, files
from tqdm import tqdm
import pickle
import scipy
from scipy import ndimage

In [3]:
def seq2kmer(seq, k):
    kmer = [seq[x:x+k] for x in range(len(seq)+1-k)]
    return kmer

def split_seq(seq, length = 512, pad = 16):
    res = []
    for st in range(0, len(seq), length - pad):
        end = min(st+512, len(seq))
        res.append(seq[st:end])
    return res

def stitch_np_seq(np_seqs, pad = 16):
    res = np.array([])
    for seq in np_seqs:
        res = res[:-pad]
        res = np.concatenate([res,seq])
    return res

# Select model and parameters

In [4]:
model = 'HG kouzine' #@param ["HG chipseq", "HG kouzine", "MM chipseq", "MM kouzine"]
model_confidence_threshold = 0.05 #@param {type:"number"}
minimum_sequence_length = 5 #@param {type:"integer"}

In [5]:
if model == 'HG chipseq':
    model_id = '1VAsp8I904y_J0PUhAQqpSlCn1IqfG0FB'
elif model == 'HG kouzine':
    model_id = '1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx'
elif model == 'MM curax':
    model_id = '1W6GEgHNoitlB-xXJbLJ_jDW4BF35W1Sd'
elif model == 'MM kouzine':
    model_id = '1dXpQFmheClKXIEoqcZ7kgCwx6hzVCv3H'


In [6]:
!gdown $model_id
!gdown 10sF8Ywktd96HqAL0CwvlZZUUGj05CGk5
!gdown 16bT7HDv71aRwyh3gBUbKwign1mtyLD2d
!gdown 1EE9goZ2JRSD8UTx501q71lGCk-CK3kqG
!gdown 1gZZdtAoDnDiLQqjQfGyuwt268Pe5sXW0


!mkdir 6-new-12w-0
!mv pytorch_model.bin 6-new-12w-0/
!mv config.json 6-new-12w-0/
!mv special_tokens_map.json 6-new-12w-0/
!mv tokenizer_config.json 6-new-12w-0/
!mv vocab.txt 6-new-12w-0/

Downloading...
From (original): https://drive.google.com/uc?id=1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx
From (redirected): https://drive.google.com/uc?id=1dAeAt5Gu2cadwDhbc7OnenUgDLHlUvkx&confirm=t&uuid=2b579560-0656-47b7-935f-ab339e860dc5
To: /content/pytorch_model.bin
100% 354M/354M [00:04<00:00, 79.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=10sF8Ywktd96HqAL0CwvlZZUUGj05CGk5
To: /content/config.json
100% 634/634 [00:00<00:00, 2.57MB/s]
Downloading...
From: https://drive.google.com/uc?id=16bT7HDv71aRwyh3gBUbKwign1mtyLD2d
To: /content/special_tokens_map.json
100% 112/112 [00:00<00:00, 438kB/s]
Downloading...
From: https://drive.google.com/uc?id=1EE9goZ2JRSD8UTx501q71lGCk-CK3kqG
To: /content/tokenizer_config.json
100% 40.0/40.0 [00:00<00:00, 157kB/s]
Downloading...
From: https://drive.google.com/uc?id=1gZZdtAoDnDiLQqjQfGyuwt268Pe5sXW0
To: /content/vocab.txt
100% 28.7k/28.7k [00:00<00:00, 51.0MB/s]


In [7]:
tokenizer = BertTokenizer.from_pretrained('6-new-12w-0/')
model = BertForTokenClassification.from_pretrained('6-new-12w-0/')
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4101, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Upload fasta files for prediction

In [8]:
uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

KeyboardInterrupt: 

# Predict and save raw outputs

In [None]:
out = []
for key in uploaded.keys():
    print(key)
    out.append(key)
    result_dict = {}
    for seq_record in SeqIO.parse(StringIO(BytesIO(uploaded[key]).read().decode('UTF-8')), 'fasta'):
        kmer_seq = seq2kmer(str(seq_record.seq).upper(), 6)
        seq_pieces = split_seq(kmer_seq)
        print(seq_record.name)
        out.append(seq_record.name)
        with torch.no_grad():
            preds = []
            for seq_piece in tqdm(seq_pieces):
                input_ids = torch.LongTensor(tokenizer.encode(' '.join(seq_piece), add_special_tokens=False))
                outputs = torch.softmax(model(input_ids.cuda().unsqueeze(0))[-1],axis = -1)[0,:,1]
                preds.append(outputs.cpu().numpy())
        result_dict[seq_record.name] = stitch_np_seq(preds)



        labeled, max_label = scipy.ndimage.label(result_dict[seq_record.name]>model_confidence_threshold)
        print('  start     end')
        out.append('  start     end')
        for label in range(1, max_label+1):
            candidate = np.where(labeled == label)[0]
            candidate_length = candidate.shape[0]
            if candidate_length>minimum_sequence_length:
                print('{:8}'.format(candidate[0]), '{:8}'.format(candidate[-1]))
                out.append('{:8}'.format(candidate[0]) + '{:8}'.format(candidate[-1]))

    with open(key + '.preds.pkl',"wb") as fh:
      pickle.dump(result_dict, fh)
    print()

with open('text_predictions.txt',"w") as fh:
    for item in out:
        fh.write("%s\n" % item)


# Download text file with predictions

In [None]:
files.download('text_predictions.txt')

# Download raw prediction files in numpy format

In [None]:
for key in uploaded.keys():
    files.download(key + '.preds.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
uploaded.keys()

NameError: name 'uploaded' is not defined

## Скачиваем несколько необходимых файлов

In [9]:
!gdown 1tvJDi-DYx3NEescA7AB1tfwTMMMIk90Z
!gdown 1bhf5HhzDmFzS01hfwFPOHhzjl2KX20zP
!gdown 1P-jBUfISSKbnaixTY6ehic4r-w-PCO5Z

Downloading...
From (original): https://drive.google.com/uc?id=1tvJDi-DYx3NEescA7AB1tfwTMMMIk90Z
From (redirected): https://drive.google.com/uc?id=1tvJDi-DYx3NEescA7AB1tfwTMMMIk90Z&confirm=t&uuid=86eb98d4-7bde-409d-924f-a62859c1f4e0
To: /content/GCA_903995115.1_Adeanei_nanopore_chromosomes_genomic.fna.Z-SCORE
100% 1.90G/1.90G [00:32<00:00, 57.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1bhf5HhzDmFzS01hfwFPOHhzjl2KX20zP
To: /content/text_predictions.txt
100% 252k/252k [00:00<00:00, 122MB/s]
Downloading...
From: https://drive.google.com/uc?id=1P-jBUfISSKbnaixTY6ehic4r-w-PCO5Z
To: /content/GCA_903995115.1_Adeanei_nanopore_chromosomes_genomic.fna
100% 21.2M/21.2M [00:00<00:00, 23.0MB/s]


Далее поиск квадруплексов

In [24]:
import re
from Bio import SeqIO

mas = []

pattern="(?:G{3,}[ATGC]{1,7}){3,}G{3,}"
pattern_minus = "(?:C{3,}[ATGC]{1,7}){3,}C{3,}"
for record in SeqIO.parse("GCA_905183005.1_Paramecium_pentaurelia_V1_genomic.fna",'fasta'):
  for m in re.finditer(pattern, str(record.seq),re.IGNORECASE):
    mas.append([record.id, m.start(),m.end(),m.group(0)])
  for m in re.finditer(pattern_minus, str(record.seq),re.IGNORECASE):
    mas.append([record.id, m.start(),m.end(),m.group(0)])

with open("pqs.bed", "w") as f:
  for i in mas:
    f.write(f"{i[0]}\t{i[1]}\t{i[2]}\n")

## Скачиваем bedtools

In [10]:
!wget github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools-2.30.0.tar.gz
!tar -zxvf bedtools-2.30.0.tar.gz

--2024-06-16 12:41:16--  http://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools-2.30.0.tar.gz
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools-2.30.0.tar.gz [following]
--2024-06-16 12:41:16--  https://github.com/arq5x/bedtools2/releases/download/v2.30.0/bedtools-2.30.0.tar.gz
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/15059334/aecc9080-5d79-11eb-847e-0ff72e8f556a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240616%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240616T124116Z&X-Amz-Expires=300&X-Amz-Signature=516c94cb26db62daf31e90279a135f5b3e57c5a99c87972e4b4930d8b9c

In [11]:
!cd bedtools2; make

Building BEDTools:
CXXFLAGS is [-g -Wall -O2 -std=c++11 ]
DETECTED_VERSION = v2.30.0
CURRENT_VERSION  = 
Updating version file.
  * compiling src/bedtools.cpp
  * compiling src/annotateBed/annotateMain.cpp
  * compiling src/annotateBed/annotateBed.cpp
  * compiling src/bamToBed/bamToBed.cpp
[01m[Ksrc/bamToBed/bamToBed.cpp:[m[K In function ‘[01m[Kint bamtobed_main(int, char**)[m[K’:
   88 |     bool [01;35m[KuseAlignmentScore[m[K = false;
      |          [01;35m[K^~~~~~~~~~~~~~~~~[m[K
In file included from [01m[Ksrc/utils/BamTools/include/api/BamReader.h:1[m[K,
                 from [01m[Ksrc/bamToBed/bamToBed.cpp:12[m[K:
[01m[Ksrc/utils/BamTools/include/SamHeader.hpp:[m[K In function ‘[01m[Kint htslib_future::sam_hdr_rebuild(bam_hdr_t*)[m[K’:
   32 |                 [01;35m[Ksam_hdr_write(fp, hdr)[m[K;
      |                 [01;35m[K~~~~~~~~~~~~~^~~~~~~~~[m[K
  * compiling src/bamToFastq/bamToFastqMain.cpp
[01m[Ksrc/bamToFastq/bamToFastqMain

In [12]:
!cp /content/bedtools2/bin/* /usr/local/bin/

In [13]:
! wget https://github.com/bedops/bedops/releases/download/v2.4.41/bedops_linux_x86_64-v2.4.41.tar.bz2
! tar jxvf bedops_linux_x86_64-v2.4.41.tar.bz2
! cp bin/* /usr/local/bin

--2024-06-16 12:45:35--  https://github.com/bedops/bedops/releases/download/v2.4.41/bedops_linux_x86_64-v2.4.41.tar.bz2
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/12932856/7baae005-767f-4700-bd69-68f44f9a01bf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240616%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240616T124535Z&X-Amz-Expires=300&X-Amz-Signature=bbc0793aec22ffe9cfe305fd00cebc9fc63d417ca5cac0ad9097aa1ae3d551a9&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=12932856&response-content-disposition=attachment%3B%20filename%3Dbedops_linux_x86_64-v2.4.41.tar.bz2&response-content-type=application%2Foctet-stream [following]
--2024-06-16 12:45:35--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/12932856/7baa

С .gff неудобно работать, потому переделаем его в .bed

In [26]:
!sortBed -i genomic.gff | gff2bed --do-not-sort > genomic.bed

Error: The requested file (genomic.gff) could not be opened. Error message: (No such file or directory). Exiting!


Тоже самое сделаем с выводом нейросети

In [15]:
import re

gene = ""
with open ("text_predictions.txt", "r") as f:
  with open("bed_predictions1.bed", "w") as o:
    while s := f.readline():
      if "NW" in s:
        gene = s.rstrip()
      elif ("GCF" in s) or ("start" in s):
        pass
      else:
        mas = re.findall(r'\d+', s)
        o.write(f"{gene}\t{mas[0]}\t{mas[1]}\n")

Теперь можно прочесть файл

In [25]:
import pandas as pd

genome = pd.read_csv("genomic.bed", names=["chrom", "start", "end", "score1", "score2", "strand", "lab", "type", "phase", "attributes"], sep="\t")

genome

Unnamed: 0,chrom,start,end,score1,score2,strand,lab,type,phase,attributes


Квадруплексы надо в пандас для удобства

In [27]:
pqs = pd.read_csv("pqs.bed", names=["chrom", "start", "end"], sep="\t")

pqs

Unnamed: 0,chrom,start,end
0,CAJJDO010000002.1,534196,534314
1,CAJJDO010000009.1,519367,519397
2,CAJJDO010000010.1,67153,67174
3,CAJJDO010000012.1,289822,289851
4,CAJJDO010000012.1,0,76
...,...,...,...
86,CAJJDO010000222.1,3603,3643
87,CAJJDO010000229.1,0,21
88,CAJJDO010000242.1,8289,8376
89,CAJJDO010000254.1,10069,10096


Вспомогательная функция для поиска межгенных попаданий

In [28]:
def find_between(gen, needle, find_type: str):
  ex1 = None
  ex2 = None
  ned_types = gen.loc[((gen["type"] == find_type) & (gen["chrom"] == needle["chrom"]))]
  for i in range(len(ned_types) - 1):
      if ex1 is None:
        ex1 = ned_types.iloc[[i]]
      else:
        ex2 = ned_types.iloc[[i]]
        if ((ex1["end"].item() <= needle["start"]) and (ex2["start"].item() >= needle["end"])) \
          or ((ex1["start"].item() <= needle["start"]) and (ex2["start"].item() >= needle["end"])) \
          or ((ex1["end"].item() <= needle["start"]) and (ex2["end"].item() >= needle["end"])) \
          or ((ex1["start"].item() <= needle["start"]) and (ex2["end"].item() >= needle["end"])
          ):
            return [ex1, ex2]


        ex1 = ex2
  else:
    return []

Большая функция для поиска вхождений всех структур, а также возвращения генов в промотеры которых попали они

In [29]:
def find_count(genome, pqs):
  searching_genes = {}
  exons_count = 0
  psq_count = 0
  down_count = 0
  intron_count = 0
  intergenic_count = 0

  for i in pqs.iterrows():
    print(i[1])
    exon_pattern = genome.loc[(
      (genome["type"] == "exon") & (genome["chrom"] == i[1]["chrom"]) &
      (
          ((genome["start"] <= i[1]["start"]) & (genome["end"] >= i[1]["end"]))
            | ((genome["start"] <= i[1]["end"]) & (genome["start"] >= i[1]["start"]))
            | ((genome["end"] <= i[1]["end"]) & (genome["start"] <= i[1]["start"]) & (genome["end"] >= i[1]["start"]))
            | ((genome["start"] >= i[1]["start"]) & (genome["end"] <= i[1]["end"]))
        )
    )]

    psq_pattern_plus = genome.loc[(
      (genome["strand"] == "+") & (genome["type"] == "gene") & (genome["chrom"] == i[1]["chrom"]) &
      (
          ((genome["start"] < i[1]["start"]) & (genome["end"] > i[1]["end"]))
              | (((genome["start"] - 1000) <= i[1]["end"]) & ((genome["start"] - 1000) >= i[1]["start"]))
              | (((genome["start"] - 1000) <= i[1]["start"]) & (genome["start"] >= i[1]["start"]))
      )
    )]

    psq_pattern_minus = genome.loc[(
      (genome["strand"] == "-") & (genome["type"] == "gene") & (genome["chrom"] == i[1]["chrom"]) &
      (
          ((genome["start"] < i[1]["start"]) & (genome["end"] > i[1]["end"]))
              | (((genome["end"] + 1000) <= i[1]["end"]) & ((genome["end"] + 1000) >= i[1]["start"]))
              | (((genome["end"] + 1000) >= i[1]["end"]) & (genome["end"] <= i[1]["end"]))
      )
    )]


    down_pattern_plus = genome.loc[(
      (genome["strand"] == "+") & (genome["type"] == "gene") & (genome["chrom"] == i[1]["chrom"]) &
      (
          ((genome["start"] < i[1]["start"]) & (genome["end"] > i[1]["end"]))
              | (((genome["end"] + 200) <= i[1]["end"]) & ((genome["end"] + 200) >= i[1]["start"]))
              | (((genome["end"] + 200) >= i[1]["end"]) & (genome["end"] <= i[1]["end"]))
      )
    )]

    down_pattern_minus = genome.loc[(
      (genome["strand"] == "-") & (genome["type"] == "gene") & (genome["chrom"] == i[1]["chrom"]) &
      (
          ((genome["start"] < i[1]["start"]) & (genome["end"] > i[1]["end"]))
              | (((genome["start"] - 200) <= i[1]["end"]) & ((genome["start"] - 200) >= i[1]["start"]))
              | (((genome["start"] - 200) <= i[1]["start"]) & (genome["start"] >= i[1]["start"]))
      )
    )]

    intron_pattern = find_between(genome, i[1], "exon")
    intergenic_pattern = find_between(genome, i[1], "gene")



    if len(exon_pattern):
      exons_count += 1
    if len(psq_pattern_plus):
      psq_count += 1
      gene = psq_pattern_plus.iloc[[0]]["attributes"].item().split(";")[0]
      if not searching_genes.get(gene):
        searching_genes.update({gene: 1})
    if len(psq_pattern_minus):
      psq_count += 1
      gene = psq_pattern_minus.iloc[[0]]["attributes"].item().split(";")[0]
      if not searching_genes.get(gene):
        searching_genes.update({gene: 1})
    if len(down_pattern_plus):
      down_count += 1
    if len(down_pattern_minus):
      down_count += 1
    if len(intron_pattern):
      intron_count += 1
    if len(intergenic_pattern):
      intergenic_count += 1



  print(exons_count, psq_count, down_count, intron_count, intergenic_count)

  return [exons_count, psq_count, down_count, intron_count, intergenic_count], searching_genes

Вызов для данных нейросети

In [30]:
import pandas as pd

Результаты

In [31]:
zdna = pd.read_csv("bed_predictions1.bed", names=["chrom", "start", "end"], sep="\t")

zdna_counts, zdna_genes = find_count(genome, zdna)


chrom            NaN
start    905183005.0
end              1.0
Name: 0, dtype: float64
chrom           NaN
start    10000001.0
end             1.0
Name: 1, dtype: float64
chrom           NaN
start    10000002.0
end             1.0
Name: 2, dtype: float64
chrom           NaN
start    10000003.0
end             1.0
Name: 3, dtype: float64
chrom           NaN
start    10000004.0
end             1.0
Name: 4, dtype: float64
chrom           NaN
start    10000005.0
end             1.0
Name: 5, dtype: float64
chrom           NaN
start    10000006.0
end             1.0
Name: 6, dtype: float64
chrom           NaN
start    10000007.0
end             1.0
Name: 7, dtype: float64
chrom           NaN
start    10000008.0
end             1.0
Name: 8, dtype: float64
chrom           NaN
start    10000009.0
end             1.0
Name: 9, dtype: float64
chrom           NaN
start    10000010.0
end             1.0
Name: 10, dtype: float64
chrom           NaN
start    10000011.0
end             1.0
Name: 11, dt

In [32]:
zdna_counts, zdna_genes

([0, 0, 0, 0, 0], {})

Тоже самое для квадруплексов

In [33]:
pqs_counts, pqs_genes = find_count(genome, pqs)

chrom    CAJJDO010000002.1
start               534196
end                 534314
Name: 0, dtype: object
chrom    CAJJDO010000009.1
start               519367
end                 519397
Name: 1, dtype: object
chrom    CAJJDO010000010.1
start                67153
end                  67174
Name: 2, dtype: object
chrom    CAJJDO010000012.1
start               289822
end                 289851
Name: 3, dtype: object
chrom    CAJJDO010000012.1
start                    0
end                     76
Name: 4, dtype: object
chrom    CAJJDO010000016.1
start               354394
end                 354442
Name: 5, dtype: object
chrom    CAJJDO010000020.1
start               327022
end                 327079
Name: 6, dtype: object
chrom    CAJJDO010000021.1
start               754529
end                 754592
Name: 7, dtype: object
chrom    CAJJDO010000021.1
start               755057
end                 755083
Name: 8, dtype: object
chrom    CAJJDO010000022.1
start                34858
end       

In [34]:
pqs_counts, pqs_genes

([0, 0, 0, 0, 0], {})

## Zhunt сделал в другом блокноте

Теперь получим наш вывод hmmer и переведём его в pandas

In [37]:
from collections import defaultdict
from Bio import SearchIO

filename = 'filtered.out'

attribs = ['accession', 'bias', 'bitscore', 'description', 'cluster_num', 'domain_exp_num',  'domain_included_num', 'domain_obs_num', 'domain_reported_num', 'env_num', 'evalue', 'id', 'query_id', 'overlap_num', 'region_num']

hits = defaultdict(list)

with open(filename) as handle:
    for queryresult in SearchIO.parse(handle, 'hmmer3-tab'):
      for hit in queryresult.hits:
        for attrib in attribs:
          hits[attrib].append(getattr(hit, attrib))

hmmer_data = pd.DataFrame.from_dict(hits)

hmmer_data

Unnamed: 0,accession,bias,bitscore,description,cluster_num,domain_exp_num,domain_included_num,domain_obs_num,domain_reported_num,env_num,evalue,id,query_id,overlap_num,region_num
0,-,620.5,-330.2,Paramecium pentaurelia isolate 4_5 large subun...,1,21.9,0,9,9,9,1.0,KF287687.1,Bacteriocin_IIc,8,1
1,-,298.3,-261.6,Paramecium pentaurelia isolate 4_5 large subun...,2,8.7,0,11,11,11,1.0,KF287687.1,BRINP,7,2
2,-,295.2,-189.2,Paramecium pentaurelia isolate 4_5 large subun...,1,12.1,0,3,3,3,1.0,KF287687.1,Dehydrin,1,1
3,-,416.7,-308.2,Paramecium pentaurelia isolate 4_5 large subun...,4,13.5,0,13,13,13,1.0,KF287687.1,DUF3482,8,5
4,-,245.6,-224.3,Paramecium pentaurelia isolate 4_5 large subun...,4,7.6,0,11,11,11,1.0,KF287687.1,DUF4616,3,6
5,-,340.9,-161.5,Paramecium pentaurelia isolate 4_5 large subun...,1,7.0,0,6,6,6,1.0,KF287687.1,DUF5585,4,1
6,-,336.1,-252.8,Paramecium pentaurelia isolate 4_5 large subun...,1,11.4,0,12,12,12,1.0,KF287687.1,DUF726,11,1
7,-,265.9,-192.5,Paramecium pentaurelia isolate 4_5 large subun...,2,10.3,0,11,11,11,1.0,KF287687.1,FSA_C,8,2
8,-,168.6,-152.0,Paramecium pentaurelia isolate 4_5 large subun...,0,4.5,0,6,6,6,1.0,KF287687.1,GAGA_bind,0,6
9,-,380.4,-217.2,Paramecium pentaurelia isolate 4_5 large subun...,1,13.7,0,12,12,12,1.0,KF287687.1,Herpes_capsid,11,1
