In [None]:
from utils import * 
from notes import NOTES 
from src.files.blast import BLASTFileGgKbase
import glob
import parasail 
import math
import pyfaidx 

import networkx as nx
from tqdm import tqdm 
%load_ext autoreload 
%autoreload 2

# /groups/banfield/sequences/2014/16FT_4_MIN/raw.d/16ft_4_CZBZ.6237.3.40316_trim_clean.PE.1.fastq.gz

# Nantong_Groundwater_SRR22387873_scaffold_155357 is too low converage and has too many SNPs, not doable. 
# Nantong_Groundwater_SRR22387873_scaffold_346753
# bbmap.sh pigz=t unpigz=t ambiguous=random minid=0.96 idfilter=0.97 threads=64 out=stdout.sam editfilter=5 in1=/groups/banfield/scratch/projects/environmental/RES/int/LDS/sandpiper_not_in_Logan/SRR22387873_trim_clean.PE.1.fastq.gz in2=/groups/banfield/scratch/projects/environmental/RES/int/LDS/sandpiper_not_in_Logan/SRR22387873_trim_clean.PE.2.fastq.gz ref=~/scaffold_346753.ref.fasta nodisk | shrinksam | sambam > ~/scaffold_346753.bam


# makeblastdb -in contigs.fa -dbtype nucl -title contigs

In [2]:
def write_refs(df:pd.DataFrame, seed_contig_id:str='', data_dir='../data/bins/nantong_groundwater/'):
    df.index = [re.search(r'scaffold_.+', contig_id, flags=re.DOTALL).group(0) for contig_id in df.index]
    path = os.path.join(data_dir, f'{seed_contig_id}.ref.fasta')
    FASTAFile.from_df(df).write(path)

In [3]:
CONTIGS_PATH = '../data/databases/nantong_groundwater/contigs.fa'

def fai_get_sequences(contig_ids, path:str=CONTIGS_PATH, strands:list=None):
    '''Read the sequences from the FASTA file.'''
    f = pyfaidx.Fasta(path)
    strands = np.array(['+'] * len(contig_ids)) if (strands is None) else strands
    seqs = [str(f[contig_id]) for contig_id in contig_ids]
    seqs = [reverse_complement(seq) if (strand == '-') else seq for (strand, seq) in zip(strands, seqs)]
    return seqs

In [4]:
def make_genome(input_dir:str, output_dir:str='../data/'):
    contig_id_map = dict()
    genome_id = os.path.basename(input_dir)
    content = ''
    for i, path in enumerate(sorted(glob.glob(os.path.join(input_dir, '*')))):
        with open(path, 'r') as f:
            lines = f.readlines()
            contig_id_map[f'{genome_id}.{i + 1}'] = lines[0].split()[0].replace('>', '')
            lines = [f'>{genome_id}.{i + 1}\n'] + lines[1:]
            content += ''.join(lines)

    output_path = os.path.join(output_dir, f'{genome_id}.genome.fasta')
    with open(output_path, 'w') as f:
        f.write(content)
    return output_path


In [5]:
def load_database(path:str='../data/databases/nantong_groundwater/taxonomy.tsv'):
    # Contig name	Size (bp)	Coverage	GC %	Taxonomy winner	Winner %	Species winner	Species winner %	Genus winner	Genus winner %	Order winner	Order winner %	Class winner	Class winner %	Phylum winner	Phylum winner %	Domain winner	Domain winner %
    levels = ['species', 'genus', 'order', 'class', 'phylum', 'domain']
    cols = ['contig_id', 'size', 'coverage', 'gc_percent', 'taxonomy_winner', 'taxonomy_winner_percent']
    cols = cols + [col for level in levels for col in [f'{level}_winner', f'{level}_winner_percent']]
    database_df = pd.read_csv(path, sep='\t', names=cols, skiprows=1)
    database_df = database_df.fillna('unknown')
    return database_df.set_index('contig_id')

database_df = load_database()

In [6]:
seed_contigs_df = pd.read_csv('../data/seed_contigs.csv', index_col=0)

seed_contig_id = 'Nantong_Groundwater_SRR22387873_scaffold_346753'
query_row = seed_contigs_df.loc[seed_contig_id]
print(seed_contig_id)
print(f'GC percent: {100 * query_row.gc_content:.2f}%')
print(f'Length: {query_row.length} bp')

Nantong_Groundwater_SRR22387873_scaffold_346753
GC percent: 22.06%
Length: 1047 bp


In [7]:
def search_database(contig_id, database_df=database_df, gc_percent_delta:float=3, coverage_delta:float=3, notes=NOTES, exclude_flagged:bool=True):
    '''Search the database_df for contigs with similar coverage and GC content to the seed contig. '''
    query_row = database_df.loc[contig_id]
    mask = (database_df.gc_percent < query_row.gc_percent + gc_percent_delta) & (database_df.gc_percent > query_row.gc_percent - gc_percent_delta)
    mask = mask & ((database_df.coverage < query_row.coverage + coverage_delta) & (database_df.coverage > query_row.coverage - coverage_delta))
    
    results_df = database_df[mask].copy()
    results_df['note'] = results_df.index.map(notes)
    results_df['note'] = results_df.note.fillna('none')
    if exclude_flagged:
        results_df = results_df[~results_df.note.str.contains('X')].copy()
    results_df = results_df[results_df.domain_winner != 'Bacteria'].copy() # Exclude hits with Bacterial taxonomy winner.
    results_df = results_df.sort_values('size', ascending=False)
    results_df = results_df.loc[[contig_id] + [id_ for id_ in results_df.index if (id_ != contig_id)]].copy()
    print(f'search_database: Found {len(results_df)} with similar GC percent and coverage to {contig_id}.')
    print(f"search_database: {results_df['size'].sum() / 1000:.2f} kb in candidate bin.")
    return results_df.copy()


results_df = search_database(seed_contig_id)
results_df['seq'] = fai_get_sequences(results_df.index.values)

search_database: Found 9 with similar GC percent and coverage to Nantong_Groundwater_SRR22387873_scaffold_346753.
search_database: 13.15 kb in candidate bin.


In [8]:
CONTIG_END_LENGTH = 200
DATABASE_PATH = '/home/prichter/Documents/banfield/betazoid/data/databases/nantong_groundwater/contigs.fa'
OUTPUT_DIR = '../data/bins/nantong_groundwater/'
MIN_PERCENT_IDENTITY = 80
MIN_ALIGNMENT_LENGTH = 80


def _filter_hits(blast_df:pd.DataFrame, delta:int=3):
    mask = (blast_df.query_border != blast_df.subject_border) # Get rid of overlaps between left borders and right borders. 
    mask = mask & (((blast_df.query_border == 'L') & (blast_df.query_lbd < delta)) | ((blast_df.query_border == 'R') & (blast_df.query_rbd < delta)))
    mask = mask & (((blast_df.subject_border == 'L') & (blast_df.subject_lbd < delta)) | ((blast_df.subject_border == 'R') & (blast_df.subject_rbd < delta)))
    print(f'_filter_hits: Retaining {mask.sum()} out of {len(blast_df)} hits.')
    blast_df = blast_df[mask].copy()
    return blast_df

# Need some functions to account for merged contigs. 
get_scaffold_numbers = lambda contig_id : contig_id.split('_scaffold_')[-1].split('_')
# has_shared_scaffold = lambda contig_id_1, contig_id_2 : len(np.intersect1d(get_scaffold_numbers(contig_id_1), get_scaffold_numbers(contig_id_2)))
has_shared_scaffold = lambda row : len(np.intersect1d(get_scaffold_numbers(row.query_id), get_scaffold_numbers(row.subject_id))) > 0

def blast_contig_ends(contigs_df:pd.DataFrame, name:str=None, max_n_mismatches:int=2):

    input_path = os.path.join(OUTPUT_DIR, f'{name}.blast.in.fasta')
    output_path = os.path.join(OUTPUT_DIR, f'{name}.blast.out.tsv')
    input_df = [{'seq':row.seq[:CONTIG_END_LENGTH], 'id':f'{row.Index}.L'} for row in contigs_df.itertuples()]
    input_df += [{'seq':row.seq[-CONTIG_END_LENGTH:], 'id':f'{row.Index}.R'} for row in contigs_df.itertuples()]
    input_df = pd.DataFrame(input_df).set_index('id')
    FASTAFile.from_df(input_df).write(input_path)

    fields = 'qseqid sseqid qlen slen length mismatch gapopen qstart qend sstart send' #.split()
    cmd = f'blastn -db {DATABASE_PATH} -query {input_path} -out {output_path} -perc_identity {MIN_PERCENT_IDENTITY} -outfmt "6 {fields}"'
    
    subprocess.run(cmd, shell=True, check=True)
    blast_df = pd.read_csv(output_path, sep='\t', names=fields.split())
    blast_df = blast_df[blast_df.mismatch <= max_n_mismatches].copy()
    # blast_df = blast_df[blast_df.qseqid.str.partition(".")[0] != blast_df.sseqid].copy() # Remove self-alignments.
    blast_df = blast_df[blast_df.length > MIN_ALIGNMENT_LENGTH].copy() 

    # First load the subject sequences and clean up. 
    # Convert from one-indexed inclusive bounds to zero-indexed, upper-exclusive bounds. 

    blast_df['subject_id'] = blast_df.sseqid
    blast_df['subject_strand'] = np.where(blast_df.send < blast_df.sstart, '-', '+')
    blast_df['subject_length'] = blast_df.slen
    blast_df['subject_seq'] = fai_get_sequences(blast_df.subject_id, strands=blast_df.subject_strand.values)
    blast_df['subject_start'] = np.where(blast_df.subject_strand == '+', blast_df.sstart - 1, blast_df.slen - blast_df.sstart)
    blast_df['subject_end'] = np.where(blast_df.subject_strand == '+', blast_df.send, blast_df.slen - blast_df.send + 1)
    blast_df['subject_rbd'] = blast_df.subject_length - blast_df.subject_end
    blast_df['subject_lbd'] = blast_df.subject_start
    blast_df['subject_border'] = np.where(blast_df.subject_lbd < blast_df.subject_rbd, 'L', 'R')

    blast_df['query_id'] = blast_df.qseqid.str.partition(".")[0]
    blast_df['query_seq'] = blast_df.query_id.map(contigs_df.seq)
    blast_df['query_length'] = blast_df.query_seq.apply(len)
    blast_df['query_border'] = [re.search(r'\.(R|L)', id_).group(1) for id_ in blast_df.qseqid]
    blast_df['query_start'] = np.where(blast_df.query_border == 'R', (blast_df.query_length - CONTIG_END_LENGTH) + (blast_df.qstart - 1), blast_df.qstart - 1) # If query match is on the right border, the start of the alignment is (150 - qstart) from the right end. 
    blast_df['query_end'] = np.where((blast_df.query_border == 'R'),  (blast_df.query_length - CONTIG_END_LENGTH) + blast_df.qend, blast_df.qend) # If query match is on the right border, use the end of the alignment is (150 - qend) from the right end. 
    blast_df['query_rbd'] = blast_df.query_length - blast_df.query_end
    blast_df['query_lbd'] = blast_df.query_start

    blast_df = blast_df[~blast_df.apply(has_shared_scaffold, axis=1)].copy() # Remove self-hits.

    blast_df = blast_df.rename(columns={'gapopen':'n_gaps', 'mismatch':'n_mismatches', 'length':'alignment_length'})
    cols = ['query_id', 'subject_id', 'query_length', 'subject_length', 'subject_strand'] 
    cols += ['query_border', 'query_lbd', 'query_rbd', 'subject_border', 'subject_lbd', 'subject_rbd']
    cols += ['query_start', 'query_end', 'subject_start', 'subject_end']
    cols += ['n_gaps', 'n_mismatches', 'alignment_length', 'query_seq', 'subject_seq']
    return blast_df[cols].copy().reset_index()


In [9]:
def merge_contigs(blast_df:pd.DataFrame, database_df:pd.DataFrame=database_df):
    get_scaffold_number = lambda contig_id : contig_id.split('_scaffold_')[-1] #.split('_')
    get_scaffold_prefix = lambda contig_id : contig_id.split('_scaffold_')[0]

    df = list()
    for row in blast_df.itertuples():
        row_ = {'parent_contig_ids':[row.query_id, row.subject_id], 'parent_lengths':[row.query_length, row.subject_length], 'alignment_length':row.alignment_length}
        row_['parent_coverages'] = database_df.loc[[row.query_id, row.subject_id]].coverage.tolist()
        row_['parent_gc_percents'] = database_df.loc[[row.query_id, row.subject_id]].gc_percent.tolist()

        if row.query_border == 'R':
            row_['seq'] = row.query_seq + row.subject_seq[row.subject_end:]
            row_['contig_id'] = get_scaffold_prefix(row.query_id) + '_scaffold_' + get_scaffold_number(row.query_id) + '_' + get_scaffold_number(row.subject_id)
        elif row.query_border == 'L':
            row_['seq'] = row.subject_seq[:row.subject_start] + row.query_seq
            row_['contig_id'] = get_scaffold_prefix(row.query_id) + '_scaffold_' + get_scaffold_number(row.subject_id) + '_' + get_scaffold_number(row.query_id)
        
        L = sum(row_['parent_lengths'])
        row_['coverage'] = np.mean([(l /L)  * gc for l, gc in zip(row_['parent_lengths'], row_['parent_gc_percents'])]) # This will not be exact because of the overlap.
        row_['gc_percent'] = 100 * (row_['seq'].count('G') + row_['seq'].count('C')) / len(row_['seq'])
        df.append(row_)
    df = pd.DataFrame(df)
    df['length'] = df.seq.apply(len)
    
    # print(df.query_length + df.subject_length - df.alignment_length)
    return df.set_index('contig_id', drop=True)


In [10]:
# blast_df = _filter_hits(blast_contig_ends(results_df, name='scaffold_346753'))
# blast_df = blast_contig_ends(results_df, name='scaffold_346753')

In [87]:
fields = 'qseqid sseqid qlen slen length mismatch gapopen qstart qend sstart send pident bitscore evalue' #.split()
input_path = '../data/bins/nantong_groundwater/scaffold_2044.2.gene.fasta'
output_path = '../data/bins/nantong_groundwater/scaffold_2044.2.blast.tsv'
cmd = f'blastn -db {DATABASE_PATH} -query {input_path} -out {output_path} -outfmt "6 {fields}" -perc_identity 10 -evalue 10 -gapopen 5  -gapextend 2 -word_size 7 -penalty -1'
print(cmd)

blast_df = pd.read_csv(output_path, sep='\t', names=fields.split())
blast_df = blast_df[blast_df.sseqid != 'Nantong_Groundwater_SRR22387873_scaffold_2044'].copy() # Remove self-hits.
blast_df = blast_df[blast_df.sseqid != 'Nantong_Groundwater_SRR22387873_scaffold_1350'].copy() # Remove hits to the other known Betazoid.  
blast_df = blast_df[blast_df.slen < 25000].copy() # Exclude contig hits that are too long. 

subject_contig_ids = blast_df.sseqid.unique()
blast_df['subject_seq'] = blast_df.sseqid.map(dict(zip(subject_contig_ids, fai_get_sequences(subject_contig_ids))))

get_gc_percent = lambda seq : 100 * (seq.count('G') + seq.count('C')) / len(seq)
blast_df['subject_gc_percent'] = blast_df.subject_seq.apply(get_gc_percent)
blast_df = blast_df[blast_df.subject_gc_percent < 25].copy()

blast_df['taxonomy_winner'] = blast_df.sseqid.map(database_df.taxonomy_winner)
blast_df['domain_winner'] = blast_df.sseqid.map(database_df.domain_winner)
# blast_df['taxonomy_winner_percent'] = blast_df.sseqid.map(database_df.taxonomy_winner_percent)
# blast_df['taxonomy_winner_percent'] = np.where(blast_df.taxonomy_winner_percent == 'unknown', 0, blast_df.taxonomy_winner_percent).astype(float)
blast_df = blast_df[(blast_df.taxonomy_winner == 'unknown')].copy()

get_scaffold_number = lambda contig_id : int(contig_id.split('_scaffold_')[-1])
blast_df['subject_contig_number'] = blast_df.sseqid.apply(get_scaffold_number)
blast_df = blast_df[blast_df.length > 50].copy()

exclude_scaffold_numbers = [43870, 253652, 370601, 267053, 134067, 30515, 38970, 106191, 3756, 4040, 5622, 9140, 10230, 10553, 11878, 12630,13505, 13849, 13997, 21904, 2414, 82343, 9811, 24147, 24838, 28963, 47738]

blastn -db /home/prichter/Documents/banfield/betazoid/data/databases/nantong_groundwater/contigs.fa -query ../data/bins/nantong_groundwater/scaffold_2044.2.gene.fasta -out ../data/bins/nantong_groundwater/scaffold_2044.2.blast.tsv -outfmt "6 qseqid sseqid qlen slen length mismatch gapopen qstart qend sstart send pident bitscore evalue" -perc_identity 10 -evalue 10 -gapopen 5  -gapextend 2 -word_size 7 -penalty -1


In [89]:
blast_df[(blast_df.bitscore > 40) & (~blast_df.subject_contig_number.isin(exclude_scaffold_numbers))].sort_values('slen').drop_duplicates('sseqid')

# Nantong_Groundwater_SRR22387873_scaffold_130073 looks super promising. 
# Nantong_Groundwater_SRR22387873_scaffold_18414 might also be a good candidate.
# Nantong_Groundwater_SRR22387873_scaffold_119008
# Nantong_Groundwater_SRR22387873_scaffold_90561 maybe another DNA polymerase.
# Nantong_Groundwater_SRR22387873_scaffold_239429 could be another DNA polymerase
# Nantong_Groundwater_SRR22387873_scaffold_300679 is very promising. 
# Nantong_Groundwater_SRR22387873_scaffold_290644 is probably another DNA polymerase.
# Nantong_Groundwater_SRR22387873_scaffold_183905 another DNA polymerase?
# Nantong_Groundwater_SRR22387873_scaffold_241552
blast_df[blast_df.sseqid == 'Nantong_Groundwater_SRR22387873_scaffold_18414']

Unnamed: 0,qseqid,sseqid,qlen,slen,length,mismatch,gapopen,qstart,qend,sstart,send,pident,bitscore,evalue,subject_seq,subject_gc_percent,taxonomy_winner,domain_winner,subject_contig_number
8573,scaffold_2044.2_20,Nantong_Groundwater_SRR22387873_scaffold_18414,1833,4471,65,20,0,1427,1491,4304,4240,69.231,41.2,0.52,CTTGCTTTTACTAATTTTTCGTCAGTAAAAATGATACCATTTCCGT...,24.021472,unknown,unknown,18414


In [12]:
write_refs(results_df, seed_contig_id='scaffold_346753')

In [13]:
# replace_contig_ids = dict()
# replace_contig_ids['Nantong_Groundwater_SRR22387873_scaffold_377775'] = 'Nantong_Groundwater_SRR22387873_scaffold_103075_377775'
# replace_contig_ids['Nantong_Groundwater_SRR22387873_scaffold_358410'] = 'Nantong_Groundwater_SRR22387873_scaffold_153884_358410'
# replace_contig_ids['Nantong_Groundwater_SRR22387873_scaffold_335499'] = 'Nantong_Groundwater_SRR22387873_scaffold_45379_335499_207268'

# for contig_id, new_contig_id in replace_contig_ids.items():
#     results_df.loc[contig_id, 'size'] = new_contigs_df.loc[new_contig_id, 'length']
#     results_df.loc[contig_id, 'gc_percent'] = new_contigs_df.loc[new_contig_id, 'gc_percent']
#     results_df.loc[contig_id, 'coverage'] = new_contigs_df.loc[new_contig_id, 'coverage']
#     results_df.loc[contig_id, 'seq'] = new_contigs_df.loc[new_contig_id, 'seq']
#     results_df.rename(index={contig_id:new_contig_id}, inplace=True)

# FASTAFile.from_df(results_df).write(f"../data/bins/nantong_groundwater/{query_row['id']}.ref.fasta")


In [14]:
results_df.sort_values('size')
results_df['size'].sum()

np.int64(13147)

In [15]:
print(database_df.loc['Nantong_Groundwater_SRR22387873_scaffold_45379']['size'])
print(database_df.loc['Nantong_Groundwater_SRR22387873_scaffold_335499']['size'] + database_df.loc['Nantong_Groundwater_SRR22387873_scaffold_45379']['size'])

print(database_df.loc['Nantong_Groundwater_SRR22387873_scaffold_103075']['size'])


2797
3860
1865


In [16]:
def align_contig_ends(contigs_df, min_percent_identity:float=0.98, min_alignment_length:int=15, length:int=150, gap_open:int=20, gap_extend:int=1):
    contigs_df['reverse_seq'] = [reverse_complement(seq) for seq in contigs_df.seq] 
    contigs_df['seq_l'] = [seq[:length] for seq in contigs_df.seq]
    contigs_df['seq_r'] = [seq[-length:] for seq in contigs_df.seq]
    contigs_df['reverse_seq_l'] = [seq[:length] for seq in contigs_df.reverse_seq]
    contigs_df['reverse_seq_r'] = [seq[-length:] for seq in contigs_df.reverse_seq]

    matrix = parasail.matrix_create('ACGT', 2, -3) # + 2 match score, -3 mismatch score. 
    graph = nx.DiGraph()

    rows = list(contigs_df.itertuples())
    for a, b in tqdm(itertools.product(rows, rows), desc='align_contig_ends'):
        if a.Index == b.Index:
            continue 

        for loc_a, loc_b in [('seq_r', 'seq_l'), ('seq_r', 'reverse_seq_l'), ('reverse_seq_r', 'seq_l')]:
            alignment = parasail.sw_trace_striped_16(getattr(a, loc_a), getattr(b, loc_b), gap_open, gap_extend, matrix) # Smith-Waterman local alignment. 
            n_matches = sum(x == y for x, y in zip(alignment.traceback.query, alignment.traceback.ref) if ((x != '-') and (y != '-')))
            n = sum(1 for x, y in zip(alignment.traceback.query, alignment.traceback.ref) if ((x != '-') and (y != '-')))
            # print(alignment.traceback.comp)
            if ((n_matches / n) > min_percent_identity) and (n > min_alignment_length):
                graph.add_edge(a.Index, b.Index, percent_identity=(n_matches / n), alignment_length=n, alignment=f'{alignment.traceback.query}\n{alignment.traceback.ref}')
                
    return graph 

In [17]:
graph = align_contig_ends(results_df)
# fig, ax = plt.subplots(figsize=(6, 6))
# pos = nx.spring_layout(graph)
# nx.draw_networkx_nodes(graph, pos, ax=ax, node_size=100)
# nx.draw_networkx_edges(graph, pos, ax=ax, width=0.5)

# for _, spine in ax.spines.items():
#     spine.set_visible(False)
# plt.show()

align_contig_ends: 0it [00:00, ?it/s]

align_contig_ends: 81it [00:00, 7327.64it/s]


In [18]:
for u, v, data in graph.edges(data=True):
    print(u, v)
    print(data['alignment'])
    print()

In [19]:
results_df['size'].sum()
len(results_df)

9

In [20]:
results_df.sort_values('size')


Unnamed: 0,size,coverage,gc_percent,taxonomy_winner,taxonomy_winner_percent,species_winner,species_winner_percent,genus_winner,genus_winner_percent,order_winner,...,phylum_winner_percent,domain_winner,domain_winner_percent,note,seq,reverse_seq,seq_l,seq_r,reverse_seq_l,reverse_seq_r
scaffold_346753,1047,23.07,22.06,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,Seed contig with DNA polymerase B.,TTTATACGAATCTATAAAATATTTTGAGGAAATAATTAATAGTTTA...,ATTTCAAATTTATCAGTTTTATGATTATATCCTTCGATATAATAAG...,TTTATACGAATCTATAAAATATTTTGAGGAAATAATTAATAGTTTA...,TACTGACGCAATAATAACATCCGAAAAATTACCGACAGGAACAGGT...,ATTTCAAATTTATCAGTTTTATGATTATATCCTTCGATATAATAAG...,CCTGTTATTGTTTCACAATTAATTGAAGGAATAAATTTATTTAAAT...
scaffold_338320,1059,24.93,21.72,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,No annotations.,ATGCACCAACACCTGAAGAAGCATTAAAAAATACATTCACTTTAGT...,CTATTTGTTTTTGAATTAATCCACTTATACTTTCAATTTCTAAACC...,ATGCACCAACACCTGAAGAAGCATTAAAAAATACATTCACTTTAGT...,ATTTTAGTAGTAATTCTTTAGATAAATCAATTTCTTTTACAGATTA...,CTATTTGTTTTTGAATTAATCCACTTATACTTTCAATTTCTAAACC...,CTGAAAGGAGAGGATTAAATTTTGAAATTTTTGCGGTTATAACACC...
scaffold_300679,1119,22.92,23.68,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,No annotations.,GTAAAAAACATAAAAAATATACAGGTTCACGATATTAAATGAACCC...,CTCATTGCGTAAAGATAATAATATCCGTGTTTAAAAAAAAGATTGT...,GTAAAAAACATAAAAAATATACAGGTTCACGATATTAAATGAACCC...,ATCCTTTTCAATCACAATGCGAATATTCAATAGAAAAAGGATATAA...,CTCATTGCGTAAAGATAATAATATCCGTGTTTAAAAAAAAGATTGT...,AAATTAAAAAAGCTAAAGCTGTCATTGATAAGGTTTTACCTTTACC...
scaffold_290644,1137,22.56,22.69,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,Has another type B DNA polymerase. Also a Beta...,AAATAGACAGATTAAATTTAGAGATTCATACGCATTAATGCCGAAA...,TGTTTTAATTTTTCGGATGTAATAATAGCATCAGTATCACAATAAG...,AAATAGACAGATTAAATTTAGAGATTCATACGCATTAATGCCGAAA...,AAATCTTGATTCTAATCATATTATTCCATCGTTATCATACAGGGTT...,TGTTTTAATTTTTCGGATGTAATAATAGCATCAGTATCACAATAAG...,TGTCTTAAACACGTAAATATTCTTTTATCAGGATTATCAAAATCAT...
scaffold_267693,1182,22.08,24.2,unknown,unknown,unknown,0.75,unknown,0.75,unknown,...,0.75,unknown,0.75,One hypothetical protein assigned Orange Borg ...,ATAAACCAATATCAGAACTAACTCAGCAAAATACTATTGGTATGCA...,TGTTTGATATGCTTTAATAACAATATTCTCTATTTGCTCTCTGTCA...,ATAAACCAATATCAGAACTAACTCAGCAAAATACTATTGGTATGCA...,GATAAAGAGACGAAAATAGATGAAAATCTTAAAGAGAAAATGGATA...,TGTTTGATATGCTTTAATAACAATATTCTCTATTTGCTCTCTGTCA...,AATTTTTTAATATATTTCATTTAATCATCCTTTTTATTTTTTTAAT...
scaffold_241552,1241,21.27,23.05,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,No annotations.,GCATTAATTAAATCAACAGCTAATCTTTTAATTTGAATATCGTAAT...,ATATATGTTCATATTTGGAATTTTTGGATGTATTTTAGGATATATT...,GCATTAATTAAATCAACAGCTAATCTTTTAATTTGAATATCGTAAT...,TATTGTATCTTCTCGATTCATTTTTAACACTGCCTAATTAATATGT...,ATATATGTTCATATTTGGAATTTTTGGATGTATTTTAGGATATATT...,AAAAAAAGATAATAAATATTTAGTTTTGAATATTGATGATGAAATT...
scaffold_231049,1267,22.97,21.31,unknown,unknown,unknown,1.0,unknown,1.0,unknown,...,1.0,unknown,1.0,No annotations.,ATCATCATAACAACATATAACATCCATTTCATCTATCTCTTCACCT...,GAAAGAGAAAGTAGAAAAGAAATTTGATGGAAATGTAGTTGTATGG...,ATCATCATAACAACATATAACATCCATTTCATCTATCTCTTCACCT...,ATTTTTCATTTTATTTTCCTTCTTTTATTATATTCAACACAAACAA...,GAAAGAGAAAGTAGAAAAGAAATTTGATGGAAATGTAGTTGTATGG...,GATGGAGTGAAGAAGTATTTTGTGTTAGATGGTTGTTGGAGATGTA...
scaffold_58476,2464,22.22,24.72,unknown,unknown,unknown,0.9,unknown,0.9,unknown,...,0.9,unknown,0.9,One hypothetical protein annotated as belongin...,AGGAAAAGACGGATTAAATGGTATTGACGGAAAAGATGTTGACCCT...,CTCCAATTTTAAAAAATCAATATCAGCTATTTTAATTTCTATTCCT...,AGGAAAAGACGGATTAAATGGTATTGACGGAAAAGATGTTGACCCT...,ATTAAAATAATAATGAATAAAGAAACAAAAGAAAAATTATTTACCG...,CTCCAATTTTAAAAAATCAATATCAGCTATTTTAATTTCTATTCCT...,TCTTGTTTTTTATTAACAGAAGATAAATTATCAATAGCTACAACAT...
scaffold_51309,2631,25.77,24.9,unknown,unknown,unknown,0.6,unknown,0.6,unknown,...,0.6,unknown,0.6,Has two hypothetical proteins annotated with B...,GACAAGATAATAAAAGACGTAACTTGCAAAAAAAGATAAAATAAAA...,AACTGTAAATGTTGTAGTGGCTGTTCCATCTCGTCTGTAATACTCA...,GACAAGATAATAAAAGACGTAACTTGCAAAAAAAGATAAAATAAAA...,GATAATACAGTTAATATTCAAAATAAACAAATTGAGATATTAAATA...,AACTGTAAATGTTGTAGTGGCTGTTCCATCTCGTCTGTAATACTCA...,GTAAGAGTAACGAAATTATTATACCAAAAACATCTATGTCTTAATT...


In [22]:
len(s)

3891