In [1]:
import os
import glob
import shutil
import pandas
import pprint
import subprocess
from pathlib import Path
from Bio            import Entrez
from Bio            import SeqIO
from Bio.Blast      import NCBIXML
from collections import defaultdict

############################################
Entrez.email = "mfoster11@mgh.harvard.edu" #
############################################

In [2]:
#Appl Environ Microbiol. 2008 Jun 6;74(16):5008–5014. doi: 10.1128/AEM.00479-08
# "... For ospC, alignments were made on the translated amino acid sequences and then back-translated to nucleotide sequences to ensure in-frame nucleotide alignment..."
# I was told just use nucl identity since most are incomplete and frame cannot be guaranteed.

iras_types = { 
    'A' : 'X69596',   'Ba' : 'EF537413', 'Bb' : 'NC_011724', 'C' : 'DQ437462',
    'Da' : 'AF029863', 'Db' : 'GQ478283',  'E' : 'AY275221', 'Fa' : 'AY275225',
    'Fb' : 'EF537433', 'Fc' : 'GQ478285',  'G' : 'AY275223', 'Ha' : 'EU377781',
    'Hb' : 'GQ478286', 'Ia' : 'AY275219', 'Ib' : 'EU377752',  'J' : 'CP001535',
     'K' : 'AY275214',  'L' : 'EU375832',  'M' : 'CP001550',  'N' : 'EU377775',
     'O' : 'FJ997281',  'T' : 'AY275222', 'Ua' : 'EU377769', 'Ub' : 'GQ478287',
    'A3' : 'EF592541', 'B3' : 'EF592542', 'C3' : 'EF592543', 'D3' : 'EF592544',
    'E3' : 'EF592545', 'F3' : 'EF592547', 'H3' : 'FJ932733', 'I3' : 'FJ932734',
    'B_bissettii25015': 'U04282'
}

#Appl Environ Microbiol. 2013 Mar;79(5):1444–1453. doi: 10.1128/AEM.02749-12
ospc_ref_file = "reference_types.tsv"
hanincovas_types = pandas.read_csv(ospc_ref_file, sep='\t')

In [22]:
seqs = "ref_ospc_seqs_v2"
if not os.path.exists(seqs):
    os.mkdir(seqs)

type_to_acc = defaultdict(list)

for _, data in hanincovas_types.iterrows():
    ospc_type = data['ospC_type']
    gb_acc = data["gb_acc"]
    type_to_acc[ospc_type].append(gb_acc)

In [23]:
value_to_keys = {}

for key, values in type_to_acc.items():
    for acc in values:
        if acc not in value_to_keys:
            value_to_keys[acc] = {'dict1': [key], 'dict2': []}
        else:
            value_to_keys[acc]['dict1'].append(key)

for key, value in iras_types.items():
    if value not in value_to_keys:
        value_to_keys[value] = {'dict1': [], 'dict2': [key]}
    else:
        value_to_keys[value]['dict2'].append(key)

unique_values = set(list(item for value in type_to_acc.values() for item in value)) | set(iras_types.values())

duplicates = {
    value: keys for value, keys in value_to_keys.items() if keys['dict1'] and keys['dict2']
}

pprint.pprint(duplicates)
# {'AF029863': {'dict1': ['D'], 'dict2': ['Da']},
# 'DQ437462': {'dict1': ['C'], 'dict2': ['C']},
# 'EF537413': {'dict1': ['B1'], 'dict2': ['Ba']},
# 'FJ932733': {'dict1': ['H3'], 'dict2': ['H3']},
# 'FJ932734': {'dict1': ['I3'], 'dict2': ['I3']}}

#print(unique_values)

acc_to_types = {
    value: key for value, key in value_to_keys.items() #if value not in duplicates.keys()
}

{'AF029863': {'dict1': ['D'], 'dict2': ['Da']},
 'DQ437462': {'dict1': ['C'], 'dict2': ['C']},
 'EF537413': {'dict1': ['B1'], 'dict2': ['Ba']},
 'FJ932733': {'dict1': ['H3'], 'dict2': ['H3']},
 'FJ932734': {'dict1': ['I3'], 'dict2': ['I3']}}


In [24]:
weird_types = []
fixed_type_to_ref = defaultdict(list)
for acc, dicts in acc_to_types.items():
    types = [val for val in dicts.values() if val != []]
    if len(types) > 1:
        if types[0] == types[1]:
            #pass This is why my counts were off!! 
            key = types[0][0]
            fixed_type_to_ref[key].append(acc)
        else:
            #print(acc, types)
            weird_types.append(f"{acc}: {types}")
            
    else:
        #print(acc, *types[0])
        key = types[0][0]
        fixed_type_to_ref[key].append(acc)

In [25]:
print(weird_types) # ["EF537413: [['B1'], ['Ba']]", "AF029863: [['D'], ['Da']]"]
#pprint.pprint(fixed_type_to_ref)
# I could do a coin flip but might as well preserve both rather than one or the other? this is irritating.
fixed_type_to_ref['B1'] = ['EF537413'] # weird one that's just gonna have to live like this. I hope I don't split types by dashes downstream...
fixed_type_to_ref['Da'] = ['AF029863'] # This can just be Da.

["EF537413: [['B1'], ['Ba']]", "AF029863: [['D'], ['Da']]"]


In [27]:
ospc_types = list(fixed_type_to_ref.keys())
print(sorted(ospc_types))
print(len(ospc_types)) # 47
# Input: 
# ['A', 'A3', 'B', 'B(nt59)', 'B1', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'Db', 'E', 'E3', 
# 'F', 'F3', 'Fa', 'Fb', 'Fc', 'G', 'H', 'H3', 'Ha', 'Hb', 'I', 'I3', 'Ia', 'Ib', 'J', 'K', 'L', 'M', 
# 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'Ua', 'Ub', 'V', 'W', 'X']

['A', 'A3', 'B', 'B(nt59)', 'B1', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'Db', 'E', 'E3', 'F', 'F3', 'Fa', 'Fb', 'Fc', 'G', 'H', 'H3', 'Ha', 'Hb', 'I', 'I3', 'Ia', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'Ua', 'Ub', 'V', 'W', 'X']
47


In [253]:
for ospc_type, accs in fixed_type_to_ref.items():
    out_file = f"{seqs}/{ospc_type}.fasta"
    if not os.path.exists(out_file):
        with open(out_file, "w") as f:
            print(f"fetching records for {ospc_type} : {accs}")
            for acc in accs:
                handle = Entrez.efetch(db="nucleotide", id=acc, rettype="fasta", retmode="text")
                record = handle.read()
                #print(f"writing record: {acc} to file!")
                f.write(record[0::])
                handle.close()
    else:
        print(f"NOT fetching record for {ospc_type} !! ALREADY EXISTS!!")

NOT fetching record for A !! ALREADY EXISTS!!
NOT fetching record for B(nt59) !! ALREADY EXISTS!!
NOT fetching record for B2 !! ALREADY EXISTS!!
NOT fetching record for B !! ALREADY EXISTS!!
fetching records for C : ['AF029862', 'DQ437462', 'EU482043']
NOT fetching record for D !! ALREADY EXISTS!!
NOT fetching record for E3 !! ALREADY EXISTS!!
NOT fetching record for E !! ALREADY EXISTS!!
NOT fetching record for F !! ALREADY EXISTS!!
NOT fetching record for G !! ALREADY EXISTS!!
NOT fetching record for H !! ALREADY EXISTS!!
fetching records for H3 : ['FJ932733']
NOT fetching record for I !! ALREADY EXISTS!!
fetching records for I3 : ['FJ932734']
NOT fetching record for J !! ALREADY EXISTS!!
NOT fetching record for K !! ALREADY EXISTS!!
NOT fetching record for L !! ALREADY EXISTS!!
NOT fetching record for M !! ALREADY EXISTS!!
NOT fetching record for N !! ALREADY EXISTS!!
NOT fetching record for O !! ALREADY EXISTS!!
NOT fetching record for P !! ALREADY EXISTS!!
NOT fetching record for 

In [52]:
#strain = 'B_bissettii25015'
#acc = 'U04282'
#
#with open(f'{seqs}/{strain}.fasta', 'w') as f:
#    handle = Entrez.efetch(db="nucleotide", id=acc, rettype="fasta", retmode="text")
#    record = handle.read()
#    print(f"writing record: {acc} to file!")
#    f.write(record[0::])
#    handle.close()

writing record: U04282 to file!


In [192]:
# ok so some of those entries include the entire plasmid sequence not just the ospc gene, so we will need to extract the ospc gene from the plasmid sequence
# the ones that are plasmid sequences are as follows:
# Bb, J, M
# Bb is only one seq, 
# J has multiple ospC seqs and then cp26, header of plasmid is CP001535.1
# manually changing file name from J.fasta to J_good.fasta
# Extracting the full cp26 sequence to J.fasta,
# running the below code to append to J_good.fasta
# renaming J_good.fasta back to J.fasta

# M has multiple ospC seqs and then cp26, header of plasmid is >CP001550.1
# manually changing file name from M.fasta to M_good.fasta
# Extracting the full cp26 sequence to M.fasta,
# running the below code to append to M_good.fasta,
# renaming M_good.fasta back to M.fasta

# while I'm manually editing sequences, I also added the following to the B1a header: 
# "ospC-B1/ospC-Ba allele" to reflect that it is actually both. 

for ospc_type in ['Bb', 'J', 'M']:
    print(fixed_type_to_ref[ospc_type])
coords = {
    # THESE ARE ZERO BASED!!!
    'Bb': (16904,17540), # from 16905..17540
    'J': (16909,17545),  # from 16910..17545
    'M': (16916,17555),  # from 16917..17555
}

# because Bb is a single sequence, we gotta do some shenanigans.
with open(f"{seqs}/Bb_good.fasta", 'a'):
    pass

#
## ok we already have the full plasmid seqs for each of these three types, we also now have the coordinates
for ospc_type in ['Bb', 'J', 'M']:
    #print(f"extracting ospc gene from {ospc_type} sequence")
    with open(f"{seqs}/{ospc_type}.fasta", "r") as f:
        record = SeqIO.read(f, "fasta")
        #print(record)
        ospc_gene = record[coords[ospc_type][0]:coords[ospc_type][1]]
        ospc_gene.description = ospc_gene.description.split('plasmid')[0]+f'outer surface protein C (ospC) gene, ospC-{ospc_type} allele'
        with open(f"{seqs}/{ospc_type}_good.fasta", "a") as f:
            SeqIO.write(ospc_gene, f, "fasta")
        shutil.move(f"{seqs}/{ospc_type}_good.fasta", f"{seqs}/{ospc_type}.fasta")

['NC_011724']
['AF029870', 'DQ437444', 'EU482050', 'U91802', 'CP001535']
['EU482052', 'U01892', 'CP001550']


In [8]:
def find_best_frame_both_strands(sequence):
    """Check both strands and all frames, preserve stop codons for inspection"""
    forward_frames = [(i, str(sequence[i:].translate(table=11)), 'forward') for i in range(3)]
    reverse_frames = [(i, str(sequence.reverse_complement()[i:].translate(table=11)), 'reverse') for i in range(3)]
    
    all_frames = forward_frames + reverse_frames
    
    # Sort by number of stops
    sorted_frames = sorted(all_frames, key=lambda x: x[1].count('*'))
    
    #for frame, translation, direction in sorted_frames:
    #    stops = translation.count('*')
    #    if stops > 1:
    #        print(f"{direction} frame {frame}: {stops} stops")
    #        print(translation)
    #        print()
    
    return sorted_frames[0]  # (frame, translation, direction) of best frame

In [55]:
# Now lets make our multifasta:
rec_count = 0
written_ids = []
ospc_seqs = glob.glob(f"{seqs}/*.fasta")
ospc_seq_file_nucl = []
ospc_seq_file_aa = []
for seq in ospc_seqs:
    records = SeqIO.parse(seq, "fasta")
    for record in records:
        written_ids.append(record.id)
        rec_count += 1
        record.id = f"{record.id}_OspC-" + seq.split("/")[-1].split(".")[0]
        record.description = ""
        ospc_seq_file_nucl.append(record)
        
        #while len(record.seq) % 3 != 0:
        #   record.seq += 'N'
        ####record.seq = record.seq.translate(table=11) # use for outputting amino acids.
        #protein = find_best_frame_both_strands(record.seq)
        #record.seq = protein[1].replace('*', '')
        #ospc_seq_file_aa.append(record)
        
print(rec_count)

with open("all_ospc_raw_v5.fna", "w") as f:
    SeqIO.write(ospc_seq_file_nucl, f, "fasta")
print("OspC nucleotide file written!")

#with open("all_ospc_raw_v4.faa", "w") as f:
#    SeqIO.write(ospc_seq_file_aa, f, "fasta")
#print("OspC amino acid file written!")

128
OspC nucleotide file written!


In [56]:
print(len(value_to_keys.keys())) # 127
print(len([item[0][0] for value in fixed_type_to_ref.values() for item in value])) # 124
print(len([rec.id for rec in SeqIO.parse("all_ospc_raw_v5.fna", "fasta")]))
print(len([rec.id for rec in SeqIO.parse("all_ospc_raw_v4.faa", "fasta")]))
all_ids = sorted(value_to_keys.keys())

127
127
128
127


In [34]:
#missing = set(all_ids) - set([x.split('.')[0] for x in written_ids]) # {'DQ437462', 'FJ932733', 'FJ932734'}
#for acc in missing:
#   print(value_to_keys[acc])
# {'dict1': ['C'], 'dict2': ['C']}
# {'dict1': ['H3'], 'dict2': ['H3']}
# {'dict1': ['I3'], 'dict2': ['I3']}

# THIS IS FIXED BY COMMENTING OUT THE PASS IN DICT CREATION AND ACTUALLY INCLUDING IT.

In [57]:
# OKAY NOW TIME FOR CD-HIT
# Run inside of docker container ;)
current_dir = os.getcwd()
for ident in range(90,101):
    output_path_nucl = f'ospC_ref_clustering_v5/nucl/{ident}'
    output_path_aa = f'ospC_ref_clustering_v4/aa/{ident}'
    
    os.makedirs(output_path_nucl, exist_ok=True)
    os.makedirs(output_path_aa, exist_ok=True)
        
    docker_cmd_nucl = [
        'docker', 'run', '-v', f'{current_dir}:/data', '-w', '/opt', 'mjfos2r/cd-hit',
        '/bin/bash', '-c',
        f'cd-hit-est -i /data/all_ospc_raw_v5.fna -o /data/{output_path_nucl}/ospC_v5_{ident} -c {ident/100} -n 5 -g 1 -d 40'
    ]
    docker_cmd_aa = [
        'docker', 'run', '-v', f'{current_dir}:/data', '-w', '/opt', 'mjfos2r/cd-hit',
        '/bin/bash', '-c',
        f'cd-hit -i /data/all_ospc_raw_v4.faa -o /data/{output_path_aa}/ospC_v4_{ident} -c {ident/100} -n 5 -g 1 -d 40 ',
    ]
        
    result = subprocess.run(docker_cmd_nucl, capture_output=True, text=True)
    print('STDOUT:\n', result.stdout)
    print('STDERR:\n', result.stderr)
    result = subprocess.run(docker_cmd_aa, capture_output=True, text=True)
    print('STDOUT:\n', result.stdout)
    print('STDERR:\n', result.stderr)

STDOUT:
Program: CD-HIT, V4.8.1 (+OpenMP), Nov 22 2024, 20:06:10
Command: cd-hit-est -i /data/all_ospc_raw_v5.fna -o
         /data/ospC_ref_clustering_v5/nucl/90/ospC_v5_90 -c 0.9
         -n 5 -g 1 -d 40

Started: Mon Dec  9 22:31:16 2024
                            Output                              
----------------------------------------------------------------
total seq: 128
longest and shortest : 1150 and 476
Total letters: 72904
Sequences have been sorted

Approximated minimal memory consumption:
Sequence        : 0M
Buffer          : 1 X 12M = 12M
Table           : 1 X 0M = 0M
Miscellaneous   : 0M
Total           : 12M

Table limit with the given memory limit:
Max number of representatives: 1184090
Max number of word counting entries: 98449733


comparing sequences from          0  to        128

      128  finished         31  clusters

Approximated maximum memory consumption: 12M
writing new database
writing clustering information
program completed !

Total CPU time 0.23



In [61]:
# cat ospC_v4_100.fna | grep ">" | sed 's/.*-\(.*\)/\1/' | sort | uniq
# Lets check our clusters!
# V4_all:
# ['A', 'A3', 'B', 'B(nt59)', 'B.bissettii_25015', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Db', 'E', 'E3', 
# 'F', 'F3', 'Fa', 'Fb', 'Fc', 'G', 'H', 'Ha', 'Hb', 'I', 'Ia', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
# 'Q', 'R', 'S', 'T', 'U', 'Ua', 'Ub', 'V', 'W', 'X']

# V5_all:
#[ 'A', 'A3', 'B', 'B(nt59)', 'B1', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'Db', 'E', 'E3', 
# 'F', 'F3', 'Fa', 'Fb', 'Fc', 'G', 'H', 'H3', 'Ha', 'Hb', 'I', 'I3', 'Ia', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
# 'Q', 'R', 'S', 'T', 'U', 'Ua', 'Ub', 'V', 'W', 'X' ] 

# V5_nucl Consensus: (at 100% ident)
# ['A', 'A3', 'B', 'B(nt59)', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'E', 'E3', 
#  'F', 'F3', 'Fb', 'Fc', 'G', 'H', 'H3', 'Hb', 'I', 'I3', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
#  'Q', 'R', 'S', 'T', 'U', 'Ub', 'V', 'W', 'X']

v5_input = [ 'A', 'A3', 'B', 'B(nt59)', 'B1', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'Db', 'E', 'E3', 
 'F', 'F3', 'Fa', 'Fb', 'Fc', 'G', 'H', 'H3', 'Ha', 'Hb', 'I', 'I3', 'Ia', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
 'Q', 'R', 'S', 'T', 'U', 'Ua', 'Ub', 'V', 'W', 'X', 'B_bissettii25015'] 

v5_types = ['A', 'A3', 'B', 'B(nt59)', 'B2', 'B3', 'Bb', 'C', 'C3', 'D', 'D3', 'Da', 'E', 'E3', 
  'F', 'F3', 'Fb', 'Fc', 'G', 'H', 'H3', 'Hb', 'I', 'I3', 'Ib', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 
  'Q', 'R', 'S', 'T', 'U', 'Ub', 'V', 'W', 'X', 'B_bissettii25015']

print(len(v5_input))
print(len(v5_types_nucl))

print(sorted(set(v5_input) - set(v5_types))) # ['B1', 'Db', 'Fa', 'Ha', 'Ia', 'Ua']
# FIXING MANUALLY SEE BELOW.

48
41
['B1', 'Db', 'Fa', 'Ha', 'Ia', 'Ua']


In [342]:
# ok time to run clustal and see what's going on. 
clustal_cmd = [
    'docker', 'run',
    '-v', f'{current_dir}:/data',
    '-w', '/data',
    'dnalinux/clustalo',
    '/bin/bash', '-c',
    f'clustalo -i /data/all_ospc_raw_v4.faa -o /data/all_ospc_raw_v4.faa.aln --force'
]
subprocess.run(clustal_cmd)

CompletedProcess(args=['docker', 'run', '-v', '/home/mf019/longread_pangenome/OspC_typing:/data', '-w', '/data', 'dnalinux/clustalo', '/bin/bash', '-c', 'clustalo -i /data/all_ospc_raw_v4.faa -o /data/all_ospc_raw_v4.faa.aln --force'], returncode=0)

In [60]:
# B1 is clustered with other B types so we shall name it B1 since we have B2 and B3. There is no Ba. 
#>Cluster 16
#0	583nt, >AF029861.1_OspC-B... *
#1	504nt, >EU482042.1_OspC-B... at +/100.00%
#2	537nt, >EF537413.1_OspC-B1... at +/100.00%

# B2 and Bb are also the same cluster. 
# Bb is B2 and will be numbered as such.
#>Cluster 6
#0	636nt, >NC_011724.1_OspC-Bb... *
#1	522nt, >U91795.1_OspC-B... at +/100.00%
#2	579nt, >L42868.1_OspC-B2... at +/100.00%

# D is also causing trouble.. 
# D renamed to Db,
#>Cluster 0
#0	1150nt, >L25413.1_OspC-D... *
#1	531nt, >GQ478283.1_OspC-Db... at +/100.00%

## D renamed to Da (cluster named correctly)
#>Cluster 41
#0	504nt, >EU482044.1_OspC-D... at +/100.00%
#1	555nt, >AF029863.1_OspC-Da... *

## D3 is unchanged but should be updated to Dc for consistency.
#>Cluster 59
#0	531nt, >EF592544.1_OspC-D3... *

# Actually none of this makes any sense because these classifications are all over the damn place.
# lets return to Iras' set and look at the lengths he provided.
# update: they're mostly incomplete CDS as well. 
# ok. Amine recommended that we just roll with 98% and rename the clusters. I think I'm going to just drop off the subtype
# and stick to whatever the major type is, regardless of cluster identity. 
# # If it's cluster 16- B or if its Cluster 6- B, it's type B.
# # This is not for me to resolve at this time.