In [5]:
import pandas
import glob
import os

In [2]:
def load_input_metadata(metadata : str) -> pandas.DataFrame:
    input_metadata = pandas.read_csv(metadata)
    return input_metadata

In [6]:
print(os.getcwd())

/Users/mf019/bioinformatics/longread_GWAS/notebooks/OspC_typing


In [10]:

metadata = "../metadata/longread_metadata_v2.csv"

input_metadata = load_input_metadata(metadata)
known_types = input_metadata[['Isolate', 'OspC_Type']]
print(known_types)

             Isolate OspC_Type
0    5A18NP1-JBb08-A       NaN
1             UNY147         J
2    5A18NP1-JBb08-C       NaN
3    5A18NP1-JBb08-B       NaN
4    5A18NP1-JBb08-E       NaN
..               ...       ...
374              K60         A
375              K82         K
376              K11         K
377              K59         K
378              K69         A

[379 rows x 2 columns]


## Next Steps
Ok, so now what we'll do is take the table above, pull all isolates that have a known ospc_type, then pull the ospc annotations from each isolate, then make a tree, then confirm tree layout with known types, then run tree of all known ospc annotations, then use previous labels as classifications for each cluster.

> What are the characteristics that differentiate each OspC type.
>> Can I use motif scanning? Definitely MSA.

# Okay, Ira has sent me a large file of OspC types and AA seqs
I will be taking this, making a blast DB and then I need to isolate the OspC seqs from each assembly, then blast against the OspC database.

In [11]:

ospc_genotype_to_ref = {
     'A' : 'X69596',
   'AT1' : 'EU482041',
   'AT2' : 'EU482042',
   'AT3' : 'EU482043',
   'AT4' : 'EU482044',
   'AT5' : 'EU482045',
   'AT6' : 'EU482046',
   'AT7' : 'EU482047',
   'AT8' : 'EU482048',
   'AT9' : 'EU482049',
  'AT10' : 'EU482050',
  'AT11' : 'EU482051',
  'AT12' : 'EU482052',
  'AT13' : 'EU482053',
  'AT14' : 'EU482054',
  'AT15' : 'EU482055',
  'AT16' : 'EU482056',
    'Ba' : 'EF537413',
    'Bb' : 'NC_011724',
     'C' : 'DQ437462',
    'Da' : 'AF029863',
    'Db' : 'GQ478283',
     'E' : 'AY275221',
    'Fa' : 'AY275225',
    'Fb' : 'EF537433',
    'Fc' : 'GQ478285',
     'G' : 'AY275223',
    'Ha' : 'EU377781',
    'Hb' : 'GQ478286',
    'Ia' : 'AY275219',
    'Ib' : 'EU377752',
     'J' : 'CP001535',
     'K' : 'AY275214',
     'L' : 'EU375832',
     'M' : 'CP001550',
     'N' : 'EU377775',
     'O' : 'FJ997281',
     'T' : 'AY275222',
    'Ua' : 'EU377769',
    'Ub' : 'GQ478287',
    'A3' : 'EF592541',
    'B3' : 'EF592542',
    'C3' : 'EF592543',
    'D3' : 'EF592544',
    'E3' : 'EF592545',
    'F3' : 'EF592547',
    'H3' : 'FJ932733',
    'I3' : 'FJ932734',
    'B.bissettii_25015' : 'U04282'}
len(ospc_genotype_to_ref)

49

In [12]:
# ok now to pull all of those seqs via entrez direct and then make a multifasta file upon which to build a blastdb from
# we will use the following command to pull the sequences from the NCBI nucleotide database using our defined dictionary
# and we will also call the command using the biopython wrapper for entrez direct

#for ospc_type, ref in ospc_genotype_to_ref.items():
#    print(f"esearch -db nucleotide -query {ref} | efetch -format fasta > ospc_seqs/{ospc_type}.fasta")
#
from Bio import Entrez
Entrez.email = "mfoster11@mgh.harvard.edu"
#for ospc_type, ref in ospc_genotype_to_ref.items():
#    print(f"fetching record for {ospc_type} : {ref}")
#    handle = Entrez.efetch(db="nucleotide", id=ref, rettype="fasta", retmode="text")
#    record = handle.read()
#    with open(f"ospc_seqs/{ospc_type}.fasta", "w") as f:
#        print("writing record to file!")
#        f.write(record[0::])
#    handle.close()
#print("done!")

In [13]:
# ok so some of those entries include the entire plasmid sequence not just the ospc gene, so we will need to extract the ospc gene from the plasmid sequence
# the ones that are plasmid sequences are as follows:
# Bb, J, M,
# we will need to extract the ospc gene from these sequences

#for ospc_type in ['Bb', 'J', 'M']:
#    print(ospc_genotype_to_ref[ospc_type])
#
#coords = {
#    'Bb' : (16904,17540),
#    'J' : (16909,17545),
#    'M' : (16916,17555)
#}
#
## ok we already have the full plasmid seqs for each of these three types, we also now have the coordinates
from Bio import SeqIO
#
#for ospc_type in ['Bb', 'J', 'M']:
#    print(f"extracting ospc gene from {ospc_type} sequence")
#    with open(f"ospc_seqs/{ospc_type}.fasta", "r") as f:
#        record = SeqIO.read(f, "fasta")
#        print(record)
#        ospc_gene = record[coords[ospc_type][0]:coords[ospc_type][1]]
#        with open(f"ospc_seqs/{ospc_type}_ospc.fasta", "w") as f:
#            SeqIO.write(ospc_gene, f, "fasta")

In [19]:
import glob

ospc_seqs = glob.glob("ospc_seqs/*.fasta")
print(ospc_seqs)
ospc_seq_file = []
for seq in ospc_seqs:
    record = SeqIO.read(seq, "fasta")
    record.id = "OspC_Type-" + seq.split("/")[-1].split(".")[0]
    print(record.id)
    ospc_seq_file.append(record)
with open("all_ospc.fasta", "w") as f:
    SeqIO.write(ospc_seq_file, f, "fasta")

['ospc_seqs/A.fasta', 'ospc_seqs/AT15.fasta', 'ospc_seqs/I3.fasta', 'ospc_seqs/AT1.fasta', 'ospc_seqs/E3.fasta', 'ospc_seqs/Fa.fasta', 'ospc_seqs/Fc.fasta', 'ospc_seqs/AT3.fasta', 'ospc_seqs/C.fasta', 'ospc_seqs/AT13.fasta', 'ospc_seqs/G.fasta', 'ospc_seqs/Bb.fasta', 'ospc_seqs/AT7.fasta', 'ospc_seqs/AT5.fasta', 'ospc_seqs/AT11.fasta', 'ospc_seqs/Da.fasta', 'ospc_seqs/E.fasta', 'ospc_seqs/Ha.fasta', 'ospc_seqs/B.bissettii_25015.fasta', 'ospc_seqs/AT2.fasta', 'ospc_seqs/Fb.fasta', 'ospc_seqs/AT16.fasta', 'ospc_seqs/C3.fasta', 'ospc_seqs/AT14.fasta', 'ospc_seqs/Ba.fasta', 'ospc_seqs/A3.fasta', 'ospc_seqs/AT4.fasta', 'ospc_seqs/AT10.fasta', 'ospc_seqs/Db.fasta', 'ospc_seqs/AT12.fasta', 'ospc_seqs/Hb.fasta', 'ospc_seqs/AT6.fasta', 'ospc_seqs/J.fasta', 'ospc_seqs/Ub.fasta', 'ospc_seqs/H3.fasta', 'ospc_seqs/D3.fasta', 'ospc_seqs/AT8.fasta', 'ospc_seqs/F3.fasta', 'ospc_seqs/Ia.fasta', 'ospc_seqs/L.fasta', 'ospc_seqs/N.fasta', 'ospc_seqs/Ua.fasta', 'ospc_seqs/AT9.fasta', 'ospc_seqs/B3.fasta', 

In [20]:
# ok now lets make a local blast DB using biopython
from Bio.Blast.Applications import NcbimakeblastdbCommandline
cmd = NcbimakeblastdbCommandline(input_file="all_ospc.fasta", dbtype="nucl", out="ospc_blastdb/ospc")
print(cmd)
stdout, stderr = cmd()
print(stdout)
print(stderr)

makeblastdb -out ospc_blastdb/ospc -dbtype nucl -in all_ospc.fasta


Building a new DB, current time: 03/04/2024 11:11:38
New DB name:   /Users/mf019/bioinformatics/longread_GWAS/notebooks/OspC_typing/ospc_blastdb/ospc
New DB title:  all_ospc.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Users/mf019/bioinformatics/longread_GWAS/notebooks/OspC_typing/ospc_blastdb/ospc
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 49 sequences in 0.0098331 seconds.






In [70]:
# ok now lets pull all of the ospCs from all assemblies and then blast them against our local blastdb!
# we will use the following command to pull the sequences from the NCBI nucleotide database using our defined dictionary
print(os.getcwd())
assemblies = glob.glob("../../assemblies/*/annotation/*/*.gbff")
print(assemblies)

ospc_identifiers = [
    "surface lipoprotein",
    "outer surface protein",
    "outer surface lipoprotein",
    "outer surface protein",
    "Major outer surface protein",
    "ospC",
    "OspC",
    "Ospc",
    "ospc",
    "outer surface protein C",
    "Outer surface protein C",
    "outer surface lipoprotein C",
    "Outer surface lipoprotein C",
    "outer surface protein c",
]

/Users/mf019/bioinformatics/longread_GWAS/notebooks/OspC_typing
['../../assemblies/pacbio/annotation/MR616/MR616.gbff', '../../assemblies/pacbio/annotation/B356/B356.gbff', '../../assemblies/pacbio/annotation/B500/B500.gbff', '../../assemblies/pacbio/annotation/MC123/MC123.gbff', '../../assemblies/pacbio/annotation/B247/B247.gbff', '../../assemblies/pacbio/annotation/B418/B418.gbff', '../../assemblies/pacbio/annotation/MC149/MC149.gbff', '../../assemblies/pacbio/annotation/MR614/MR614.gbff', '../../assemblies/pacbio/annotation/MC104/MC104.gbff', '../../assemblies/pacbio/annotation/MC105/MC105.gbff', '../../assemblies/pacbio/annotation/MR641/MR641.gbff', '../../assemblies/pacbio/annotation/BL224/BL224.gbff', '../../assemblies/pacbio/annotation/B348/B348.gbff', '../../assemblies/pacbio/annotation/B331/B331.gbff', '../../assemblies/pacbio/annotation/BL206/BL206.gbff', '../../assemblies/pacbio/annotation/BL522/BL522.gbff', '../../assemblies/pacbio/annotation/B483/B483.gbff', '../../assembl

In [79]:
ospc_genes = {}
for file in assemblies:
    #print(file)
    sample_id = file.split("/")[-1].split(".")[0].split("_")[0]
    ospc_genes[sample_id] = []
    with open(file, "r") as f:
        record = SeqIO.parse(f, "genbank")
        for rec in record:
            #print(rec.id)
            for feature in rec.features:
                if feature.type == "CDS":
                    product = feature.qualifiers['product']
                    if any(x in product[0] for x in ospc_identifiers):
                        #feature.qualifiers['product']
                        # get sequence
                        #print(feature.location)
                        ospc_genes[sample_id].append({product[0]:feature.extract(rec.seq)})

In [80]:
import pprint
pprint.pprint(ospc_genes)

{'5A18NP1-JBb08-A': [{'outer surface protein OspD': Seq('ATGAAAAAATTAATAAAAATACTACTGTTAAGTTTATTTTTATTGCTCTCAATA...TAA')},
                     {'outer surface lipoprotein OspA': Seq('ATGAAAAAATATTTATTGGGAATAGGTCTAATATTAGCCTTAATAGCATGTAAG...TAA')},
                     {'outer surface lipoprotein OspA': Seq('ATGAAAAAATATTTATTGGGAATAGGTCTAATATTAGCCTTAATAGCATGTAAG...TAA')},
                     {'outer surface protein OspC': Seq('ATGAAAAAGAATACATTAAGTGCAATATTAATGACTTTATTTTTATTTATATCT...TAA')},
                     {'outer surface protein OspC': Seq('ATGAAAAAGAATACATTAAGTGCAATATTAATGACTTTATTTTTATTTATATCT...TAA')},
                     {'Borrelia outer surface protein E': Seq('ATGAACAATGTTTCAGAAAAAAATCAAGAAATGCAAAATAATATTCAAGCAAAA...TGA')},
                     {'outer surface protein OspD': Seq('ATGAAAAAATTAATAAAAATACTACTGTTAAGTTTATTTTTATTGCTCTCAATA...TAA')},
                     {'outer surface protein OspD': Seq('ATGAAAAAATTAATAAAAATACTACTGTTAAGTTTATTTTTATTGCTCTCAATA...TAA')},
          