--- Script fornecido pelo Grupo 7 e adaptado para o nosso trabalho ---

# *Klebsiella pneumoniae*

## Através do GenBank

In [125]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio import Entrez
from Bio import SeqIO

#genes = ["KPC-2", "CTX-M-14", "MdtC"]
#genes = ["KPHS_p200360", "KPHS_p100340", "KPHS_35880"]
genes = ["NC_016846.1", "NC_016838.1", "NC_016845.1"]

#Aceder ao NCBI
Entrez.email = 'pg45967@uminho.pt'
handle = Entrez.efetch(db = "nucleotide", id = "CP003200.1", rettype = "gb", retmode = "text")
seq_record = SeqIO.read(handle, "gb")

#Criar um ficheiro
SeqIO.write(seq_record, f"klebsiella_pneumoniae.gb", "gb")
handle.close()

#Aceder aos dados
info = SeqIO.read("klebsiella_pneumoniae.gb", "gb") 

f = open(f"klebsiella_pneumoniae.gb")
info = SeqIO.read(f, "gb")

#print(info)
print(f"Gene Id: {info.id}")
print(f"Description: {info.description}")

taxonomia = ''.join(f"{m} | " for m in info.annotations["taxonomy"])
print(f"Taxonomy: {taxonomia[:-2]}")

print()
for i, ref in enumerate(info.annotations["references"]): print(f"\t-------- REFERENCE {i + 1} --------\n\n{ref}")

Gene Id: CP003200.1
Description: Klebsiella pneumoniae subsp. pneumoniae HS11286, complete genome
Taxonomy: Bacteria | Proteobacteria | Gammaproteobacteria | Enterobacterales | Enterobacteriaceae | Klebsiella/Raoultella group | Klebsiella 

	-------- REFERENCE 1 --------

location: [0:5333942]
authors: Liu,P., Li,P., Jiang,X., Bi,D., Xie,Y., Tai,C., Deng,Z., Rajakumar,K. and Ou,H.Y.
title: Complete Genome Sequence of Klebsiella pneumoniae subsp. pneumoniae HS11286, a Multidrug-Resistant Strain Isolated from Human Sputum
journal: J. Bacteriol. 194 (7), 1841-1842 (2012)
medline id: 
pubmed id: 22408243
comment: 

	-------- REFERENCE 2 --------

location: [0:5333942]
authors: Ou,H.-Y., Jiang,X., Liu,P. and Li,P.
title: Direct Submission
journal: Submitted (14-DEC-2011) State Key Laboratory of Microbial Metabolism, Shanghai Jiaotong University, 1954 Huashan Road, Shanghai 200030, China
medline id: 
pubmed id: 
comment: 



In [130]:
#print(info.features)

data = []

for value in info.features:
    try:
        gene_name = value.qualifiers["gene"][0]
    except KeyError:
        continue
    if value.type == "CDS" and gene_name in genes:
        #print(value)
        print(f"Gene Name: {gene_name}")
        print("Type:", value.type)
        locus_tag = value.qualifiers["locus_tag"][0]
        print(f"Gene Locus: {locus_tag}")
        print(f"Location: {value.location}")
        protein_id = value.qualifiers["protein_id"][0]
        pt_product = value.qualifiers["product"][0]
        description = ''.join(f"Protein {protein_id} has the function: {pt_product}")
        protein = ''.join(value.qualifiers["translation"][0])
        print(f"{description}\n-------- Sequence --------\n{protein}")
        data.append([gene_name, locus_tag, value.type, value.location, protein_id, pt_product[:46], f"{protein[:20]}..."])
        print("\n\t--------------------//--------------------\n")
        
    """elif value.type == "rRNA" and gene_name in genes:
        #print(value)
        print(f"Gene Name: {gene_name}")
        print("Type:", value.type)
        print(f"Location: {value.location}")
        prot_obt = value.qualifiers["product"][0]
        print(f"Protein Obtained: {prot_obt}")
        data.append([gene_name, "----", value.type, value.location, "----", prot_obt[:46], "----"])
        print("\n\t--------------------//--------------------\n")"""

#for i in info.features:
    #print(i)

## Matriz com as infos

In [131]:
import numpy as np
import pandas as pd
from tabulate import tabulate

info_firstline = ["Gene Name", "Gene Locus", "Type", "Location", "Protein ID", "Protein Function", "Protein Seq"]

numpy_data = np.array(data)
numpy_data_frame = numpy_data.reshape((3, 7))

info_genes = pd.DataFrame(data = numpy_data_frame, columns = info_firstline)
print(tabulate(info_genes, headers='keys', tablefmt='psql'))

ValueError: cannot reshape array of size 0 into shape (3,7)

## Obtenção do ficheiro xml com info do BLAST

### Place in the input which gene you want to BLAST and retrieve the information

In [43]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW 

while True:
    try:
        x = input("Which gene are you looking for (KPC-2, CTX-M-14 or MdtC):")
        if x not in ["KPC-2", "CTX-M-14", "MdtC"]: 
            raise ValueError("Inserted gene not valid!")
        break
    except ValueError as error:
        print(error)

f = open(f"{x}.fasta")
seq = SeqIO.read(f, "fasta")

result_handle = NCBIWWW.qblast("blastn", "nt", seq.format("fasta"))
save_file = open(f"{x}_BLAST.xml", "w")
save_file.write(result_handle.read())
save_file.close()
result_handle.close()

Which gene are you looking for (KPC-2, CTX-M-14 or MdtC): MdtC


## Análise Ficheiro xml

In [44]:
from Bio.Blast import NCBIXML
import json

result_handle = open(f"{x}_BLAST.xml")
blast_records = NCBIXML.parse(result_handle)

allhits = []
hits = []
ids = []
for blast_record in blast_records:
    for alignment in blast_record.alignments: 
        for hsp in alignment.hsps:
            dict_all = {}
            info_hit = []
            if alignment.hit_id.split("|")[3] not in ids:
                info_hit.extend((alignment.hit_def, alignment.hit_id.split("|")[3], alignment.length, hsp.expect, hsp.score, alignment.accession))
                dict_all["Info"] = alignment.hit_def
                dict_all["Hit"] = alignment.hit_id.split("|")[3]
                ids.append(alignment.hit_id.split("|")[3])
                dict_all["Lenght"] = alignment.length
                dict_all["E-value"] = hsp.expect
                dict_all["Score"] = hsp.score
                dict_all["Accession Number"] = alignment.accession
                allhits.append(dict_all)
                hits.append(info_hit)
            
print(f"Number of hits: {len(allhits)}")
print(json.dumps(allhits, indent = 3))

Number of hits: 50
[
   {
      "Info": "Klebsiella pneumoniae strain F17KP0054 chromosome, complete genome",
      "Hit": "CP052136.1",
      "Lenght": 5357257,
      "E-value": 0.0,
      "Score": 6156.0,
      "Accession Number": "CP052136"
   },
   {
      "Info": "Klebsiella pneumoniae isolate 97ca48c2-b809-11e8-aae5-3c4a9275d6c8 genome assembly, chromosome: 1 >gi|1772506384|emb|LR736022.1| Klebsiella pneumoniae strain Kp_17035 genome assembly, chromosome: 1",
      "Hit": "LR596810.1",
      "Lenght": 5577545,
      "E-value": 0.0,
      "Score": 6156.0,
      "Accession Number": "LR596810"
   },
   {
      "Info": "Klebsiella pneumoniae strain JS187 chromosome, complete genome",
      "Hit": "CP025466.1",
      "Lenght": 5359967,
      "E-value": 0.0,
      "Score": 6156.0,
      "Accession Number": "CP025466"
   },
   {
      "Info": "Klebsiella pneumoniae strain JS706 chromosome, complete genome",
      "Hit": "CP070970.1",
      "Lenght": 5339678,
      "E-value": 0.0,
      

In [22]:
import numpy as np
import pandas as pd
from tabulate import tabulate

info_firstline = ["Organism", "Identifier", "Lenght", "E-Value", "Score", "Accession Number"]

numpy_data = np.array(hits)
numpy_data_frame = numpy_data.reshape((len(numpy_data), 6))

info_genes = pd.DataFrame(data = numpy_data_frame, columns = info_firstline)
print(tabulate(info_genes, headers='keys', tablefmt='psql'))

+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+----------+-----------+---------+--------------------+
|    | Organism                                                                                                                                                                                              | Identifier   |   Lenght |   E-Value |   Score | Accession Number   |
|----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+----------+-----------+---------+--------------------|
|  0 | Klebsiella pneumoniae strain F17KP0054 chromosome, complete genome                                                                                                   