In [2]:
import os
import ssl
import certifi
import urllib.request

# Force the use of certifi's certificate bundle.
os.environ["SSL_CERT_FILE"] = certifi.where()

# Check that a secure connection to NCBI can be opened with that bundle.
print("Certificado SSL forçado para:", os.environ["SSL_CERT_FILE"])
# The context manager closes the connection once the status has been read;
# the timeout keeps the cell from hanging forever on an unresponsive network.
with urllib.request.urlopen("https://www.ncbi.nlm.nih.gov", timeout=30) as response:
    print("Conexão segura com NCBI! Status:", response.status)


Certificado SSL forçado para: /Users/mariacarvalho/Documents/GitHub/Bioinformatics-Project/.venv/lib/python3.9/site-packages/certifi/cacert.pem
Conexão segura com NCBI! Status: 200


This script retrieves and processes information on the genomes of phages that infect *Escherichia coli* from the GenBank database. It uses the Biopython library to query GenBank for relevant genome entries and extracts information for each phage genome.

In [33]:
import pandas as pd
import re
from Bio import Entrez
from Bio import SeqIO

# Contact e-mail registered on Biopython's Entrez module; set once here, before any Entrez call below.
# NOTE(review): a personal address is hard-coded in the notebook — consider reading it from an
# environment variable before sharing.
Entrez.email = "pocawaves@gmail.com"

def search_phages_hosts(host, retmax=100):
    """Search the NCBI nucleotide database for phage entries linked to a host.

    The term sent to Entrez is ``"<host>[host] AND phage[title]"``.

    Args:
        host: Host organism name to search for, e.g. ``"Escherichia coli"``.
        retmax: Maximum number of IDs to request from Entrez. Defaults to 100.

    Returns:
        The ``IdList`` entry of the parsed esearch response.
    """
    handle = Entrez.esearch(
        db="nucleotide",
        term=f"{host}[host] AND phage[title]",
        retmax=retmax,
    )
    try:
        record = Entrez.read(handle)
    finally:
        # Release the connection even when the response cannot be parsed.
        handle.close()
    return record["IdList"]

def _source_hosts(record):
    """Return ``(host, lab_host)`` read from the record's ``source`` features."""
    host = None
    lab_host = None
    for feature in record.features:
        if feature.type != "source":
            continue
        # Only overwrite when the qualifier is present and non-empty, so a
        # later source feature without it cannot erase a value already found.
        host_values = feature.qualifiers.get("host")
        if host_values:
            host = host_values[0]
        lab_host_values = feature.qualifiers.get("lab_host")
        if lab_host_values:
            lab_host = lab_host_values[0]
    return host, lab_host


def genomes(ids):
    """Fetch GenBank records one by one and summarise each as a table row.

    Args:
        ids: Iterable of nucleotide database IDs, as returned by an Entrez search.

    Returns:
        A list of ``[accession, host, lab_host, phage_description]`` lists, one
        per ID and in the same order. ``host`` and ``lab_host`` are ``None``
        when no ``source`` feature carries the qualifier.

    Side effects:
        Prints one ``Genome: <id> - <description>`` line per fetched record.
    """
    data = []

    for genome_id in ids:
        handle = Entrez.efetch(db="nucleotide", id=genome_id, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            # Close the connection even when the record cannot be parsed.
            handle.close()
        print(f"Genome: {record.id} - {record.description}")

        host, lab_host = _source_hosts(record)

        # Drop "complete genome", "Escherichia" and commas from the description.
        # NOTE(review): "partial genome" is not matched by this pattern, so it
        # stays in the description for partial records.
        phage_description = re.sub(r"(complete genome|Escherichia|,)", "", record.description).strip()

        data.append([record.id, host, lab_host, phage_description])

    return data

# Look up the nucleotide IDs of phage entries whose host is Escherichia coli.
ids_fagos = search_phages_hosts("Escherichia coli")

# Download every matching record and collect [accession, host, lab_host, description] rows.
data = genomes(ids_fagos)

Genome: PV287706.1 - Escherichia phage Midge, complete genome
Genome: PV252060.1 - Escherichia phage nithesis, complete genome
Genome: PV204681.1 - Escherichia phage vB_EcoM_JQD51, complete genome
Genome: PP453689.1 - Escherichia phage vB_EcoP_LHP, complete genome
Genome: PV245935.1 - Escherichia phage kaset, complete genome
Genome: PV245934.1 - Escherichia phage sutha, complete genome
Genome: PV245933.1 - Escherichia phage nasanit, complete genome
Genome: PQ738613.1 - Escherichia phage PhiKM, partial genome
Genome: PV287714.1 - Escherichia phage HMD-9, complete genome
Genome: PV191315.1 - Escherichia phage vB_EcoM-P3322, complete genome
Genome: PQ474795.1 - Escherichia phage vB_Eco_AMO_3701M, complete genome
Genome: NC_105115.1 - Escherichia phage IrmaTschudi, complete genome
Genome: NC_105113.1 - Escherichia phage GreteKellenberger, complete genome
Genome: PP481222.1 - Escherichia phage vB_EcoM_BL, complete genome
Genome: PP481221.1 - Escherichia phage vB_EcoP_IUE, complete genome
Ge

This part of the script organizes the data into a **pandas DataFrame**. The DataFrame is structured with four columns: **Accession**, **Host**, **Lab_Host** and **Phage Description**. This format makes it easier to analyze, manipulate, and export the data as needed.

In [35]:
# Build the table from the collected rows, one row per GenBank record.
df = pd.DataFrame(data, columns=["Accession", "Host", "Lab_Host", "Phage Description"])

# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,Accession,Host,Lab_Host,Phage Description
0,PV287706.1,Escherichia coli 4s,,phage Midge
1,PV252060.1,Escherichia coli,Escherichia coli M493,phage nithesis
2,PV204681.1,Escherichia coli O157,,phage vB_EcoM_JQD51
3,PP453689.1,Escherichia coli K12 (RP4),,phage vB_EcoP_LHP
4,PV245935.1,Escherichia coli,Escherichia coli M491,phage kaset
...,...,...,...,...
95,PQ657771.1,Escherichia coli,,phage EC.P3
96,PQ551090.1,Escherichia coli CB10,,phage GR8
97,PQ488502.1,Escherichia coli,,phage BUCT632
98,PQ783785.1,Escherichia coli HS(pFamp)R (ATCC 700891),,phage PDP46


In [36]:
# Write the table to phage_host.csv (relative path); index=False leaves out the row-index column.
df.to_csv("phage_host.csv", index=False)