In [1]:
# Mount Google Drive in Colab
from google.colab import drive
drive.mount('/content/drive')

# After mounting, switch the working directory to the project folder
import os
os.chdir('/content/drive/MyDrive/genomes_test/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m97.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [4]:
import sys
import subprocess
import pkg_resources
import os
from Bio import SeqIO
import csv
from tqdm import tqdm

# Check that a distribution is installed and pip-install it when it is missing
def install_and_import(package):
    """Install `package` with pip when no installed distribution has that name.

    Despite the function's name, nothing is imported here: the function only
    guarantees that an install was attempted for a missing distribution.

    Args:
        package: Bare distribution name as known to pip (e.g. 'biopython'),
            which may differ from the import name (e.g. 'Bio').

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    # importlib.metadata is the standard-library replacement for the
    # deprecated pkg_resources API; imported locally so this function does
    # not depend on setuptools being installed.
    from importlib import metadata

    try:
        metadata.distribution(package)
    except metadata.PackageNotFoundError:
        print(f"Installing {package}...")
        # Run pip through the current interpreter so the package lands in
        # the same environment this code is executing in.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Required packages (pip distribution names)
required_packages = ['biopython', 'tqdm']

# Install any required package that is missing.
# NOTE(review): this loop runs after `from Bio import SeqIO` and
# `from tqdm import tqdm` earlier in this cell, so a missing package would
# already have raised ImportError before this check is reached.
for package in required_packages:
    install_and_import(package)

# Paths on Google Drive (modify as needed)
annotation_file = "/content/drive/MyDrive/genomes_test/CreinhardtiiCC_4532_707_v6.1.annotation_info.txt"  # Annotation file
fasta_file = "/content/drive/MyDrive/genomes_test/CreinhardtiiCC_4532_707_v6.1.protein.fa"  # Proteome FASTA file
output_dir = "/content/drive/MyDrive/genomes_test/"  # Output directory

# Make sure the output directory is present before anything is written to it
def ensure_output_dir(output_dir):
    """Create `output_dir` (including missing parents) unless the path exists.

    Args:
        output_dir: Path of the directory that should exist afterwards.

    Returns:
        None.
    """
    already_there = os.path.exists(output_dir)
    if already_there:
        return
    os.makedirs(output_dir)

# Select annotation rows whose rice best-hit description contains a search
# term (case-insensitive substring match)
def filter_annotations(annotation_file, keywords=("eukaryotic translation",)):
    """Collect peptides whose 'Best-hit-rice-defline' contains any keyword.

    The file is read as a tab-delimited table with a header row; the columns
    'peptideName' and 'Best-hit-rice-defline' are used.

    Args:
        annotation_file: Path to the tab-delimited annotation table.
        keywords: Substrings to search for, matched case-insensitively. A
            single string is treated as one keyword. The default is the term
            this function searched for before the parameter existed.

    Returns:
        dict mapping peptideName -> Best-hit-rice-defline for every matching
        row. When a peptideName matches on several rows, the last row wins.

    Raises:
        KeyError: if a data row is read and one of the two columns is absent
            from the header.
    """
    if isinstance(keywords, str):
        keywords = (keywords,)
    needles = [keyword.lower() for keyword in keywords]

    selected_proteins = {}
    # newline="" is what the csv module documentation asks for on file objects
    with open(annotation_file, "r", newline="") as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            # DictReader fills fields missing from a short row with None;
            # treat that as an empty description instead of crashing.
            defline = row['Best-hit-rice-defline'] or ""
            haystack = defline.lower()
            if any(needle in haystack for needle in needles):
                # peptideName is the key, the original-case defline the value
                selected_proteins[row['peptideName']] = defline
    return selected_proteins

# Pull the selected records out of the proteome FASTA and tag each header
# with its Best-hit-rice-defline
def extract_sequences(fasta_file, selected_proteins):
    """Return the FASTA records whose id is a key of `selected_proteins`.

    Every kept record has " | <value>" appended to its description, where
    <value> is the entry stored under the record's id in `selected_proteins`.

    Args:
        fasta_file: Path (or handle) passed to SeqIO.parse as FASTA.
        selected_proteins: Mapping of record id -> text to append.

    Returns:
        list of the matching records, in the order they appear in the file.
    """
    matched_records = []
    for rec in SeqIO.parse(fasta_file, "fasta"):
        if rec.id not in selected_proteins:
            continue
        defline = selected_proteins[rec.id]
        rec.description = rec.description + f" | {defline}"
        matched_records.append(rec)
    return matched_records

# Write the mined sequences to a FASTA file
def save_sequences(sequences, output_file):
    """Write `sequences` to `output_file` in FASTA format.

    The file is opened in text write mode, so existing content is replaced.

    Args:
        sequences: Records to hand to SeqIO.write.
        output_file: Destination path.
    """
    fasta_out = open(output_file, "w")
    try:
        SeqIO.write(sequences, fasta_out, "fasta")
    finally:
        # Always release the handle, even if writing fails part-way
        fasta_out.close()

# Main entry point: filter annotations, pull matching sequences, write FASTA
def process_annotations_and_proteome():
    """Run the whole pipeline using the module-level path settings.

    Reads `annotation_file`, `fasta_file` and `output_dir` from module scope,
    writes 'selected_proteins_with_description.fasta' into `output_dir`, and
    prints a short progress summary.

    Returns:
        None.
    """
    # Make sure the output directory exists
    ensure_output_dir(output_dir)

    # Select the proteins of interest from the annotation table
    selected_proteins = filter_annotations(annotation_file)
    # The message names the term the filter actually searches for; it
    # previously claimed 'defensin' or 'lectin', which the filter never used.
    print(f"Total proteins found with 'eukaryotic translation' in description: {len(selected_proteins)}")

    # Fetch the matching proteome sequences and append the
    # Best-hit-rice-defline to each header
    sequences_found = extract_sequences(fasta_file, selected_proteins)
    print(f"Total sequences extracted: {len(sequences_found)}")

    # Save the sequences to a FASTA file
    output_fasta = os.path.join(output_dir, "selected_proteins_with_description.fasta")
    save_sequences(sequences_found, output_fasta)
    print(f"Sequences saved to {output_fasta}")

# Run the processing pipeline
process_annotations_and_proteome()


Total proteins found with 'defensin' or 'lectin' in description: 30
Total sequences extracted: 30
Sequences saved to /content/drive/MyDrive/genomes_test/selected_proteins_with_description.fasta
