# In [None]:
import csv
from bioservices import UniProt
import os
import re
import logging

# Configure the root logger when the module is loaded: INFO and above,
# each record prefixed with a timestamp and its level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_gene_list(file_path="genes.txt"):
    """Load gene names from a text file, one name per line.

    Each line is stripped of surrounding whitespace and blank lines are
    dropped. A warning is logged when no names are found. If the file does
    not exist, an error is logged and an empty list is returned.
    """
    genes = []
    try:
        with open(file_path, "r") as handle:
            for raw_line in handle:
                name = raw_line.strip()
                if name:
                    genes.append(name)
    except FileNotFoundError:
        logging.error(f"File {file_path} not found.")
        return []

    if not genes:
        logging.warning("Gene list is empty.")
    return genes

def get_organism_list(file_path="organisms.txt"):
    """Load organism names from a text file, one name per line.

    Surrounding whitespace is removed from every line and lines that end up
    empty are skipped. Logs a warning for an empty result, and logs an error
    and returns an empty list when the file cannot be found.
    """
    try:
        with open(file_path, "r") as source:
            stripped_lines = map(str.strip, source)
            organisms = [entry for entry in stripped_lines if entry]
    except FileNotFoundError:
        logging.error(f"File {file_path} not found.")
        return []

    if len(organisms) == 0:
        logging.warning("Organism list is empty.")
    return organisms

def get_user_input():
    """Ask on standard input for the project ID and return the text entered."""
    project_id = input("Enter the project ID: ")
    return project_id

def create_project_folder(project_id):
    """Ensure a directory named after *project_id* exists and return its path.

    The directory (and any missing parents) is created if needed; an
    already existing directory is not an error.
    """
    # format() with no spec yields the same text an f-string placeholder would.
    folder_path = format(project_id)
    os.makedirs(folder_path, exist_ok=True)
    logging.info(f"Project folder created: {folder_path}")
    return folder_path

def filter_entries_for_organism(result_lines, organism, limit=1):
    """Pick the UniProt ID and length from result lines that mention *organism*.

    Args:
        result_lines: Iterable of tab-separated result lines. Column 0 is
            read as the UniProt ID and column 6 as the amino-acid length.
            NOTE(review): this presumably matches UniProt's default TSV
            column layout; confirm against the query that produces the lines.
        organism: Text to look for. The match is a plain substring test
            against the whole line, not only an organism column.
        limit: Stop once this many entries have been collected. A value the
            count can never equal (for example ``0`` or ``None``) returns
            every match.

    Returns:
        A list of ``(uniprot_id, amino_acid_length)`` tuples in input order.
        Matching lines with fewer than seven columns are skipped with a
        warning instead of raising ``IndexError``.
    """
    length_column = 6
    organism_entries = []
    for line in result_lines:
        if organism not in line:
            continue

        # Drop a stray line terminator so it cannot leak into the last column.
        columns = line.rstrip('\r\n').split('\t')
        if len(columns) <= length_column:
            logging.warning(
                f"Skipping malformed UniProt result line for {organism}: "
                f"expected at least {length_column + 1} columns, got {len(columns)}."
            )
            continue

        organism_entries.append((columns[0], columns[length_column]))
        if len(organism_entries) == limit:
            break
    return organism_entries

def clean_gene_name(gene):
    """Return *gene* with everything except ASCII letters and digits removed."""
    non_alphanumeric = re.compile(r'[^A-Za-z0-9]+')
    cleaned = non_alphanumeric.sub('', gene)
    return cleaned

def get_protein_sequence_with_fasta(uniprot_id):
    """Retrieve the FASTA record for a UniProt ID.

    Args:
        uniprot_id: Identifier passed to ``UniProt.retrieve``.

    Returns:
        The FASTA text returned by ``UniProt.retrieve(..., frmt="fasta")``,
        or ``None`` when the request raised, returned an integer, or
        returned an empty value. Every failure is logged.
    """
    try:
        u = UniProt()
        fasta_sequence = u.retrieve(uniprot_id, frmt="fasta")
    except Exception as e:
        logging.error(f"Error fetching sequence for {uniprot_id}: {str(e)}")
        return None

    # NOTE(review): bioservices is understood to report a failed HTTP request
    # by returning the numeric status code (e.g. 404) rather than raising;
    # confirm against the installed version. Such a code is truthy, so without
    # this check a caller would treat it as a sequence.
    if isinstance(fasta_sequence, int):
        logging.error(
            f"Error fetching sequence for {uniprot_id}: "
            f"UniProt returned status {fasta_sequence}"
        )
        return None

    if not fasta_sequence:
        logging.warning(f"No sequence returned for {uniprot_id}.")
        return None

    return fasta_sequence

def process_genes(organisms, project_id, genes, project_folder, output_csv_filename):
    """Look up every gene in UniProt for every organism and write CSV reports.

    For each gene, the cleaned gene name is searched once in UniProt. For each
    organism, the first matching result line supplies the UniProt ID and
    length, and the FASTA record is fetched for that ID.

    Args:
        organisms: Organism names to look for in the search results.
        project_id: Not used by this function; kept so existing callers
            keep working.
        genes: Gene names; each is passed through ``clean_gene_name``.
        project_folder: Folder that receives
            ``Unavailable-Genes-in-Uniprot.csv`` when any lookup fails.
        output_csv_filename: Path of the results CSV. It is overwritten and
            has the columns Gene, Organism, UniProt ID, Amino Acid Length,
            Sequence.

    Returns:
        None. Gene/organism pairs with no search result, no matching entry
        or no retrievable sequence are written to the unavailable-genes CSV.
    """
    uniprot = UniProt()
    not_found_genes = []

    with open(output_csv_filename, "w", newline='', encoding='utf-8') as output_csv:
        csv_writer = csv.writer(output_csv)
        csv_writer.writerow(["Gene", "Organism", "UniProt ID", "Amino Acid Length", "Sequence"])

        for gene in genes:
            clean_gene = clean_gene_name(gene)

            # The query depends only on the gene, so run it once per gene
            # rather than repeating the same request for every organism.
            try:
                result = uniprot.search(f"gene:{clean_gene}", frmt="tsv")
            except Exception as e:
                # One failed request should not abort the whole batch.
                logging.error(f"Error searching UniProt for {clean_gene}: {str(e)}")
                result = None

            # Anything that is not non-empty text (e.g. None or a numeric
            # status code) is treated as "no results" instead of crashing
            # on .strip().
            if isinstance(result, str) and result:
                lines = result.strip().split('\n')
            else:
                lines = []

            for organism in organisms:
                organism_entries = filter_entries_for_organism(lines, organism, limit=1)
                if not organism_entries:
                    not_found_genes.append((clean_gene, organism))
                    continue

                uniprot_id, amino_acid_length = organism_entries[0]
                fasta_sequence = get_protein_sequence_with_fasta(uniprot_id)
                if not fasta_sequence:
                    not_found_genes.append((clean_gene, organism))
                    continue

                csv_writer.writerow([clean_gene, organism, uniprot_id, amino_acid_length, fasta_sequence])

    # Save the pairs that could not be resolved so they can be reviewed later.
    if not_found_genes:
        not_found_csv_filename = f"{project_folder}/Unavailable-Genes-in-Uniprot.csv"
        with open(not_found_csv_filename, "w", newline='', encoding='utf-8') as not_found_csv:
            not_found_writer = csv.writer(not_found_csv)
            not_found_writer.writerow(["Gene", "Organism"])
            not_found_writer.writerows(not_found_genes)
        logging.info(f"Unavailable genes saved to {not_found_csv_filename}")

def main():
    """Run the workflow: prompt for a project, load inputs, query UniProt.

    Prompts for the project ID, creates the project folder, then loads the
    organism and gene lists. If either list is empty an error is logged and
    nothing else happens; otherwise the genes are processed into
    ``Gene-Uniprot-Data.csv`` inside the project folder.
    """
    project_id = get_user_input()
    project_folder = create_project_folder(project_id)

    organisms = get_organism_list()
    genes = get_gene_list()

    if organisms and genes:
        results_csv_path = f"{project_folder}/Gene-Uniprot-Data.csv"
        process_genes(organisms, project_id, genes, project_folder, results_csv_path)
        logging.info("Sequences retrieved successfully.")
    else:
        logging.error("Gene or organism list is empty. Exiting.")

# Run the workflow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()