## Bioinformatics to analyze tad/Cpa genes

In [1]:


from Bio import Entrez, SeqIO
import pandas as pd 
import re

In [4]:
# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"

# Define the RefSeq accession number for the genome of interest
refseq_accession = "NC_011916"  # Replace with the actual RefSeq accession number

# Fetch the entire GenBank file using the RefSeq accession number
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=refseq_accession)
genbank_data = handle.read()
handle.close()

# Save the GenBank data to a file
with open("genome_genbank_file.gb", "w") as file:
    file.write(genbank_data)

In [3]:
# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"

# Define the organism name for the genome of interest
organism_name = "Caulobacter vibrioides NA1000"

# Search for the organism and retrieve its genome data
search_query = f"{organism_name}[Organism] AND complete genome[Title]"
handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=1)
record = Entrez.read(handle)
if record["Count"] == "0":
    print(f"No genome found for {organism_name}.")
    exit()
genome_accession = record["IdList"][0]
handle.close()

# Fetch the genome data using the accession number
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=genome_accession)
genome_record = SeqIO.read(handle, "genbank")
handle.close()

In [6]:
# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"


# Initialize empty lists to store gene names that meet and do not meet the criteria
gene_names_meet_criteria = []
gene_names_do_not_meet_criteria = []

# Search for genes with names matching the pattern "cpa" followed by any letter
gene_pattern = re.compile(r'cpa.*', re.IGNORECASE)

for feature in genome_record.features:
    if feature.type == "CDS":
        gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]
        if gene_pattern.match(gene_name):
            # Gene names that meet the criteria
            gene_names_meet_criteria.append(gene_name)
        else:
            # Gene names that do not meet the criteria
            gene_names_do_not_meet_criteria.append(gene_name)

# Create dataframes for the lists of gene names
meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_meet_criteria})
do_not_meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_do_not_meet_criteria})

# Print the dataframes or lists
print("Gene Names that Meet the Criteria:")
print(meet_criteria_df)

print("\nGene Names that Do Not Meet the Criteria:")
print(do_not_meet_criteria_df)



Gene Names that Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []

Gene Names that Do Not Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []


In [9]:


# Initialize empty lists to store gene information
gene_data = []

# Search for genes with names matching the pattern "Cpa" followed by any letter
gene_pattern = re.compile(r'cpa.*', re.IGNORECASE)

for feature in genome_record.features:
    if feature.type == "CDS":
        gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]
        if gene_pattern.match(gene_name):
            location = str(feature.location)
            locus_tag = feature.qualifiers.get('locus_tag', ['Unknown'])[0]
            product = feature.qualifiers.get('product', ['Unknown'])[0]
            function = feature.qualifiers.get('function', ['Unknown'])[0]
            phenotype = feature.qualifiers.get('phenotype', ['Unknown'])[0]
            regulatory_class = feature.qualifiers.get('regulatory_class', ['Unknown'])[0]
            note = feature.qualifiers.get('note', ['Unknown'])[0]
            
            # Append gene information to the list
            gene_data.append([gene_name, location, locus_tag, product, function, phenotype, regulatory_class, note])


# Create a DataFrame from the collected gene data
column_names = ["Gene", "Location", "Locus Tag", "Product", "Function", "Phenotype", "Regulatory Class", "Note"]
gene_df = pd.DataFrame(gene_data, columns=column_names)

# Print the DataFrame
print(gene_df)




Empty DataFrame
Columns: [Gene, Location, Locus Tag, Product, Function, Phenotype, Regulatory Class, Note]
Index: []


In [1]:
import re
import pandas as pd
from Bio import Entrez, SeqIO

# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"

# Define the organism name for the genome of interest
organism_name = "Caulobacter vibrioides NA1000"

# Search for the organism and retrieve its genome data
search_query = f"{organism_name}[Organism] AND complete genome[Title]"
handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=1)
record = Entrez.read(handle)
if record["Count"] == "0":
    print(f"No genome found for {organism_name}.")
    exit()
genome_accession = record["IdList"][0]
handle.close()

# Fetch the genome data using the accession number
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=genome_accession)
genome_record = SeqIO.read(handle, "genbank")
handle.close()

# Initialize empty lists to store gene names that meet and do not meet the criteria
gene_names_meet_criteria = []
gene_names_do_not_meet_criteria = []

# Search for genes with names matching the pattern "cpa" or "tad"
gene_pattern = re.compile(r'.*(cpa|tad).*', re.IGNORECASE)

for feature in genome_record.features:
    try:
        if feature.type in ["CDS", "gene"]:
            # Some features might not have a gene name
            gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]
            if gene_pattern.match(gene_name):
                gene_names_meet_criteria.append(gene_name)
            else:
                gene_names_do_not_meet_criteria.append(gene_name)
    except Exception as e:
        print(f"Error processing feature: {e}")

# Create dataframes for the lists of gene names
meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_meet_criteria})
do_not_meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_do_not_meet_criteria})

# Print the dataframes or lists
print("Gene Names that Meet the Criteria:")
print(meet_criteria_df)

print("\nGene Names that Do Not Meet the Criteria:")
print(do_not_meet_criteria_df)




Gene Names that Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []

Gene Names that Do Not Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []


In [2]:
## With more print statements 

import re
import pandas as pd
from Bio import Entrez, SeqIO

# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"

# Define the organism name for the genome of interest
organism_name = "Caulobacter vibrioides NA1000"

# Search for the organism and retrieve its genome data
search_query = f"{organism_name}[Organism] AND complete genome[Title]"
handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=1)
record = Entrez.read(handle)
if record["Count"] == "0":
    print(f"No genome found for {organism_name}.")
    exit()
genome_accession = record["IdList"][0]
handle.close()

# Fetch the genome data using the accession number
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=genome_accession)
genome_record = SeqIO.read(handle, "genbank")
handle.close()

# Print basic genome information
print(f"Genome Accession: {genome_accession}")
print(f"Genome Size: {len(genome_record)}")

# Initialize empty lists to store gene names
gene_names_meet_criteria = []
gene_names_do_not_meet_criteria = []

# Search for genes with names matching the pattern "cpa" or "tad"
gene_pattern = re.compile(r'.*(cpa|tad).*', re.IGNORECASE)

for feature in genome_record.features:
    try:
        # Print each feature type for debugging
        print(f"Processing feature type: {feature.type}")

        if feature.type in ["CDS", "gene"]:
            gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]

            # Print each gene name for debugging
            print(f"Gene Name: {gene_name}")

            if gene_pattern.match(gene_name):
                gene_names_meet_criteria.append(gene_name)
            else:
                gene_names_do_not_meet_criteria.append(gene_name)
    except Exception as e:
        print(f"Error processing feature: {e}")

# Create dataframes for the lists of gene names
meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_meet_criteria})
do_not_meet_criteria_df = pd.DataFrame({"Gene Names": gene_names_do_not_meet_criteria})

# Print the dataframes or lists
print("Gene Names that Meet the Criteria:")
print(meet_criteria_df)

print("\nGene Names that Do Not Meet the Criteria:")
print(do_not_meet_criteria_df)


Genome Accession: 221232939
Genome Size: 4042929
Processing feature type: source
Gene Names that Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []

Gene Names that Do Not Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []


In [4]:
## Prints to understand what is going on 
import re
import pandas as pd
from Bio import Entrez, SeqIO

# Set your email address for compliance
Entrez.email = "matttheodore@tamu.edu"

# Define the organism name for the genome of interest
organism_name = "Caulobacter vibrioides NA1000"

# Search for the organism and retrieve its genome data
search_query = f"{organism_name}[Organism] AND complete genome[Title]"
handle = Entrez.esearch(db="nucleotide", term=search_query, retmax=1)
record = Entrez.read(handle)
if record["Count"] == "0":
    print(f"No genome found for {organism_name}.")
    exit()
genome_accession = record["IdList"][0]
handle.close()

# Fetch the genome data using the accession number
handle = Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=genome_accession)
genome_record = SeqIO.read(handle, "genbank")
handle.close()

# Print basic genome information
print(f"Genome Accession: {genome_accession}")
print(f"Genome Size: {len(genome_record)}")

# Initialize empty lists to store gene names
gene_names_meet_criteria = []
gene_names_do_not_meet_criteria = []

# Search for genes with names matching the pattern "cpa" or "tad"
gene_pattern = re.compile(r'.*(cpa|tad).*', re.IGNORECASE)

# Counter for debugging
feature_count = 0

for feature in genome_record.features:
    feature_count += 1

    try:
        # Print each feature type for debugging
        print(f"Processing feature {feature_count}: {feature.type}")

        if feature.type in ["CDS", "gene"]:
            gene_name = feature.qualifiers.get("gene", ["Unknown"])[0]
            print(f"Gene Name: {gene_name}")

            if gene_pattern.match(gene_name):
                gene_names_meet_criteria.append(gene_name)
            else:
                gene_names_do_not_meet_criteria.append(gene_name)

        # Print qualifiers for the first few features for inspection
        if feature_count <= 5:
            print(f"Qualifiers for feature {feature_count}: {feature.qualifiers}")

    except Exception as e:
        print(f"Error processing feature {feature_count}: {e}")

# Print the dataframes or lists
print("Gene Names that Meet the Criteria:")
print(meet_criteria_df)

print("\nGene Names that Do Not Meet the Criteria:")
print(do_not_meet_criteria_df)



Genome Accession: 221232939
Genome Size: 4042929
Processing feature 1: source
Qualifiers for feature 1: OrderedDict([('organism', ['Caulobacter vibrioides NA1000']), ('mol_type', ['genomic DNA']), ('strain', ['NA1000']), ('db_xref', ['taxon:565050'])])
Gene Names that Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []

Gene Names that Do Not Meet the Criteria:
Empty DataFrame
Columns: [Gene Names]
Index: []
