# Jupyter Notebook to loop (download - process - delete) through all genomes in RefSeq for a given domain of life:
# https://ftp.ncbi.nlm.nih.gov/genomes/refseq/

In [9]:
import pandas as pd
import requests
import gzip
from Bio import SeqIO
import os
import math
from collections import Counter
from EntropyHub import ApEn
import numpy as np

### Test this notebook with the assembly_summary_test.txt in Google Drive

In [10]:
# Then, read the assembly_summary.txt file.
# comment='#' is deliberately not used: pandas would cut every data line at the
# first embedded '#', and it would discard a header row written as
# '#assembly_accession...'. The header line is located explicitly instead.
file_path = '/Users/celia/Desktop/genomes/test/assembly_summary_test.txt'

# Index of the header row: the first line that either does not start with '#'
# or is the '#'-prefixed column header itself.
header_row = 0
with open(file_path, encoding='utf-8', errors='replace') as summary_handle:
    for line_number, line in enumerate(summary_handle):
        if not line.startswith('#') or line.lstrip('# ').startswith('assembly_accession'):
            header_row = line_number
            break

df = pd.read_csv(file_path, sep='\t', skiprows=header_row)
# Drop the leading '#' (and stray spaces) a '#'-prefixed header leaves on column names
df.columns = df.columns.str.lstrip('# ').str.strip()

In [11]:
# Show the parsed header so the column names used below can be checked by eye
print("Column names in the dataframe:", df.columns, sep="\n")

# Columns of the assembly summary that the analysis relies on
assembly_accession_col = 'assembly_accession'
assembly_level_col = 'assembly_level'
ftp_path_col = 'ftp_path'

Column names in the dataframe:
Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name',
       'asm_submitter', 'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'asm_not_live_date', 'assembly_type', 'group', 'genome_size',
       'genome_size_ungapped', 'gc_percent', 'replicon_count',
       'scaffold_count', 'contig_count', 'annotation_provider',
       'annotation_name', 'annotation_date', 'total_gene_count',
       'protein_coding_gene_count', 'non_coding_gene_count', 'pubmed_id'],
      dtype='object')


In [12]:
def shannon_entropy(s):
    """Return the Shannon entropy, in bits per symbol, of a sequence.

    Parameters
    ----------
    s : str, list or other sized iterable of hashable symbols
        The sequence whose symbol distribution is measured.

    Returns
    -------
    float
        -sum(p * log2(p)) over the distinct symbols of ``s``, where ``p`` is
        the relative frequency of each symbol. Returns 0.0 for an empty
        sequence.
    """
    total = len(s)
    # An empty sequence has no symbols; return 0.0 instead of dividing by zero
    if total == 0:
        return 0.0
    # Count the frequency of each symbol in the sequence
    frequency = Counter(s)
    # Calculate the probability of each symbol
    probabilities = [freq / total for freq in frequency.values()]
    # Calculate the Shannon entropy
    entropy = -sum(p * math.log2(p) for p in probabilities)
    return entropy

# Example usage: entropy of a 9-character string over three symbols
# (a occurs 3 times, b 4 times, c 2 times)
string = "aaabbbbcc"
entropy = shannon_entropy(string)
print(f"Shannon entropy of the string '{string}' is: {entropy}")

Shannon entropy of the string 'aaabbbbcc' is: 1.5304930567574824


In [13]:
# Accumulators for the new columns: one genome length and one entropy value
# are appended for every "Complete Genome" row that is processed
lengths, ents = [], []

# Function to convert DNA sequence to numerical values
def dna_to_numerical(dna_seq):
    """Encode a DNA sequence as a list of integers.

    A/C/G/T map to 1/2/3/4. Lowercase bases (used for soft-masked regions in
    FASTA files) are encoded the same way as their uppercase counterparts
    instead of being dropped. Any other symbol (N, IUPAC ambiguity codes,
    gaps) is skipped.

    Parameters
    ----------
    dna_seq : iterable of single-character strings
        For example a ``str`` or the ``seq`` of a parsed FASTA record.

    Returns
    -------
    list of int
        One value per recognised nucleotide, in sequence order.
    """
    mapping = {'A': 1, 'C': 2, 'G': 3, 'T': 4,
               'a': 1, 'c': 2, 'g': 3, 't': 4}
    return [mapping[nuc] for nuc in dna_seq if nuc in mapping]

# Check if the expected columns exist
if assembly_level_col in df.columns and ftp_path_col in df.columns:
    # Iterate over each row in the dataframe; only "Complete Genome"
    # assemblies are downloaded and analysed
    for index, row in df.iterrows():
        if row[assembly_level_col] != 'Complete Genome':
            continue

        assembly_accession = row[assembly_accession_col]
        ftp_path = row[ftp_path_col]
        # requests only ships HTTP(S) transport adapters, so an ftp:// path is
        # rewritten to the https:// form of the same NCBI location
        if ftp_path.startswith('ftp://'):
            ftp_path = 'https://' + ftp_path[len('ftp://'):]
        file_name = ftp_path.split('/')[-1]
        gz_file_url = f"{ftp_path}/{file_name}_genomic.fna.gz"
        gz_file_path = f"{file_name}_genomic.fna.gz"

        # Print assembly_accession to track progress
        print(f"Analyzing assembly_accession: {assembly_accession}")

        genome_length = 0
        genome_ent = 0
        numerical_sequence = []
        try:
            # Download the gz file; fail loudly on an HTTP error instead of
            # saving the error page and crashing later inside gzip
            with requests.get(gz_file_url, stream=True, timeout=60) as response:
                response.raise_for_status()
                with open(gz_file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)

            # Sum the length of every FASTA record (chromosomes, plasmids, ...)
            # and collect the encoded nucleotides of the whole genome
            with gzip.open(gz_file_path, 'rt') as gz_file:
                for record in SeqIO.parse(gz_file, 'fasta'):
                    genome_length += len(record.seq)
                    numerical_sequence.extend(dna_to_numerical(record.seq))
        finally:
            # Clean up the downloaded file even if download or parsing failed
            if os.path.exists(gz_file_path):
                os.remove(gz_file_path)

        # Entropy is computed once over the complete genome; it stays 0 when
        # no A/C/G/T nucleotide was found
        if numerical_sequence:
            genome_ent = shannon_entropy(numerical_sequence)

        # Append the calculated values to the lists (same order as the
        # "Complete Genome" rows of the dataframe)
        lengths.append(genome_length)
        ents.append(genome_ent)

    # Filter the dataframe to only include rows with "Complete Genome"
    df_complete_genome = df[df[assembly_level_col] == 'Complete Genome'].copy()

    # Add the new columns to the filtered dataframe
    df_complete_genome['length'] = lengths
    df_complete_genome['Shannon'] = ents

    # Save the filtered dataframe to an Excel file
    output_file_path = '/Users/celia/Desktop/genomes/test/assembly_summary_with_lengths_and_entropy.xlsx'
    df_complete_genome.to_excel(output_file_path, index=False)

    print(f"Excel file created: {output_file_path}")
else:
    print(f"Expected columns '{assembly_level_col}' and/or '{ftp_path_col}' not found in the dataframe.")

Analyzing assembly_accession: GCF_900128725.1
Analyzing assembly_accession: GCF_017068195.1
Excel file created: /Users/celia/Desktop/genomes/test/assembly_summary_with_lengths_and_entropy.xlsx


## Once everything is working, you can download full assembly summary files from NCBI.
## For example, for bacteria:

## wget ftp://ftp.ncbi.nih.gov/genomes/refseq/bacteria/assembly_summary.txt