# Jupyter notebook that loops (download -> process -> delete) over every RefSeq genome of a given domain of life:
# https://ftp.ncbi.nlm.nih.gov/genomes/refseq/

In [20]:
import pandas as pd
import requests
from collections import defaultdict
import gzip
from Bio import SeqIO
import os
import shutil
import math
from collections import Counter
from EntropyHub import ApEn
import numpy as np
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

### Test this notebook with the assembly_summary_test.txt in Google Drive

In [5]:
# Read the assembly summary table (tab-separated text) from the local working directory.
# NOTE(review): local_directory is an absolute, machine-specific path; it is also where
# the later cells write downloads and results, so adjust it before running elsewhere.
local_directory = '/Users/celia/Desktop/biotokens/ncbi/assembly'
file_name = 'assembly_summary_orgs.txt'
file_path = os.path.join(local_directory, file_name)
# comment='#' tells pandas to ignore everything from a '#' to the end of the line, so
# lines starting with '#' are skipped entirely.
# NOTE(review): a '#' inside a data field would also cut that row short -- confirm that
# no field (e.g. strain/isolate names) contains one.
df = pd.read_csv(file_path, sep='\t', comment='#')

In [6]:
# Show the columns that were loaded so the names used below can be checked by eye
print("Column names in the dataframe:", df.columns, sep="\n")

# Names of the dataframe columns that the processing loop reads from each row
assembly_accession_col = 'assembly_accession'
assembly_level_col = 'assembly_level'
ftp_path_col = 'ftp_path'

Column names in the dataframe:
Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'refseq_category', 'taxid', 'species_taxid', 'organism_name',
       'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
       'release_type', 'genome_rep', 'seq_rel_date', 'asm_name',
       'asm_submitter', 'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path',
       'excluded_from_refseq', 'relation_to_type_material',
       'asm_not_live_date', 'assembly_type', 'group', 'genome_size',
       'genome_size_ungapped', 'gc_percent', 'replicon_count',
       'scaffold_count', 'contig_count', 'annotation_provider',
       'annotation_name', 'annotation_date', 'total_gene_count',
       'protein_coding_gene_count', 'non_coding_gene_count', 'pubmed_id'],
      dtype='object')


In [7]:


# Function to convert DNA sequence to numerical values
def dna_to_numerical(dna_seq):
    """Encode a DNA sequence as a list of integers (A=1, C=2, G=3, T=4).

    Any symbol that is not an upper-case A, C, G or T (lower-case bases,
    'N', gaps, ...) is silently dropped, so the result can be shorter than
    the input.
    """
    codes = dict(zip('ACGT', range(1, 5)))
    encoded = []
    for base in dna_seq:
        code = codes.get(base)
        if code is not None:
            encoded.append(code)
    return encoded

def shannon_entropy(s):
    """Return the Shannon entropy, in bits per symbol, of the sequence `s`.

    Each distinct symbol contributes -p * log2(p), where p is the symbol's
    relative frequency in `s`. An empty input yields 0.
    """
    symbol_counts = Counter(s)

    def weighted_log(count):
        # Relative frequency of one symbol times its base-2 logarithm
        p = count / len(s)
        return p * math.log2(p)

    return -sum(map(weighted_log, symbol_counts.values()))

# Example usage: sanity-check shannon_entropy on a short string made of
# 3 x 'a', 4 x 'b' and 2 x 'c' (the recorded output is about 1.53 bits per symbol)
string = "aaabbbbcc"
entropy = shannon_entropy(string)
print(f"Shannon entropy of the string '{string}' is: {entropy}")

Shannon entropy of the string 'aaabbbbcc' is: 1.5304930567574824


In [30]:
def read_gffs(gff_file_path, genome_file_path, output_file_path, biotype = 'CDS'):
    """Extract the sequences of one feature type from a genome and save them as FASTA.

    Every GFF row whose type column (column 3) equals `biotype` is collected and
    grouped by its `Parent=` attribute. The segments of each parent are cut out of
    the genome, joined in transcript order, and written as one FASTA record.

    Parameters
    ----------
    gff_file_path : str
        Path of the uncompressed GFF annotation file.
    genome_file_path : str
        Path of the uncompressed genome FASTA file; its record ids must match
        the sequence ids in GFF column 1.
    output_file_path : str
        Path of the FASTA file to write.
    biotype : str
        Feature type to extract (GFF column 3), e.g. 'CDS' or 'ncRNA'.

    Returns
    -------
    tuple
        (True, number of records written).

    Raises
    ------
    KeyError
        If a selected feature refers to a sequence id that is not in the genome file.

    Notes
    -----
    Features without a `Parent=` attribute are ignored.
    """
    # Load genome sequence, keyed by record id
    genome_dict = SeqIO.to_dict(SeqIO.parse(genome_file_path, "fasta"))

    # Feature segments (seq_id, start, end, strand) grouped by their Parent ID
    biotypes_by_parent = defaultdict(list)

    # Parse the GFF file
    with open(gff_file_path) as file:
        for line in file:
            if line.startswith("#"):
                continue
            parts = line.strip().split('\t')
            # Blank lines and embedded FASTA lines do not have the nine GFF columns
            if len(parts) < 9 or parts[2] != biotype:
                continue

            seq_id = parts[0]
            start = int(parts[3]) - 1  # GFF is 1-based, converting to 0-based
            end = int(parts[4])        # inclusive 1-based end == exclusive 0-based end
            strand = parts[6]

            # Extract Parent ID (split once so a value containing '=' stays intact)
            parent_id = None
            for attribute in parts[8].split(';'):
                if attribute.startswith("Parent="):
                    parent_id = attribute.split('=', 1)[1]
                    break

            if parent_id:
                biotypes_by_parent[parent_id].append((seq_id, start, end, strand))

    def join_biotypes(biotypes_list):
        """Concatenate the segments of one parent; return (sequence, sequence id)."""
        ordered = sorted(biotypes_list, key=lambda segment: segment[1])  # by start position
        sequence_id = ordered[-1][0]
        # A minus-strand feature is read from its highest coordinate downwards, so its
        # reverse-complemented segments must be joined in descending genomic order.
        if all(segment[3] == '-' for segment in ordered):
            ordered.reverse()
        joined_seq = Seq('')
        for seq_id, start, end, strand in ordered:
            seq = genome_dict[seq_id].seq[start:end]
            if strand == '-':
                seq = seq.reverse_complement()
            joined_seq += seq
        return joined_seq, sequence_id

    # Build one record per Parent ID
    # NOTE(review): the record id is the source sequence id, so every record from the
    # same chromosome/contig shares one FASTA id -- confirm that this is intended.
    biotype_records = []
    for parent_id, biotypes_list in biotypes_by_parent.items():
        joined_biotypes_seq, seq_id = join_biotypes(biotypes_list)
        biotype_record = SeqRecord(joined_biotypes_seq, id=seq_id, description=f"biotype: {biotype}")
        biotype_records.append(biotype_record)

    # Save to the FASTA file named by the caller
    SeqIO.write(biotype_records, output_file_path, "fasta")
    return True, len(biotype_records)

In [35]:
# For every "Complete Genome" assembly: download and decompress the genome FASTA and
# the GFF annotation, extract the sequences of each biotype with read_gffs, record how
# many sequences were found, and finally export the counts next to the assembly
# metadata as an Excel file.

# Kept defined for compatibility, but not filled by this cell: the values that used
# to be appended here (`sequence_count`, `genome_ent`) are not computed anywhere in
# the visible notebook, so appending them failed on a fresh kernel.
sequence_counts = []
ents = []
biotypes = ['CDS', 'ncRNA']
biotypes_sequence_count_dict = {}
# Set to True to remove the decompressed .fna/.gff files once an assembly is processed
DELETE_DOWNLOADS = False
# Seconds to wait for the server before a download is aborted
DOWNLOAD_TIMEOUT = 60

# Check that every column read below exists
required_cols = [assembly_level_col, ftp_path_col, assembly_accession_col]
missing_cols = [col for col in required_cols if col not in df.columns]
if not missing_cols:
    # Only complete genomes are processed and reported
    df_complete_genome = df[df[assembly_level_col] == 'Complete Genome'].copy()

    for _, row in df_complete_genome.iterrows():
        assembly_accession = row[assembly_accession_col]
        ftp_path = row[ftp_path_col]

        # Rows without a download location (missing value or the 'na' placeholder)
        # cannot be processed; they keep empty counts in the final table.
        if not isinstance(ftp_path, str) or '://' not in ftp_path:
            print(f"Skipping assembly_accession {assembly_accession}: no usable ftp_path ({ftp_path!r})")
            continue
        # requests cannot speak FTP; the same paths are served over HTTPS
        if ftp_path.startswith('ftp://'):
            ftp_path = 'https://' + ftp_path[len('ftp://'):]
        ftp_path = ftp_path.rstrip('/')

        # The last path component is the basename shared by all files of the assembly
        assembly_name = ftp_path.split('/')[-1]
        fna_gz = f"{assembly_name}_genomic.fna.gz"
        gff_gz = f"{assembly_name}_genomic.gff.gz"
        fna_local_path = os.path.join(local_directory, fna_gz[:-len('.gz')])
        gff_local_path = os.path.join(local_directory, gff_gz[:-len('.gz')])

        # Print assembly_accession to track progress
        print(f"Analyzing assembly_accession: {assembly_accession}")

        # Download both files, decompressing on the fly while streaming to disk
        for remote_name, local_file_path in [(fna_gz, fna_local_path), (gff_gz, gff_local_path)]:
            with requests.get(f"{ftp_path}/{remote_name}", stream=True, timeout=DOWNLOAD_TIMEOUT) as response:
                response.raise_for_status()  # Check for request errors
                with open(local_file_path, 'wb') as local_file:
                    with gzip.GzipFile(fileobj=response.raw) as f:
                        shutil.copyfileobj(f, local_file)

        biotypes_sequence_count_dict[assembly_accession] = {}
        for biotype in biotypes:
            output_file = os.path.join(local_directory, f"{assembly_name}_joined_{biotype}_sequences.fasta")
            # Pass the per-biotype FASTA path; `output_file_path` is the Excel path and
            # is only assigned at the end of this cell.
            result, biotype_sequences_count = read_gffs(gff_local_path, fna_local_path, output_file, biotype)
            print(assembly_name, biotype, result, biotype_sequences_count)
            biotypes_sequence_count_dict[assembly_accession][biotype] = biotype_sequences_count

        # Optionally clean up the decompressed downloads
        if DELETE_DOWNLOADS:
            for local_file_path in (fna_local_path, gff_local_path):
                os.remove(local_file_path)

    # One row per assembly, one column per biotype; the accession becomes a column
    dict_df = (
        pd.DataFrame(biotypes_sequence_count_dict)
        .T
        .rename_axis(assembly_accession_col)
        .reset_index()
    )

    # Attach the counts to the metadata of the complete genomes
    merged_df = pd.merge(df_complete_genome, dict_df, on=assembly_accession_col, how='left')

    # Save the merged dataframe to an Excel file in the working directory
    output_file_path = os.path.join(local_directory, 'ncbi_assembly_genom.xlsx')
    merged_df.to_excel(output_file_path, index=False)

    print(f"Excel file created: {output_file_path}")
else:
    print(f"Expected columns not found in the dataframe: {missing_cols}")

Analyzing assembly_accession: GCF_000005845.2
GCF_000005845.2_ASM584v2 CDS True 4305
GCF_000005845.2_ASM584v2 ncRNA True 98
Analyzing assembly_accession: GCF_000006805.1
GCF_000006805.1_ASM680v1 CDS True 2697
GCF_000006805.1_ASM680v1 ncRNA True 0
Analyzing assembly_accession: GCF_000002765.6
GCF_000002765.6_GCA_000002765 CDS True 5354
GCF_000002765.6_GCA_000002765 ncRNA True 55
Analyzing assembly_accession: GCF_000146045.2
GCF_000146045.2_R64 CDS True 6027
GCF_000146045.2_R64 ncRNA True 22
Excel file created: /Users/celia/Desktop/biotokens/ncbi/assembly/ncbi_assembly_genom.xlsx
