In [10]:
from Bio import Entrez
import pandas as pd
import numpy as np
import ftplib
import os 

# Set your email (required by NCBI Entrez)
Entrez.email = "Your.Name.Here@example.org"

# Step 1: Search for assemblies related to "Pleurotus"
record = Entrez.read(Entrez.esearch(db="assembly", term="Pleurotus", retmax=44)) # i have checked that there are 44 records

# Step 2: Fetch summaries for all assembly IDs
ids = ",".join(record['IdList'])
summaries = Entrez.read(Entrez.esummary(db="assembly", id=ids))

# Step 3: Prepare to extract the relevant fields into a list
assembly_data = []

# Step 4: Loop through all summaries to extract the required fields
for summary in summaries['DocumentSummarySet']['DocumentSummary']:
    
    # Extract relevant fields
    accession = summary.get('AssemblyAccession', 'N/A')
    assembly_name = summary.get('AssemblyName', 'N/A')
    organism = summary.get('Organism', 'N/A')
    species_taxid = summary.get('SpeciesTaxid', 'N/A')
    
    # Extract Infraspecies Sub_value from Biosource → InfraspeciesList
    sub_value = 'N/A'
    infraspecies_list = summary.get('Biosource', {}).get('InfraspeciesList', [])
    if infraspecies_list:
        sub_value = infraspecies_list[0].get('Sub_value', 'N/A')
        

    annotation = 0
    reference = 0
    property_list = summary.get('PropertyList', [])
    
    if "has_annotation" in property_list:
        annotation = 1
    if "reference" in property_list:
        reference = 1
        
    submitter_organization = summary.get('SubmitterOrganization', 'N/A')

    # Extract Meta information
    meta_data = summary.get('Meta', '')
    total_length = 'N/A'
    chromosome_count = 'N/A'
    scaffold_count = 'N/A'
    
    if meta_data:
        total_length = str(round(float(meta_data.split("<Stat category=\"total_length\" sequence_tag=\"all\">")[1].split("</Stat>")[0]) / 1000000, 2))
        chromosome_count = meta_data.split("<Stat category=\"chromosome_count\" sequence_tag=\"all\">")[1].split("</Stat>")[0]
        scaffold_count = meta_data.split("<Stat category=\"scaffold_count\" sequence_tag=\"all\">")[1].split("</Stat>")[0]
    
    assembly_status = summary.get('AssemblyStatus', 'N/A')
    seq_release_date = summary.get('SeqReleaseDate', 'N/A')
    wgs = summary.get('WGS', 'N/A')
    
    # Extract contig/scaffold N50 values
    contig_n50 = summary.get('ContigN50', 'N/A')
    scaffold_n50 = summary.get('ScaffoldN50', 'N/A')
    
    # Extract BioProject and BioSample accession numbers
    bioproject = summary.get('GB_BioProjects', [{}])[0].get('BioprojectAccn', 'N/A')
    biosample = summary.get('BioSampleAccn', 'N/A')
    
    # Append the extracted data to the list
    assembly_data.append([
        accession, assembly_name, organism, species_taxid, sub_value, annotation, reference,
        submitter_organization, total_length, chromosome_count, scaffold_count, 
        assembly_status, seq_release_date, wgs, contig_n50, scaffold_n50, 
        bioproject, biosample
    ])

# Step 5: Convert the list into a DataFrame
columns = [
    "Assembly Accession", "Assembly Name", "Organism", "Species Taxid", "Strain",
    "Annotation", "Reference", "Submitter Organization", "Total Length (Mb)", "Chromosome Count", 
    "Scaffold Count", "Assembly Status", "Seq Release Date", "WGS", "Contig N50", 
    "Scaffold N50", "BioProject Accn", "BioSample Accn"
]


df = pd.DataFrame(assembly_data, columns=columns) #convert the list into a DataFrame

df['Assembly ID'] = record['IdList'] #add column record['IdList']
df = df[['Assembly ID'] + [col for col in df.columns if col != 'Assembly ID']] # Reorder columns to put 'Assembly ID' first
df = df.sort_values(by='Species Taxid') # Sort the DataFrame by 'Species Taxid'

df = df.drop_duplicates(subset='Assembly Accession')



comment: i tried to parse the fasta files for each assembly w/ `epost` and `efetch` but i couldn't, so i stop trying. additionally, from `https://www.biostars.org/p/141581/` : it seems to me that the assembly database may not be fully implemented. If you look at the EFetch help, there aren't any options for db=assembly

In [11]:
display(df)

Unnamed: 0,Assembly ID,Assembly Accession,Assembly Name,Organism,Species Taxid,Strain,Annotation,Reference,Submitter Organization,Total Length (Mb),Chromosome Count,Scaffold Count,Assembly Status,Seq Release Date,WGS,Contig N50,Scaffold N50,BioProject Accn,BioSample Accn
0,21541041,GCA_036873075.1,ASM3687307v1,Pleurotus giganteus (basidiomycete fungi),1156456,zhudugu2,0,0,Shanghai Academy of Agricultural Sciences,40.04,0,27,Contig,2024/02/26 00:00,JAYMYW01,2610276,2610276,PRJNA1062731,SAMN39313592
3,18902031,GCA_032158425.1,ASM3215842v1,Pleurotus giganteus (basidiomycete fungi),1156456,PG46,0,1,Chinese Academy of Tropical Agricultural Sciences,40.07,0,16,Contig,2023/09/28 00:00,JAPEHR01,2887167,2887167,PRJNA896531,SAMN31552355
31,1824831,GCA_003313735.1,ASM331373v1,Pleurotus platypus (basidiomycete fungi),2015914,MG11,0,1,Kunming University of Science and Technology,39.97,0,6484,Scaffold,2018/07/11 00:00,QLOV01,59239,62119,PRJNA454572,SAMN09010680
28,2828631,GCA_005298045.1,ASM529804v1,Pleurotus ostreatoroseus (basidiomycete fungi),2048520,DPUA 1720,1,0,Amazonian macromicets: knowing their diversity...,38.59,0,619,Contig,2019/05/13 00:00,SWBT01,180995,180995,PRJNA507735,SAMN10503127
2,18902041,GCA_032158505.1,ASM3215850v1,Pleurotus pulmonarius (basidiomycete fungi),28995,CCMSSC 04423,0,0,Zhejiang Academy of Agricultural Sciences,42.32,0,111,Contig,2023/09/28 00:00,VUOF01,2663750,2663750,PRJNA561278,SAMN08055039
25,7064321,GCA_012979565.1,PM_ss13_v1,Pleurotus pulmonarius (basidiomycete fungi),28995,PM_ss13,1,0,Academia Sinica,42.55,0,17,Contig,2020/05/07 00:00,SJDA01,3242126,3242126,PRJNA523384,SAMN10975410
24,7064341,GCA_012980525.1,ASM1298052v1,Pleurotus pulmonarius (basidiomycete fungi),28995,PM_ss2,1,0,Academia Sinica,39.24,0,23,Contig,2020/05/07 00:00,SJKE01,3175356,3175356,PRJNA523385,SAMN10975466
23,7064351,GCA_012980535.1,ASM1298053v1,Pleurotus pulmonarius (basidiomycete fungi),28995,PM_ss5,1,1,Academia Sinica,39.87,0,23,Contig,2020/05/07 00:00,SJKF01,3374720,3374720,PRJNA523390,SAMN10975498
8,16437751,GCA_029747585.1,Pleurotus_djamor_reassembly_enz,Pleurotus djamor (basidiomycete fungi),34470,MPG-05,1,1,TU Dresden - IHI Zittau,62.54,0,2603,Scaffold,2023/04/14 00:00,JAPEVF01,51933,52533,PRJNA896869,SAMN31571033
17,10737191,GCA_019677325.1,ASM1967732v1,Pleurotus cornucopiae (cornucopia mushroom),5321,,1,0,Institute of Agricultural Resources and Region...,35.08,0,425,Scaffold,2021/08/18 00:00,WQMT01,186309,331558,PRJNA593973,SAMN13497034


#Calculate Z-scores for 'Total Length (Mb)' and filter out rows with Z-scores
df['Total Length (Mb)'] = pd.to_numeric(df['Total Length (Mb)'], errors='coerce') #convert to numeric, force errors to NaN
mean, std_dev = df['Total Length (Mb)'].mean(), df['Total Length (Mb)'].std() #calc mean and std_dev
df['Z-Score'] = (df['Total Length (Mb)'] - mean) / std_dev #calc z-scores
df = df[df['Z-Score'].abs() < 1].drop(columns='Z-Score') #Filter out rows where the Z-score



In [2]:
df_weighted = df.copy()

# Step 1: Map 'Annotation' to 'annotation_weight'
annotation_mapping = {0: 1, 1: 2}
df_weighted['annotation_weight'] = df_weighted['Annotation'].map(annotation_mapping)

# Step 2: Map 'Reference' to 'reference_weight'
reference_mapping = {0: 1, 1: 2}
df_weighted['reference_weight'] = df_weighted['Reference'].map(reference_mapping)

# Step 3: Map 'Assembly Status' to 'status_weight'
status_mapping = {"Contig": 1, "Scaffold": 2, "Chromosome": 3}
df_weighted['status_weight'] = df_weighted['Assembly Status'].map(status_mapping)

# Step 4: Create 'total_weight' column
df_weighted['total_weight'] = (
    100 * df_weighted['annotation_weight'] +
    10 * df_weighted['reference_weight'] +
    1 * df_weighted['status_weight']
)

# Split df_weighted into groups based on 'Species Taxid' and keep the row with the highest 'total_weight' value for each group
df_keep = df_weighted.loc[df_weighted.groupby('Species Taxid')['total_weight'].idxmax()]

# Display the resulting DataFrame
display(df_keep)



# brief xplanation of the "weighted" thing: we have 44 "pleurotus" assemblies, and we want to keep only one per species. 
# so we assign a weight to each assembly based on the presence of annotation, reference, and the assembly status.
# we then keep the assembly with the highest weight for each species.
# "weight logic": annotation = 100, reference = 10, status = 1. 
# check obsidian canvas named "2210" on daily folder for more details.



Unnamed: 0,Assembly ID,Assembly Accession,Assembly Name,Organism,Species Taxid,Strain,Annotation,Reference,Submitter Organization,Total Length (Mb),...,Seq Release Date,WGS,Contig N50,Scaffold N50,BioProject Accn,BioSample Accn,annotation_weight,reference_weight,status_weight,total_weight
3,18902031,GCA_032158425.1,ASM3215842v1,Pleurotus giganteus (basidiomycete fungi),1156456,PG46,0,1,Chinese Academy of Tropical Agricultural Sciences,40.07,...,2023/09/28 00:00,JAPEHR01,2887167,2887167,PRJNA896531,SAMN31552355,1,2,1,121
31,1824831,GCA_003313735.1,ASM331373v1,Pleurotus platypus (basidiomycete fungi),2015914,MG11,0,1,Kunming University of Science and Technology,39.97,...,2018/07/11 00:00,QLOV01,59239,62119,PRJNA454572,SAMN09010680,1,2,2,122
28,2828631,GCA_005298045.1,ASM529804v1,Pleurotus ostreatoroseus (basidiomycete fungi),2048520,DPUA 1720,1,0,Amazonian macromicets: knowing their diversity...,38.59,...,2019/05/13 00:00,SWBT01,180995,180995,PRJNA507735,SAMN10503127,2,1,1,211
23,7064351,GCA_012980535.1,ASM1298053v1,Pleurotus pulmonarius (basidiomycete fungi),28995,PM_ss5,1,1,Academia Sinica,39.87,...,2020/05/07 00:00,SJKF01,3374720,3374720,PRJNA523390,SAMN10975498,2,2,1,221
8,16437751,GCA_029747585.1,Pleurotus_djamor_reassembly_enz,Pleurotus djamor (basidiomycete fungi),34470,MPG-05,1,1,TU Dresden - IHI Zittau,62.54,...,2023/04/14 00:00,JAPEVF01,51933,52533,PRJNA896869,SAMN31571033,2,2,2,222
15,10778211,GCA_019677325.2,ASM1967732v2,Pleurotus cornucopiae (cornucopia mushroom),5321,,1,1,Institute of Agricultural Resources and Region...,32.38,...,2021/08/25 00:00,WQMT02,215546,3564128,PRJNA593973,SAMN13497034,2,2,3,223
21,8023241,GCF_014466165.1,ASM1446616v1,Pleurotus ostreatus (oyster mushroom),5322,PC9,1,1,Academia Sinica,34.97,...,2020/09/03 00:00,JACETU01,3500734,3500734,PRJNA647232,SAMN15594696,2,2,1,221
20,8650031,GCA_015484515.1,Pleery1,Pleurotus eryngii (basidiomycete fungi),5323,ATCC:90797,1,0,DOE Joint Genome Institute,44.61,...,2020/11/15 00:00,JADNRH01,87948,241626,PRJNA333100,SAMN05445980,2,1,2,212
39,1344531,GCA_002583695.1,PsmonoN60v_1.0,Pleurotus salmoneostramineus (basidiomycete fu...,64638,N60,0,1,"Applied Bioscience, Kindai University, Faculty...",39.19,...,2017/10/04 00:00,BEWF01,104923,108301,PRJDB6242,SAMD00092845,1,2,2,122
22,7814131,GCA_014058305.1,ASM1405830v1,Pleurotus tuber-regium (basidiomycete fungi),716892,ACCC 50657-18,0,1,Huazhong Agricultural University,35.82,...,2020/08/04 00:00,JACFYU01,2556151,2556151,PRJNA644065,SAMN15447141,1,2,1,121


In [8]:
# give me target_assembly_id and i will list the files in the FTP directory

# Define the target Assembly ID
target_assembly_id = "GCA_005298045.1"  # Replace with your desired Assembly ID

# Step 1: Loop through the summaries and find the target Assembly ID
for summary in summaries["DocumentSummarySet"]["DocumentSummary"]:
    if summary['AssemblyAccession'] == target_assembly_id:
        assembly_name = summary['AssemblyName']
        ftp_path = summary.get('FtpPath_GenBank', None)

        if ftp_path:
            print(f"Listing files for Assembly: {assembly_name} (ID: {target_assembly_id})")

            # Extract the directory path from the FTP link
            ftp_directory = ftp_path.split("ftp://ftp.ncbi.nlm.nih.gov")[1]  # Get the relative directory

            try:
                # Connect to the NCBI FTP server
                ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
                ftp.login()  # Log in as an anonymous user

                # Navigate to the directory where FASTA files are located
                ftp.cwd(ftp_directory)

                # List the files in the directory
                files = ftp.nlst()

                # Print the available files
                print(f"Files available in {ftp_directory}:")
                for file in files:
                    print(f" - {file}")

            except ftplib.error_perm as e:
                print(f"Error accessing FTP directory for {assembly_name}: {e}")

            # Close the FTP connection
            ftp.quit()
            break  # Exit the loop after finding the assembly

else:
    print(f"Assembly ID {target_assembly_id} not found.")


Listing files for Assembly: ASM529804v1 (ID: GCA_005298045.1)
Files available in /genomes/all/GCA/005/298/045/GCA_005298045.1_ASM529804v1:
 - annotation_hashes.txt
 - README.txt
 - GCA_005298045.1_ASM529804v1_assembly_report.txt
 - GCA_005298045.1_ASM529804v1_assembly_stats.txt
 - GCA_005298045.1_ASM529804v1_cds_from_genomic.fna.gz
 - GCA_005298045.1_ASM529804v1_feature_count.txt.gz
 - GCA_005298045.1_ASM529804v1_feature_table.txt.gz
 - GCA_005298045.1_ASM529804v1_genomic.fna.gz
 - GCA_005298045.1_ASM529804v1_genomic.gbff.gz
 - GCA_005298045.1_ASM529804v1_genomic.gff.gz
 - GCA_005298045.1_ASM529804v1_genomic.gtf.gz
 - GCA_005298045.1_ASM529804v1_protein.faa.gz
 - GCA_005298045.1_ASM529804v1_protein.gpff.gz
 - GCA_005298045.1_ASM529804v1_rna_from_genomic.fna.gz
 - GCA_005298045.1_ASM529804v1_translated_cds.faa.gz
 - GCA_005298045.1_ASM529804v1_wgsmaster.gbff.gz
 - assembly_status.txt
 - ani_rpts_removed.txt
 - uncompressed_checksums.txt
 - md5checksums.txt
 - GCA_005298045.1_ASM529804v1

In [9]:
# code chunk access ftp path for a specific assembly id
# creates a folder 'data' in pwd
# creates a folder with the organism name in 'data' folder
# downloads the "*protein.faa.gz" file and "*genomic.fna.gz* file in the organism folder
# for genomic.fna.gz("AssemblyAccession_AssemblyName_genomic.fna.gz", bc there more "*genomic.fna.gz" eding files in some paths)

import ftplib
import os

# Define the target Assembly ID
target_assembly_id = "GCA_005298045.1"  # Replace with your desired Assembly ID

# Step 3: Loop through the summaries and find the target Assembly ID
ftp_path = None
organism = None

for summary in summaries["DocumentSummarySet"]["DocumentSummary"]:
    if summary['AssemblyAccession'] == target_assembly_id:
        ftp_path = summary.get('FtpPath_GenBank', None)
        assembly_name = summary['AssemblyName']
        organism = summary.get('Organism', 'N/A').replace(" ", "_")  # Replace spaces with underscores for folder names

        if ftp_path:
            print(f"Found FTP path for Assembly: {assembly_name} (ID: {target_assembly_id}) - {ftp_path}")
            break
else:
    print(f"Assembly ID {target_assembly_id} not found.")
    exit()  # Exit if the assembly ID was not found

# Step 4: Extract the relative directory from the FTP link
ftp_directory = ftp_path.split("ftp://ftp.ncbi.nlm.nih.gov")[1]

# Create the 'pleurotus_genomes` folder and organism-specific folder if they don't exist
if not os.path.exists('pleurotus_genomes'):
    os.makedirs('pleurotus_genomes')

organism_folder = os.path.join('pleurotus_genomes', organism)
if not os.path.exists(organism_folder):
    os.makedirs(organism_folder)

# Step 5: Connect to the NCBI FTP server and list files
try:
    # Connect to the NCBI FTP server
    ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
    ftp.login()  # Log in as an anonymous user
    
    # Navigate to the directory
    ftp.cwd(ftp_directory)
    
    # List the files in the directory
    files = ftp.nlst()

    # Initialize variables to track found files
    protein_faa_file = None
    genomic_fna_file = None
    
    # Check for the required files
    for file in files:
        if file.endswith("protein.faa.gz"):
            protein_faa_file = file
            print(f"File found: {protein_faa_file}")
        elif file.endswith("genomic.fna.gz"):
            genomic_fna_file = file
            print(f"File found: {genomic_fna_file}")

    # Download the protein.faa.gz file if found
    if protein_faa_file:
        local_filename = os.path.join(organism_folder, protein_faa_file.split('/')[-1])  # Save in organism folder
        with open(local_filename, "wb") as local_file:
            ftp.retrbinary(f"RETR {protein_faa_file}", local_file.write)
            print(f"Downloaded: {local_filename}")
    else:
        print("No file ending with 'protein.faa.gz' found.")

    # Download the genomic.fna.gz file if found, and rename it
    if genomic_fna_file:
        genomic_fna_filename = f"{target_assembly_id}_{assembly_name}_genomic.fna.gz"
        local_filename = os.path.join(organism_folder, genomic_fna_filename)  # Save in organism folder
        with open(local_filename, "wb") as local_file:
            ftp.retrbinary(f"RETR {genomic_fna_file}", local_file.write)
            print(f"Downloaded and renamed to: {local_filename}")
    else:
        print("No file ending with 'genomic.fna.gz' found.")

except ftplib.error_perm as e:
    print(f"FTP error: {e}")

finally:
    # Close the FTP connection
    ftp.quit()

Found FTP path for Assembly: ASM529804v1 (ID: GCA_005298045.1) - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/005/298/045/GCA_005298045.1_ASM529804v1
File found: GCA_005298045.1_ASM529804v1_cds_from_genomic.fna.gz
File found: GCA_005298045.1_ASM529804v1_genomic.fna.gz
File found: GCA_005298045.1_ASM529804v1_protein.faa.gz
File found: GCA_005298045.1_ASM529804v1_rna_from_genomic.fna.gz
Downloaded: data/Pleurotus_ostreatoroseus_(basidiomycete_fungi)/GCA_005298045.1_ASM529804v1_protein.faa.gz
Downloaded and renamed to: data/Pleurotus_ostreatoroseus_(basidiomycete_fungi)/GCA_005298045.1_ASM529804v1_genomic.fna.gz


In [None]:
# next steps
# 1. checked "pleurotus" in mycocosm. nothing afformentioned assembly ids are there.
# 2. if nothing new in these dbs, then add a loop for the assemblies we keep and we proceed w/ downloading the respective files

In [None]:
import ftplib
import os
import pandas as pd  # Assuming you're working with a pandas DataFrame

# Assuming you have a DataFrame `df` with a column of assembly IDs
# Replace 'assembly_id_column' with the actual column name in your DataFrame
assembly_ids = df_keep['Assembly Accession']

# Loop through each assembly ID in the DataFrame
for target_assembly_id in assembly_ids:

    print(f"Processing Assembly ID: {target_assembly_id}")

    # Step 3: Loop through the summaries and find the target Assembly ID
    ftp_path = None
    organism = None

    for summary in summaries["DocumentSummarySet"]["DocumentSummary"]:
        if summary['AssemblyAccession'] == target_assembly_id:
            ftp_path = summary.get('FtpPath_GenBank', None)
            assembly_name = summary['AssemblyName']
            organism = summary.get('Organism', 'N/A').replace(" ", "_")  # Replace spaces with underscores for folder names

            if ftp_path:
                print(f"Found FTP path for Assembly: {assembly_name} (ID: {target_assembly_id}) - {ftp_path}")
                break
    else:
        print(f"Assembly ID {target_assembly_id} not found.")
        continue  # Skip to the next Assembly ID if not found

    # Step 4: Extract the relative directory from the FTP link
    ftp_directory = ftp_path.split("ftp://ftp.ncbi.nlm.nih.gov")[1]

    # Create the 'data' folder and organism-specific folder if they don't exist
    if not os.path.exists('data'):
        os.makedirs('data')

    organism_folder = os.path.join('data', organism)
    if not os.path.exists(organism_folder):
        os.makedirs(organism_folder)

    # Step 5: Connect to the NCBI FTP server and list files
    try:
        # Connect to the NCBI FTP server
        ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
        ftp.login()  # Log in as an anonymous user
        
        # Navigate to the directory
        ftp.cwd(ftp_directory)
        
        # List the files in the directory
        files = ftp.nlst()

        # Initialize variables to track found files
        protein_faa_file = None
        genomic_fna_file = None
        
        # Check for the required files
        for file in files:
            if file.endswith("protein.faa.gz"):
                protein_faa_file = file
                print(f"File found: {protein_faa_file}")
            elif file.endswith("genomic.fna.gz"):
                genomic_fna_file = file
                print(f"File found: {genomic_fna_file}")

        # Download the protein.faa.gz file if found
        if protein_faa_file:
            local_filename = os.path.join(organism_folder, protein_faa_file.split('/')[-1])  # Save in organism folder
            with open(local_filename, "wb") as local_file:
                ftp.retrbinary(f"RETR {protein_faa_file}", local_file.write)
                print(f"Downloaded: {local_filename}")
        else:
            print("No file ending with 'protein.faa.gz' found.")

        # Download the genomic.fna.gz file if found, and rename it
        if genomic_fna_file:
            genomic_fna_filename = f"{target_assembly_id}_{assembly_name}_genomic.fna.gz"
            local_filename = os.path.join(organism_folder, genomic_fna_filename)  # Save in organism folder
            with open(local_filename, "wb") as local_file:
                ftp.retrbinary(f"RETR {genomic_fna_file}", local_file.write)
                print(f"Downloaded and renamed to: {local_filename}")
        else:
            print("No file ending with 'genomic.fna.gz' found.")

    except ftplib.error_perm as e:
        print(f"FTP error: {e}")

    finally:
        # Close the FTP connection
        ftp.quit()

Processing Assembly ID: GCA_032158425.1
Found FTP path for Assembly: ASM3215842v1 (ID: GCA_032158425.1) - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/032/158/425/GCA_032158425.1_ASM3215842v1
File found: GCA_032158425.1_ASM3215842v1_genomic.fna.gz
No file ending with 'protein.faa.gz' found.
Downloaded and renamed to: data/Pleurotus_giganteus_(basidiomycete_fungi)/GCA_032158425.1_ASM3215842v1_genomic.fna.gz
Processing Assembly ID: GCA_003313735.1
Found FTP path for Assembly: ASM331373v1 (ID: GCA_003313735.1) - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003/313/735/GCA_003313735.1_ASM331373v1
File found: GCA_003313735.1_ASM331373v1_genomic.fna.gz
No file ending with 'protein.faa.gz' found.
Downloaded and renamed to: data/Pleurotus_platypus_(basidiomycete_fungi)/GCA_003313735.1_ASM331373v1_genomic.fna.gz
Processing Assembly ID: GCA_005298045.1
Found FTP path for Assembly: ASM529804v1 (ID: GCA_005298045.1) - ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/005/298/045/GCA_005298045.1_ASM529804v1
F

In [None]:
perl /home/bioinfolab/Programs/repeats/RepeatMasker//RepeatMasker -e rmblast -dir . -lib /home/bioinfolab/rsiaperas/ntua_fungi/Zerba/pci/genome/repeats/RepeatMasker/../RepeatLibrary/Reps.lib -pa 4 -s -xsmall -gff pci_genome_masked.fa
