In [1]:
# ----- SETUP (RUN ONCE) ------
# Before running make sure you set your fork
# You can find this from your forked repo under the code button
# Only run once, or if you CANNOT cd (change directory)
# into the repository
# For this to work (in colab)
# you must have created a GITHUB token
# and added it to colab.

# Make a change

from google.colab import userdata
email = userdata.get('GITHUB_EMAIL')
username = userdata.get('GITHUB_USERNAME')

# Change this to your forked url!!
gh_repo_url = "https://github.com/nkmwicz/fungal-temp-analysis.git"

# NOTE: set your email as your email
!git config --global user.email {email}

# NOTE: Change your username
!git config --global user.name {username}

# NOTE: change the address to the address of your fork!
!git clone {gh_repo_url}

fatal: destination path 'fungal-temp-analysis' already exists and is not an empty directory.


In [2]:
!pip install ncbi-genome-download
!pip install pybarrnap



In [3]:
# ---- Change Directory to repository -----
# Run when first open jupyter notebook.
# If this fails, us above cell.
%cd fungal-temp-analysis

/content/fungal-temp-analysis


In [4]:
%ls

delete_me.csv                     FungiWork.ipynb  README.md
eukaryotes_ncbi_temperatures.csv  LICENSE          temperature_data.tsv


In [5]:
import pandas as pd

df = pd.read_csv("eukaryotes_ncbi_temperatures.csv")
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Strain,BioSample,BioProject,Assembly,Level,Size(Mb),GC%,Replicons,WGS,Scaffolds,CDS,Release Date,GenBank FTP,RefSeq FTP,Temperature (°C)
0,Neopyropia yezoensis,Eukaryota;Other;Other,,SAMN13316713,PRJNA589917,GCA_009829735.1,Chromosome,107.591,64.8454,chromosome 1:CM020618.1; chromosome 2:CM020619...,WMLA01,28,0,2020-01-03T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009...,,
1,Emiliania huxleyi CCMP1516,Eukaryota;Protists;Other Protists,CCMP1516,SAMN02744062,PRJNA77753,GCA_000372725.1,Scaffold,167.676,64.5,,AHAL01,7795,38554,2013-04-19T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,
2,Glycine max,Eukaryota;Plants;Land Plants,,SAMN00002965,PRJNA19861,GCA_000004515.5,Chromosome,978.942,35.1221,chromosome 1:NC_016088.4/CM000834.4; chromosom...,ACUP04,347,74248,2010-01-05T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,
3,Medicago truncatula,Eukaryota;Plants;Land Plants,,SAMN08400029,PRJNA702529,GCA_003473485.2,Chromosome,430.008,33.4462,chromosome 1:NC_053042.1/CM010648.1; chromosom...,PSQE01,42,42683,2018-09-06T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/003...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003...,
4,Solanum lycopersicum,Eukaryota;Plants;Land Plants,,SAMN02981290,PRJNA119,GCA_000188115.3,Chromosome,828.349,35.6991,chromosome 1:NC_015438.3/CM001064.3; chromosom...,AEKE03,3150,37660,2010-12-10T00:00:00Z,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000...,ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000...,


In [6]:
columns = ["#Organism Name", "Organism Groups", "Assembly", "Temperature (°C)"]
df = df[columns]
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
0,Neopyropia yezoensis,Eukaryota;Other;Other,GCA_009829735.1,
1,Emiliania huxleyi CCMP1516,Eukaryota;Protists;Other Protists,GCA_000372725.1,
2,Glycine max,Eukaryota;Plants;Land Plants,GCA_000004515.5,
3,Medicago truncatula,Eukaryota;Plants;Land Plants,GCA_003473485.2,
4,Solanum lycopersicum,Eukaryota;Plants;Land Plants,GCA_000188115.3,


In [7]:
df = df.dropna(subset="Temperature (°C)")
df.shape

(972, 4)

In [8]:
df = df.loc[df['Organism Groups'].str.contains("Fung")]
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0
...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0


In [9]:
df.drop_duplicates(subset="Assembly")
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C)
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0
...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0


In [10]:
import re
df['species_root_name'] = df['#Organism Name'].apply(lambda item: " ".join(item.split(" ")[:2]))
df['species_root_name'] = df['species_root_name'].apply(lambda item: re.sub(r"[\[\];'\",\(\).;\-]", "", item))
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans
...,...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0,Hanseniaspora lindneri
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0,Cystobasidium slooffiae
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0,Candida anglica
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0,Penicillium brevicompactum


In [11]:
len(df['species_root_name'].unique())

946

In [12]:
df['duplicate'] = df.duplicated(subset="species_root_name", keep=False)
df

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
10,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe,False
11,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans,False
12,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus,False
13,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa,False
14,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans,False
...,...,...,...,...,...,...
7874,Hanseniaspora lindneri,Eukaryota;Fungi;Ascomycetes,GCA_019649525.1,25.0,Hanseniaspora lindneri,False
7913,Cystobasidium slooffiae,Eukaryota;Fungi;Basidiomycetes,GCA_019775285.1,25.0,Cystobasidium slooffiae,False
7949,[Candida] anglica,Eukaryota;Fungi;Ascomycetes,GCA_019775655.1,25.0,Candida anglica,False
7961,Penicillium brevicompactum,Eukaryota;Fungi;Ascomycetes,GCA_019843585.1,24.0,Penicillium brevicompactum,False


In [13]:
df = df.loc[~df['#Organism Name'].str.contains(" cf. ")]
df.shape

(954, 6)

In [14]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
0,Schizosaccharomyces pombe,Eukaryota;Fungi;Ascomycetes,GCA_000002945.2,25.0,Schizosaccharomyces pombe,False
1,Aspergillus nidulans FGSC A4,Eukaryota;Fungi;Ascomycetes,GCA_000149205.2,26.0,Aspergillus nidulans,False
2,Aspergillus fumigatus Af293,Eukaryota;Fungi;Ascomycetes,GCA_000002655.1,28.0,Aspergillus fumigatus,False
3,Neurospora crassa OR74A,Eukaryota;Fungi;Ascomycetes,GCA_000182925.2,25.0,Neurospora crassa,False
4,Candida albicans SC5314,Eukaryota;Fungi;Ascomycetes,GCA_000182965.3,25.0,Candida albicans,False


In [16]:
import subprocess, requests, tarfile, os
from zipfile import ZipFile
from pybarrnap import Barrnap
from pybarrnap.utils import load_example_fasta_file

# get fasta files
# unzip fasta
# barrnap fasta file for quality.
# get tRNA & save it in df
# scrape web for it????
for row in range(1):
    assembly = df.at[row, 'Assembly']
    url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly}/download?include_annotation_type=GENOME_FASTA"
    res = requests.get(url)  # this returns a zip folder

    # Define the filename for the downloaded zip file
    zip_filename = f"{assembly}.zip"
    # Save the zip file
    with open(zip_filename, 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
    extract_dir = f"{assembly}_dir"
    os.mkdir(extract_dir)

    with ZipFile(zip_filename, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    %cd {extract_dir}/ncbi_dataset/data/{assembly}

    fasta_file = os.listdir()[0] # => []
    # Run pybarrnap rRNA prediction
    barrnap = Barrnap(
        fasta_file,
        evalue=1e-6,
        lencutoff=0.8,
        reject=0.25,
        threads=1,
        kingdom="euk",
        accurate=False,
        quiet=False,
    )
    result = barrnap.run()
    # Get rRNA GFF text and print
    print("\n========== Print rRNA GFF ==========")
    print(result.get_gff_text())

    # Get rRNA features and print
    print("\n========== Print rRNA features ==========")
    for rec in result.seq_records:
        for feature in rec.features:
            print(feature) #feature.id, feature.type, feature.location, feature.qualifiers)
            # all three types had to be identified
            # must = 5S, 5.8S, 18S, 28S

    # Do stuff here with the Fasta file!
    # Pybarrnap
    # Create a quality metric & save to df
    # Save quality metric to df
    # Find tRNA from the fasta
    # Save tRNA to df

    %cd ../../../../
    %ls
    %rm -rf {extract_dir}
    # !ncbi-genome-download --section genbank --assembly-accessions {assembly} --formats fasta fungi



/content/fungal-temp-analysis/GCA_000002945.2_dir/ncbi_dataset/data/GCA_000002945.2

##gff-version 3
CU329670.1	pybarrnap:0.5.1	rRNA	149660	149774	7.8e-22	+	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	456529	456643	4.7e-22	+	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	912232	912346	1.9e-21	-	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	1563477	1563591	2.1e-21	+	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	2976821	2976935	7.8e-22	+	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	3165398	3165512	7.8e-22	+	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	3547902	3548016	7.9e-20	-	.	Name=5S_rRNA;product=5S ribosomal RNA;Dbxref=RFAM:RF00001
CU329670.1	pybarrnap:0.5.1	rRNA	3634373	3634487	7.8e-22	+	.	Name=5S_rRNA;produc

In [None]:
%ls

delete_me.csv                     FungiWork.ipynb       GCA_000002945.2.zip  README.md
eukaryotes_ncbi_temperatures.csv  [0m[01;34mGCA_000002945.2_dir[0m/  LICENSE              temperature_data.tsv


In [None]:
%ls

delete_me.csv                     FungiWork.ipynb       GCA_000002945.2.zip  README.md
eukaryotes_ncbi_temperatures.csv  [0m[01;34mGCA_000002945.2_dir[0m/  LICENSE              temperature_data.tsv


In [None]:
%cd data/
%ls

/content/fungal-temp-analysis/GCA_000002945.2_dir/GCA_000002945.2_dir/ncbi_dataset/data
assembly_data_report.jsonl  dataset_catalog.json  [0m[01;34mGCA_000002945.2[0m/


In [None]:
%cd {assembly}
%ls

/content/fungal-temp-analysis/GCA_000002945.2_dir/GCA_000002945.2_dir/ncbi_dataset/data/GCA_000002945.2
GCA_000002945.2_ASM294v2_genomic.fna


In [None]:
%rm -rf {extract_dir}

In [None]:
duplicated_species = df.loc[df['duplicate'] == True]
duplicated_species

Unnamed: 0,#Organism Name,Organism Groups,Assembly,Temperature (°C),species_root_name,duplicate
47,Cryptococcus neoformans var. neoformans JEC21,Eukaryota;Fungi;Basidiomycetes,GCA_000091045.1,27.0,Cryptococcus neoformans,True
1069,Saccharomyces cerevisiae x Saccharomyces kudri...,Eukaryota;Fungi;Ascomycetes,GCA_009665985.1,25.0,Saccharomyces cerevisiae,True
1495,Ogataea polymorpha,Eukaryota;Fungi;Ascomycetes,GCA_001664045.1,25.0,Ogataea polymorpha,True
1541,Saccharomyces cerevisiae x Saccharomyces uvarum,Eukaryota;Fungi;Ascomycetes,GCA_013180185.1,25.0,Saccharomyces cerevisiae,True
2300,Magnusiomyces capitatus NRRL Y-17686,Eukaryota;Fungi;Ascomycetes,GCA_900497725.1,25.0,Magnusiomyces capitatus,True
3227,Saccharomycopsis fibuligera,Eukaryota;Fungi;Ascomycetes,GCA_001936155.1,25.0,Saccharomycopsis fibuligera,True
4451,Magnusiomyces capitatus CNRMA 12.647,Eukaryota;Fungi;Ascomycetes,GCA_000817185.1,25.0,Magnusiomyces capitatus,True
4962,Cryptococcus neoformans AD hybrid,Eukaryota;Fungi;Basidiomycetes,GCA_006992865.1,27.0,Cryptococcus neoformans,True
5187,Saccharomyces cerevisiae x Saccharomyces eubay...,Eukaryota;Fungi;Ascomycetes,GCA_009665555.1,25.0,Saccharomyces cerevisiae,True
5188,Saccharomyces cerevisiae x Saccharomyces eubay...,Eukaryota;Fungi;Ascomycetes,GCA_009666275.1,25.0,Saccharomyces cerevisiae,True


In [None]:
!pip install ncbi-genome-download

Collecting ncbi-genome-download
  Downloading ncbi_genome_download-0.3.3-py2.py3-none-any.whl.metadata (10 kB)
Collecting appdirs (from ncbi-genome-download)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Downloading ncbi_genome_download-0.3.3-py2.py3-none-any.whl (26 kB)
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: appdirs, ncbi-genome-download
Successfully installed appdirs-1.4.4 ncbi-genome-download-0.3.3


In [None]:
df["short_name"] = df["#Organism Name"].str.split(" ")[0]
df


KeyError: 0

In [None]:
# make sure to run this cell to have access to this function below
def commit_and_push(repo_url, message):
    import os
    import subprocess
    from google.colab import userdata

    try:
        # change your username from mine to yours.
        username = userdata.get("GITHUB_USERNAME")
        token = userdata.get("GITHUB_TOKEN")
        !git add .
        !git commit -m {message}
        !git push https://{username}:{token}@github.com/{username}/fungal-temp-analysis.git
        print("Changes Saved to GitHub!")
    except Exception as e:
        print(e)

# Only call this function if you want to
def catch_up_to_main_repo():
    import os
    from google.colab import userdata
    try:
        username = userdata.get("GITHUB_USERNAME")
        token = userdata.get("GITHUB_TOKEN")
        main_repo = "https://github.com/nkmwicz/fungal-temp-analysis.git"
        !git remote add upstream {main_repo}
        !git fetch upstream
        !git merge upstream/main
        !git push https://{username}:{token}@github.com/{username}/fungal-temp-analysis.git
    except Exception as e:
        print(e)

In [None]:
# SOME NOTES
# %ls shows the available files in the repo to get names of files
# to read a tsv, use pd.read_csv but pass in sep="\t"
# to reperesent tabs as separators
%ls

delete_me.csv                     FungiWork.ipynb  README.md
eukaryotes_ncbi_temperatures.csv  LICENSE          temperature_data.tsv


In [None]:
# columns needed
# remove rows that aren't fungi ()
# remove those without temp.
# name, assembly, temp

In [None]:
import pandas as pd
df = pd.read_csv("./delete_me.csv")
df.loc[len(df)] = [11,12,13,14]
df.to_csv("./delete_me.csv", index=False)

In [None]:
# ----Keep as last cell----
# Use to save changes in repo that are not in this file.
# To save this file, use ctrl + s
# Then set commit message location, branch, etc.

# Change repo url to your forked url
url = "https://github.com/nkmwicz/fungal-temp-analysis.git"
# Change commit message
commit_and_push(url, "made changes to csv")

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 313 bytes | 313.00 KiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/nkmwicz/fungal-temp-analysis.git
   c74cf0e..cf49267  main -> main
Changes Saved to GitHub!
