In [8]:
import pandas as pd 
from selenobot.utils import DATA_DIR, dataframe_from_fasta
import os
import numpy as np
import subprocess
import time

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Validation

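Fitness data for each organism were obtained from the [February 2024 release of the Fitness Browser RB-TnSeq data for diverse bacteria and archaea](https://figshare.com/articles/dataset/February_2024_release_of_the_Fitness_Browser_RB-TnSeq_data_for_diverse_bacteria_and_archaea/25236931/1) on figshare.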

In [2]:
# Root directory for the mutagenesis validation data.
MUT_DATA_DIR = os.path.join(DATA_DIR, 'validation', 'mutagenesis')

# Each entry in the 'fitness' subdirectory is named after an organism, carrying a
# 'db.StrainFitness.' prefix which is stripped here. The 'aaseqs' entry is skipped.
# os.listdir returns entries in arbitrary order, so the result is sorted to keep the
# organism list (and anything that iterates over it) reproducible between runs.
ORGANISMS = sorted(
    entry.replace('db.StrainFitness.', '')
    for entry in os.listdir(os.path.join(MUT_DATA_DIR, 'fitness'))
    if entry != 'aaseqs'
)

# NOTE: 51 organisms were expected; TODO confirm why the printed count is lower.
print(f'Obtained fitness data for {len(ORGANISMS)} organisms.')

Obtained fitness data for 49 organisms.


In [3]:
# Download gene metadata for each organism, which contains the locations within the genomes.
# One '<organism>_genes.tsv' file per organism is written into MUT_DATA_DIR.
for org in ORGANISMS:
    output_file = os.path.join(MUT_DATA_DIR, f'{org}_genes.tsv')
    if os.path.exists(output_file):  # Make sure not to download the file twice.
        continue

    url = f'https://fit.genomics.lbl.gov/cgi-bin/orgGenes.cgi?orgId={org}'
    # wget -O creates its output file before the transfer starts, so a failed download
    # would leave a stub that the existence check above then skips on every later run.
    # Download to a temporary name and only move it into place once wget succeeds.
    partial_file = output_file + '.part'
    try:
        # Passing an argument list (no shell) keeps the '?' in the URL and any
        # whitespace in the path from being interpreted by the shell.
        subprocess.run(['wget', url, '-O', partial_file], check=True)
        os.replace(partial_file, output_file)
    finally:
        # Only present here if the download failed or was interrupted.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    time.sleep(1)  # Pause between requests to avoid hammering the server.

In [9]:
# Load the sequences from the FASTA file and count the genes across all organisms.
genes = dataframe_from_fasta(
    os.path.join(MUT_DATA_DIR, 'aaseqs.fa'),
    parse_header=False,
)

# Sanity check: the average number of genes per organism should look plausible.
gene_count = len(genes)
genes_per_organism = np.round(gene_count / len(ORGANISMS))
print(gene_count, f'total genes obtained. This is about {genes_per_organism} genes per organism.')

221030 total genes obtained. This is about 4511.0 genes per organism.


In [5]:
# Read the tab-separated fitness table for one organism; the first column becomes the index.
df = pd.read_csv(
    os.path.join(MUT_DATA_DIR, 'fit_organism_acidovorax_3H11.tsv'),
    sep='\t',
    index_col=0,
)

In [12]:
# Display the fitness table loaded above (pandas elides the middle rows and columns of large frames).
df

Unnamed: 0_level_0,locusId,sysName,geneName,desc,set1IT049 Nickel (II) chloride 0.6 mM,set1IT050 Cobalt chloride 0.16 mM,set1IT051 Cobalt chloride 0.32 mM,set1IT052 copper (II) chloride 1 mM,set1IT053 sodium fluoride 25 mM,set1IT055 LB,...,set4IT075 L-Glutamine (N),set4IT077 L-Proline (N),set4IT078 L-Alanine (N),set4IT080 L-Isoleucine (N),set4IT081 L-Leucine (N),set4IT083 L-Phenylalanine (N),set4IT084 L-tyrosine (N),set4IT086 D-Alanine (N),set4IT090 Uridine (N),set4IT092 Ammonium chloride (N)
orgId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acidovorax_3H11,Ac3H11_1,,,"Transcriptional regulator, AraC family",-0.078,-0.178,-0.104,-0.349,-0.201,-0.384,...,-0.216,-0.296,0.049,-0.252,-0.451,-0.190,-0.142,-0.008,0.034,-0.149
acidovorax_3H11,Ac3H11_10,,,Ribosomal large subunit pseudouridine synthase...,0.095,-0.072,0.229,0.071,0.041,0.065,...,-0.299,-0.190,-0.949,0.061,-0.394,-0.128,0.088,-0.077,-0.090,-0.591
acidovorax_3H11,Ac3H11_100,,,Enoyl-CoA hydratase (EC 4.2.1.17),-0.139,-0.256,0.249,-0.015,0.167,-0.103,...,0.000,-0.041,-0.305,0.506,0.645,-0.136,0.412,0.756,0.116,0.323
acidovorax_3H11,Ac3H11_1000,,,Nicotinate-nucleotide--dimethylbenzimidazole p...,-0.288,-1.054,-0.281,0.536,1.023,-1.065,...,0.461,0.098,-3.548,-0.097,-0.978,0.059,-1.069,1.466,-0.237,-0.542
acidovorax_3H11,Ac3H11_1001,,,GGDEF/PAS/PAC-domain containing protein,0.235,0.167,0.249,0.095,0.110,0.194,...,0.058,-0.016,0.184,-0.065,-0.036,-0.059,0.020,0.118,0.069,0.013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
acidovorax_3H11,Ac3H11_994,,,Cob(I)alamin adenosyltransferase (EC 2.5.1.17),0.103,-0.024,-0.467,0.066,0.186,0.065,...,0.359,-0.046,-0.210,0.334,0.253,-0.037,0.357,-0.240,0.114,-0.269
acidovorax_3H11,Ac3H11_995,,,Adenosylcobinamide-phosphate synthase (EC 6.3....,0.169,-0.268,-0.177,-0.306,-0.220,-0.065,...,-0.194,-0.025,-0.717,0.055,-0.064,0.091,-0.154,-0.069,0.188,0.119
acidovorax_3H11,Ac3H11_996,,,L-threonine 3-O-phosphate decarboxylase (EC 4....,-0.078,-0.586,-0.058,0.091,-0.071,-0.099,...,-0.104,-0.040,-0.245,0.225,-0.452,-0.017,-0.074,0.005,-0.411,-0.573
acidovorax_3H11,Ac3H11_997,,,FIG00537295: hypothetical protein,-0.044,0.345,0.045,0.265,0.282,-0.189,...,0.010,-0.023,0.135,-0.187,-0.089,0.170,0.093,0.169,-0.074,0.286
