In [32]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

from bio_files_processor import OpenFasta
from custom_random_forest import RandomForestClassifierCustom
from das_biotools import run_genscan, DNASequence, AminoAcidSequence, RNASequence

In [33]:
# (Re)load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up before each cell runs without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# 1. Custom Random Forest

In [34]:
# Synthetic classification data. Seeded so that Restart & Run All reproduces the
# same X, y (and therefore comparable timings and predictions) on every run.
X, y = make_classification(n_samples=100000, random_state=42)

# One shared hyperparameter set: the two forests must be configured identically so
# that the only difference between the runs below is the n_jobs value passed to
# fit/predict.
forest_params = dict(max_depth=30, n_estimators=100, max_features=8, random_state=42)
n_job_1_random_forest = RandomForestClassifierCustom(**forest_params)
n_job_2_random_forest = RandomForestClassifierCustom(**forest_params)

In [35]:
# Time fitting the first forest on (X, y) with n_jobs=1.
%time n_job_1_random_forest.fit(X,y,n_jobs=1)

# Time fitting the second forest on the same data with n_jobs=2.
# NOTE(review): n_jobs presumably sets the number of parallel workers — confirm in custom_random_forest.
%time n_job_2_random_forest.fit(X,y,n_jobs=2)

CPU times: user 61.7 ms, sys: 107 ms, total: 169 ms
Wall time: 2min 22s
CPU times: user 62.4 ms, sys: 134 ms, total: 196 ms
Wall time: 1min 17s


In [36]:
# Time prediction on X (the same data the forests were fitted on) with n_jobs=1,
# keeping the predictions for the comparison in the next cell.
%time n_jobs_1_pred = n_job_1_random_forest.predict(X, n_jobs=1)

# Same prediction call on the second forest with n_jobs=2.
%time n_jobs_2_pred = n_job_2_random_forest.predict(X, n_jobs=2)

CPU times: user 40.7 ms, sys: 115 ms, total: 156 ms
Wall time: 2.63 s
CPU times: user 68.9 ms, sys: 208 ms, total: 277 ms
Wall time: 2.3 s


In [37]:
# Check that the n_jobs=1 and n_jobs=2 predictions agree element-for-element.
# np.array_equal compares in one vectorised pass and returns False when the two
# results differ in shape, instead of iterating a boolean array in Python.
np.array_equal(n_jobs_1_pred, n_jobs_2_pred)

True

# 2. OpenFasta

In [38]:
# Open the example FASTA file through the OpenFasta context manager and print the
# result of a single read_record() call.
with OpenFasta("data/example_fasta.fasta") as fasta_file:
    print(fasta_file.read_record())

ID: >GTD323452,
 Description: 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+),
 Sequence:ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG



In [39]:
# Re-open the same FASTA file and print the result of read_records(); the printed
# value is a list of records rather than a single one.
with OpenFasta("data/example_fasta.fasta") as fasta_file:
    print(fasta_file.read_records())

[ID: >GTD323452,
 Description: 5S_rRNA NODE_272_length_223_cov_0.720238:18-129(+),
 Sequence:ACGGCCATAGGACTTTGAAAGCACCGCATCCCGTCCGATCTGCGAAGTTAACCAAGATGCCGCCTGGTTAGTACCATGGTGGGGGACCACATGGGAATCCCTGGTGCTGTG
, ID: >GTD678345,
 Description: 16S_rRNA NODE_80_length_720_cov_1.094737:313-719(+),
 Sequence:TTGGCTTCTTAGAGGGACTTTTGATGTTTAATCAAAGGAAGTTTGAGGCAATAACAGGTCTGTGATGCCCTTAGATGTTCTGGGCCGCACGCGCGCTACACTGAGCCCTTGGGAGTGGTCCATTTGAGCCGGCAACGGCACGTTTGGACTGCAAACTTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGT
, ID: >GTD174893,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:2153860-2155398(+),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAGGAAACAGCTTGCTGTTTCGCTGACGAGTGGGAAGTAGGTAGCTTAACCTTCGGGAGGGCGCTTACCACTTTGTGATTCATGACTGGGGTGAAGTCGTAACAAGGTAACCGTAGGGGAACCTGCGGTTGGATCACCTCCTT
, ID: >GTD906783,
 Description: 16S_rRNA NODE_1_length_2558431_cov_75.185164:793941-795479(-),
 Sequence:TTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAACGGTAACAG

# 3. run_genscan

In [40]:
# Run GENSCAN on the example sequence file via the project helper and summarise
# the returned status, first predicted CDS, exons and introns.
file = "data/myc.fna"

output = run_genscan(sequence_file=file)

print("Status:", output.status, "\n")
print("Predicted CDS List:")
# Guard the first-element lookup: with no predicted CDS, indexing [0] would raise
# a bare IndexError and stop the cell before the exon/intron tables are shown.
# NOTE(review): assumes cds_list is an empty list when nothing is predicted — confirm in das_biotools.
if output.cds_list:
    first_cds = output.cds_list[0]
    print(first_cds["peptide"], ":", first_cds["sequence"])
else:
    print("No CDS predicted")

# Tabulate the exon records for readable output.
exon_df = pd.DataFrame(output.exon_list)
print("\nExon List:")
print(exon_df)

# Tabulate the intron records the same way.
intron_df = pd.DataFrame(output.intron_list)
print("\nIntron List:")
print(intron_df)

Status: 200 

Predicted CDS List:
['GENSCAN_predicted_peptide_1', '1364_aa'] : MRFVKQYCYGGAAEKGRGFEREQKKMPAVRNPCRIHETLPIAAGGHFALELTTPEQGRDSPDAGRLFCPFGDTSPPLPGPASLKGSPCSCLDAGFFSGSGKPALTPLPRPPSGVLAPEMRRNCEERGSGRFQNSCYPWWGGSGGGIAAGSLAQLHLRIECEGRWRKTLCLGFWQIVFLTATSRGFLRAPGPISIPLPLRGRLPGFALRAPGGAGARRAPSRWFTKCVSEIAGDCPKGQPPATMPLNVSFTNRNYDLDYDSVQPYFYCDEEENFYQQQQQSELQPPAPSEDIWKKFELLPTPPLSPSRRSGLCSPSYVAVTPFSLRGDNDGGGGSFSTADQLEMVTELLGGDMVNQSFICDPDDETFIKNIIIQDCMWSGFSAAAKLVSEKLASYQAARKDSGSPNPARGHSVCSTSSLYLQDLSAAASECIDPSVVFPYPLNDSSSPKSCASQDSSAFSPSSDSLLSSTESSPQGSPEPLVLHEETPPTTSSDSEEEQEDEEEIDVVSVEKRQAPGKRSESGSPSAGGHSKPPHSPLVLKRCHVSTHQHNYAAPPSTRKDYPAAKRVKLDSVRVLRQISNNRKCTSPRSSDTEENVKRRTHNVLERQRRNELKRSFFALRDQIPELENNEKAPKVVILKKATAYILSVQAEEQKLISEEDLLRKRREQLKHKLEQLRNSCATRMRFVKQYCYGGAAEKGRGFEREQKKMPAVRNPCRIHETLPIAAGGHFALELTTPEQGRDSPDAGRLFCPFGDTSPPLPGPASLKGSPCSCLDAGFFSGSGKPALTPLPRPPSGVLAPEMRRNCEERGSGRFQNSCYPWWGGSGGGIAAGSLAQLHLRIECEGRWRKTLCLGFWQIVFLTATSRGFLRAPGPISIPLPLRGRLPGFALRAPGGAGARRAPSRWFTKCVSEIAGDCPKGQ

# 4. AminoAcidSequence / DNASequence / RNASequence

In [41]:
# Build an amino-acid sequence and display the result of calculate_aa_freq()
# as the cell output (a mapping from residue letter to its count).
aa_seq = AminoAcidSequence("LIMM")
aa_seq.calculate_aa_freq()

{'L': 1, 'I': 1, 'M': 2}

In [42]:
# Build a DNA sequence and print the result of transcribe().
dna_seq = DNASequence("ATGC")
print(dna_seq.transcribe())

# Display the type of the transcription result as the cell output.
type(dna_seq.transcribe())

AUGC


das_biotools.RNASequence

In [43]:
# Build an RNA sequence and display the result of complement() as the cell output.
rna_seq = RNASequence("AUGC")
rna_seq.complement()

UACG