Code collated from:
1. Human Genome LM 0 Data Processing
https://github.com/kheyer/Genomic-ULMFiT/blob/master/Mammals/Human/Genomic%20Language%20Models/Human%20Genome%20LM%200%20Data%20Processing.ipynb
2. Human Genome LM 5 3-mer Stride 1 Language Model
https://github.com/kheyer/Genomic-ULMFiT/blob/master/Mammals/Human/Genomic%20Language%20Models/Human%20Genome%20LM%205%203-mer%20Stride%201%20Language%20Model.ipynb

In [1]:
from fastai import *
from fastai.text import *
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx

In [2]:
import sys
# Add the local ULMFiT directory to the module search path before importing the
# project helpers.
# NOTE(review): this is a Windows-style relative path, so it is presumably
# resolved against the current working directory — confirm on other platforms.
sys.path.append(".\\ULMFiT\\")
# Star-import: helper names used unqualified later in this file with no visible
# definition presumably come from here or from the fastai star-imports — verify.
from ULMFiT.utils import *

## Human Genome Data Preprocessing

#### Genome data from: https://www.ncbi.nlm.nih.gov/assembly/GCF_000001405.39

In [3]:
# Directory that holds the downloaded NCBI assembly, and the FASTA file name
# inside it. The two are kept separate because the processing step builds the
# input path with os.path.join(read_path, file); if read_path already ended
# with the file name, that join would repeat it and point at a missing file.
read_path = os.path.join(
    "NCBI_Data",
    "genome_assemblies_genome_fasta",
    "ncbi-genomes-2021-03-22",
)
file = "GCF_000001405.39_GRCh38.p13_genomic.fna"

# All generated artefacts are written under <outPath>/<expName>.
outPath = "Generated"
expName = "ULMFiT_Humans"
exp_path = os.path.join(outPath, expName)
# exist_ok=True creates the directory tree when missing and is a no-op when it
# already exists, so the cell can be re-run safely.
os.makedirs(exp_path, exist_ok=True)



In [4]:
# Run the project's FASTA processor over the genome file.
# NOTE(review): 10000 and 2000 are passed positionally to process_fasta and
# filter_txt='NC_' presumably restricts which records are kept — confirm the
# exact meaning of all three against ULMFiT.utils.
fasta_file = os.path.join(read_path, file)
processed_genome_data = process_fasta(fasta_file, 10000, 2000, filter_txt='NC_')

# Single 'Sequence' column plus a constant 'Source' column tagging the origin.
processed_genome_data_df = pd.DataFrame(processed_genome_data, columns=['Sequence'])
processed_genome_data_df['Source'] = 'NCBI Human'

# Persist without the index so the language-model step can read the CSV back
# with positional column references.
genome_csv = os.path.join(exp_path, 'human_genome_data_fa.csv')
processed_genome_data_df.to_csv(genome_csv, index=False)

## Human Genome Language Model

In [None]:
# Stream the processed genome CSV in chunks of 180000 rows. Only the first
# chunk is consumed here; df_iter is kept so the remaining chunks can be
# trained on later.
genome_csv = os.path.join(exp_path, 'human_genome_data_fa.csv')
df_iter = pd.read_csv(genome_csv, chunksize=180000)
df = next(df_iter)

# The first 20000 rows of the first chunk are the validation set; the rest of
# that chunk is the initial training set.
df_val = df[:20000]
df_trn = df[20000:]

# Tokenizer built around the project's GenomicTokenizer with ngram=3, stride=1;
# no pre/post rules, and 'xxpad' as the only special case.
tok_func = partial(GenomicTokenizer, ngram=3, stride=1)
tok = Tokenizer(tok_func, n_cpus=8, pre_rules=[], post_rules=[], special_cases=['xxpad'])

# Column 0 is passed as the text column and column 1 as the label column.
data = GenomicTextLMDataBunch.from_df(
    exp_path, df_trn, df_val,
    bs=800, tokenizer=tok, chunksize=10000,
    text_cols=0, label_cols=1, max_vocab=80000,
)

In [None]:
# Bare expression: in a notebook cell this displays the data bunch; when the
# file is run as a plain script it has no effect.
data

In [None]:
## Extract the genome vocabulary (the data bunch's `vocab.itos`) and store it
## as a .npy file in the experiment directory.
vocab_file = os.path.join(exp_path, 'human_vocab_3m1s.npy')
voc = data.vocab.itos
np.save(vocab_file, voc)

In [None]:
# Wrap the extracted vocabulary so that data bunches built for later chunks can
# be given this same vocab (passed as vocab=model_vocab in the training loop).
model_vocab = GenomicVocab(voc)

In [None]:
# Model configuration passed to get_model_LM together with drop_mult.
config = {
    'emb_sz': 400,
    'n_hid': 1150,
    'n_layers': 3,
    'pad_token': 0,
    'qrnn': False,
    'output_p': 0.25,
    'hidden_p': 0.1,
    'input_p': 0.2,
    'embed_p': 0.02,
    'weight_p': 0.15,
    'tie_weights': True,
    'out_bias': True,
}
# NOTE(review): presumably a global multiplier applied to the dropout values
# above — confirm against get_model_LM.
drop_mult = 0.3

In [None]:
# Build the language-model learner from the data bunch, dropout multiplier and
# config, then convert it with to_fp16(dynamic=True) before any training.
learn = get_model_LM(data, drop_mult, config).to_fp16(dynamic=True)
# Bare expression: displays the model architecture when run in a notebook.
learn.model

In [None]:
# Run the learner's learning-rate finder; its results are plotted from
# learn.recorder in the following cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured (here, the preceding lr_find run) to help
# choose the learning rate used for training.
learn.recorder.plot()

In [None]:
# Initial training on the first chunk's data bunch.
# NOTE(review): in fastai v1 the positional arguments are presumably the cycle
# length (2 epochs) and the maximum learning rate (5e-3) — confirm against the
# installed fastai version.
learn.fit_one_cycle(2, 5e-3, moms=(0.8, 0.7))

In [None]:
## Checkpoint the full model and its encoder after training on the first chunk.
# NOTE(review): fastai v1 presumably resolves string names relative to
# learn.path/learn.model_dir; since exp_path is a relative path, confirm the
# files land where expected and that the target directory exists.
model_ckpt = os.path.join(exp_path, 'human_3m1s')
encoder_ckpt = os.path.join(exp_path, 'human_3m1s_enc')
learn.save(model_ckpt)
learn.save_encoder(encoder_ckpt)

In [None]:
# Continue training on the remaining CSV chunks, one cycle per chunk, reusing
# the tokenizer, the saved vocabulary and the validation split taken from the
# first chunk.
count = 0
lr = 5e-3
for df in df_iter:
    # Root each data bunch at exp_path, the directory used for the first
    # chunk's data bunch (no `path` variable is defined in this file).
    data = GenomicTextLMDataBunch.from_df(exp_path, df, df_val, bs=800, tokenizer=tok, vocab=model_vocab,
                                          max_vocab=80000, chunksize=20000, text_cols=0, label_cols=1)
    learn.data = data
    # Divide the base rate by 1.5 for every chunk already processed:
    # lr / (1.5 ** count).
    lr_iter = lr / 1.5 ** count
    print(f'Learning Rate: {lr_iter}')
    # Train with the decayed rate that was just printed, not the base rate.
    learn.fit_one_cycle(1, lr_iter, moms=(0.8, 0.7))
    count += 1

In [None]:
## Checkpoint the model and encoder after the chunked training, while the
## learner is still in its to_fp16 state.
fp16_model_ckpt = os.path.join(exp_path, 'human_3m1s_2')
fp16_encoder_ckpt = os.path.join(exp_path, 'human_3m1s_enc_2')
learn.save(fp16_model_ckpt)
learn.save_encoder(fp16_encoder_ckpt)

# Convert the learner with to_fp32() and save a second pair of checkpoints
# under the *_fp32 names.
learn = learn.to_fp32()
fp32_model_ckpt = os.path.join(exp_path, 'human_3m1s_2_fp32')
fp32_encoder_ckpt = os.path.join(exp_path, 'human_3m1s_enc_2_fp32')
learn.save(fp32_model_ckpt)
learn.save_encoder(fp32_encoder_ckpt)