## Create conda environment

Cell outputs have been cleared because they take up a lot of space in the notebook.

In [None]:
%%bash
# Create a conda environment named "dnabert" with Python 3.6.
# --yes auto-confirms the "Proceed ([y]/n)?" prompt, which cannot be answered
# interactively from a notebook cell.
conda create --yes -n dnabert python=3.6

In [None]:
%%bash
# Activate the "dnabert" environment and, if that succeeds, list all conda
# environments.
# NOTE(review): %%bash runs this cell in its own shell process, so the activation
# presumably does not carry over to later cells or to the kernel — confirm that
# the kernel itself was started from the intended environment.
source activate dnabert && conda env list

## Install packages

In [None]:
# Fetch the DNABERT sources; by default git clones into ./DNABERT, the directory
# entered a few cells below.
!git clone https://github.com/jerryji1993/DNABERT

In [None]:
# Install PyTorch 1.4 together with torchvision and CUDA toolkit 10.0 from the
# "pytorch" channel; --yes skips conda's confirmation prompt.
!conda install pytorch=1.4 torchvision cudatoolkit=10.0 -c pytorch --yes

In [None]:
# Show the NVIDIA driver and the GPUs visible on this machine.
!nvidia-smi

In [None]:
import os

# Enter the cloned repository so the `pip install --editable .` below runs from
# its root. The chdir is skipped when the kernel is already inside a directory
# named DNABERT, so re-running this cell does not raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "DNABERT":
    os.chdir("./DNABERT")

In [None]:
# Install the package found in the current directory in editable (development) mode.
!python3 -m pip install --editable .

In [None]:
# Enter the examples directory; requirements.txt and run_finetune.py are
# referenced relative to it in the cells below. The chdir is skipped when the
# kernel is already there, so re-running this cell does not raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "examples":
    os.chdir("./examples")

In [None]:
# Upgrade setuptools before installing the requirements below.
!python3 -m pip install --upgrade setuptools

In [None]:
# Install the dependencies listed in requirements.txt of the current directory.
!python3 -m pip install -r requirements.txt

In [None]:
# Pin urllib3 to exactly 1.25.4.
# NOTE(review): the reason for this pin is not recorded here — presumably a
# dependency conflict; confirm before changing the version.
!python3 -m pip install urllib3==1.25.4

In [None]:
# Install the `future` package (unpinned).
!python3 -m pip install future

In [None]:
# Optional (disabled): update conda itself in the base environment.
#%%bash
#conda update -n base -c defaults conda

## Install apex

In [None]:
!conda install -c conda-forge nvidia-apex  --y 

In [None]:
# Alternative (disabled): build apex from source with the C++ and CUDA extensions.
# git clone https://github.com/NVIDIA/apex
# cd apex
# pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./

In [None]:
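# Alternative (disabled): install from the current directory without the
# --cpp_ext / --cuda_ext build options.
# pip install -v --disable-pip-version-check --no-cache-dir ./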

## Fine-tuning

In [None]:
# Check the GPUs again before launching the fine-tuning run below.
!nvidia-smi

In [None]:
# Print the working directory; the relative paths used below (./model,
# sample_data/ft/..., ./ft/...) are resolved against it.
!pwd

## Code for distributed training and 16-bit training

In [None]:
%%bash
# Fine-tune on the k-mer sample data. torch.distributed.launch starts three
# processes (--nproc_per_node=3) with GPUs 0-2 visible, and --fp16 is passed
# through to run_finetune.py.
export KMER=6
export MODEL_PATH=./model
export DATA_PATH="sample_data/ft/${KMER}"
export OUTPUT_PATH="./ft/${KMER}"

# Script arguments are collected in an array so they can be grouped and annotated.
finetune_args=(
    # model, tokenizer and task
    --model_type dna
    --tokenizer_name "dna${KMER}"
    --model_name_or_path "${MODEL_PATH}"
    --task_name dnaprom
    --fp16
    # what to run
    --do_train
    --do_eval
    --evaluate_during_training
    # data and batching
    --data_dir "${DATA_PATH}"
    --max_seq_length 300
    --per_gpu_eval_batch_size 32
    --per_gpu_train_batch_size 32
    --n_process 8
    # training hyper-parameters
    --learning_rate 2e-4
    --num_train_epochs 4.0
    --warmup_percent 0.1
    --hidden_dropout_prob 0.1
    --weight_decay 0.01
    # output, logging and checkpoints
    --output_dir "${OUTPUT_PATH}"
    --overwrite_output
    --logging_steps 100
    --save_steps 4000
)

CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 run_finetune.py "${finetune_args[@]}"

In [None]:
## Prediction

In [None]:
%%bash
# Run prediction (--do_predict) with the model stored under ./ft/$KMER, which is
# the OUTPUT_PATH of the fine-tuning cell. Three processes are launched with
# GPUs 0-2 visible.
export KMER=6
# Derived from KMER like the other paths, so changing KMER keeps them all in sync.
export MODEL_PATH="./ft/${KMER}"
export DATA_PATH="sample_data/ft/${KMER}"
export PREDICTION_PATH="./result/${KMER}"

# Script arguments are collected in an array so they can be grouped and annotated.
predict_args=(
    # model, tokenizer and task
    --model_type dna
    --fp16
    --tokenizer_name "dna${KMER}"
    --model_name_or_path "${MODEL_PATH}"
    --task_name dnaprom
    # what to run
    --do_predict
    # data and batching
    --data_dir "${DATA_PATH}"
    --max_seq_length 300
    --per_gpu_pred_batch_size 32
    --n_process 48
    # output locations
    --output_dir "${MODEL_PATH}"
    --predict_dir "${PREDICTION_PATH}"
)

CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 run_finetune.py "${predict_args[@]}"

In [None]:
## Attention visualization

In [None]:
# Check the GPUs before launching the visualization run below.
!nvidia-smi

In [None]:
%%bash
# Run run_finetune.py in --do_visualize mode with the model stored under
# ./ft/$KMER. Three processes are launched with GPUs 0-2 visible.
export KMER=6
export MODEL_PATH="./ft/${KMER}"
export DATA_PATH="sample_data/ft/${KMER}"
export PREDICTION_PATH="./result/${KMER}"

# Script arguments are collected in an array so they can be grouped and annotated.
visualize_args=(
    # model, tokenizer and task
    --model_type dna
    --tokenizer_name "dna${KMER}"
    --fp16
    --model_name_or_path "${MODEL_PATH}"
    --task_name dnaprom
    # what to run
    --do_visualize
    --visualize_data_dir "${DATA_PATH}"
    --visualize_models "${KMER}"
    # data and batching
    --data_dir "${DATA_PATH}"
    --max_seq_length 300
    --per_gpu_pred_batch_size 8
    --n_process 96
    # output locations
    --output_dir "${MODEL_PATH}"
    --predict_dir "${PREDICTION_PATH}"
)

CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 run_finetune.py "${visualize_args[@]}"

In [None]:
!pip install seaborn

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Load the array stored at ./result/6/atten.npy.
# NOTE(review): presumably the attention scores written by the --do_visualize run
# above (its --predict_dir is ./result/6), one row per input sequence — confirm.
data = np.load('./result/6/atten.npy')

In [None]:
# Read the tab-separated dev.tsv file; its 'label' column is used below to
# select rows of the loaded array.
df = pd.read_csv('./sample_data/ft/6/dev.tsv', sep='\t')

In [None]:
# Wrap the loaded array in a DataFrame so a label column can be attached.
# Note: this rebinds `data` from a NumPy array to a DataFrame.
data = pd.DataFrame(data)

In [None]:
# Attach the labels as a new 'lbl' column. The assignment aligns on the row index,
# so this assumes atten.npy has one row per dev.tsv row, in the same order —
# TODO confirm (a length mismatch would silently produce NaN labels).
data['lbl'] = df['label']

In [None]:
# Keep only the rows labelled 1, drop the helper 'lbl' column and renumber the
# rows from 0 so the frame contains nothing but the loaded values.
true_df = (
    data.loc[data['lbl'] == 1]
    .drop(columns='lbl')
    .reset_index(drop=True)
)

In [None]:
# Display the per-column maxima of `data` (the 'lbl' column is included) as a
# quick look at the value range.
data.max()

In [None]:
# Heatmap of the label-1 rows: one heatmap row per DataFrame row, with a tick
# label every 50 columns and every 1000 rows. The axes and figure are passed and
# used explicitly instead of relying on pyplot's "current figure" state.
fig, ax = plt.subplots(figsize=(4, 5))
sns.heatmap(
    true_df,
    cmap="YlGnBu",
    xticklabels=50,
    yticklabels=1000,
    ax=ax,
)
fig.savefig('plot2.png')

In [None]:
# Line plot of the column-wise mean over the label-1 rows, drawn on the explicit
# axes and saved through the explicit figure handle.
fig, ax = plt.subplots(figsize=(4, 1))
column_means = true_df.mean(axis=0)
ax.plot(column_means)
fig.savefig('plot.png')