# Connect to Drive


# Fetching the repository

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/phamvlap/en-text-sum-fine-tuned-bart.git

In [None]:
# Switch the notebook's working directory to the cloned repository so the
# relative paths used by later cells (requirements.txt, config/, run_*.py) resolve.
%cd en-text-sum-fine-tuned-bart

# Set up

In [None]:
# !pip uninstall torchmetrics -y
!pip install -r requirements.txt
!pip install datasets

In [None]:
import os

# Export the Hugging Face account name and access token as environment variables.
# Both values are placeholders and must be replaced before running.
# NOTE(review): presumably read by the training script when `--push_to_hub` is set —
# confirm; avoid saving a real token in the notebook file.
os.environ.update(
    {
        'HUGGINGFACE_USERNAME': '<HUGGINGFACE_USERNAME>',
        'HUGGINGFACE_TOKEN': '<HUGGINGFACE_TOKEN>',
    }
)

# Loading dataset

In [None]:
import pandas as pd
from datasets import load_dataset

dataset = load_dataset('phamvlap/wikihow')

df = pd.DataFrame()

for split in dataset:
    df_split = pd.DataFrame(dataset[split])
    df = pd.concat([df, df_split])

!mkdir dataset

df.to_csv('dataset/raw.csv')
dataset['train'].to_csv('dataset/train.csv')
dataset['validation'].to_csv('dataset/val.csv')
dataset['test'].to_csv('dataset/test.csv')

# Config

In [None]:
# Copy the base config to config/setting_config.yaml.
# NOTE(review): presumably the working copy that the run_* scripts read and update — confirm.
!cp config/config.yaml config/setting_config.yaml

# Prepare dataset

# Train tokenizer

In [None]:
# Run run_training_tokenizer.py with a shared vocabulary, the `byte_level_bpe`
# model type, a vocabulary size of 50265 and a minimum frequency of 2.
# The vocabulary size is the same value later passed to run_training_model.py.
# NOTE(review): input corpus and output location are not given here; presumably
# taken from config/setting_config.yaml — confirm.
!python run_training_tokenizer.py \
    --shared_vocab \
    --vocab_size=50265 \
    --min_freq=2 \
    --model_type=byte_level_bpe \
    --show_progress

# Split dataset

In [None]:
# Run run_splitting_dataset.py with a 0.75 / 0.10 / 0.15 train/val/test ratio
# (the three sizes sum to 1.0), a sequence length of 512, and truncation of
# over-length examples enabled.
# NOTE(review): how this interacts with the train/val/test CSVs already written
# to dataset/ is not visible here — confirm which files the script reads and writes.
!python run_splitting_dataset.py \
    --truncate_exceeded_length \
    --seq_length=512 \
    --train_size=0.75 \
    --val_size=0.1 \
    --test_size=0.15
    # Optional flags, currently disabled (not part of the command above):
    # --sampling \
    # --num_samples \

# Train model

In [None]:
# Launch run_training_model.py with the model-architecture, optimisation,
# evaluation, checkpointing, Weights & Biases and Hub flags listed below.
# - `--vocab_size=50265` is the same value passed to run_training_tokenizer.py.
# - `<wandb_key>` is a placeholder; supply a real key before running and do not
#   save it in the notebook.
# - The `# --flag` lines at the end are commented out, so those options are not
#   passed to the script.
# NOTE(review): `--lr=0.3` is combined with `--lr_scheduler='noam'`; presumably the
# scheduler rescales this value rather than using it as the raw step size — confirm.
# NOTE(review): `--resume_from_checkpoint` is set; confirm the script's behavior
# when the `checkpoints` directory is empty on a first run.
!python run_training_model.py \
    --vocab_size=50265 \
    --d_model=768 \
    --encoder_layers=6 \
    --decoder_layers=6 \
    --encoder_attention_heads=12 \
    --decoder_attention_heads=12 \
    --encoder_ffn_dim=3072 \
    --decoder_ffn_dim=3072 \
    --activation_function='gelu' \
    --dropout=0.3 \
    --attention_dropout=0.3 \
    --activation_dropout=0.3 \
    --classifier_dropout=0.0 \
    --max_position_embeddings=1024 \
    --encoder_layerdrop=0.1 \
    --decoder_layerdrop=0.1 \
    --num_beams=4 \
    --checkpoint_dir='checkpoints' \
    --model_basename='bart_model' \
    --model_name_or_path='facebook/bart-base' \
    --batch_size_train=48 \
    --batch_size_val=16 \
    --shuffle_dataloader \
    --num_workers=2 \
    --optimizer='adamw' \
    --lr=0.3 \
    --betas='0.9,0.999' \
    --eps=1e-5 \
    --weight_decay=0.001 \
    --lr_scheduler='noam' \
    --warmup_steps=400 \
    --T_0=10 \
    --T_mult=2 \
    --eta_min=1e-5 \
    --label_smoothing=0.1 \
    --epochs=3 \
    --eval_every_n_steps=300 \
    --save_every_n_steps=1000 \
    --max_grad_norm=1.0 \
    --f16_precision \
    --max_eval_steps=50 \
    --max_train_steps=-1 \
    --max_saved_checkpoints=3 \
    --src_seq_length=512 \
    --tgt_seq_length=512 \
    --use_stemmer \
    --accumulate='avg' \
    --beam_size=4 \
    --topk=3 \
    --resume_from_checkpoint \
    --logging_wandb \
    --wandb_project_name='en-text-sum-fine-tuned-bart' \
    --wandb_log_dir='wandb-logs' \
    --wandb_key='<wandb_key>' \
    --push_to_hub \
    --hub_repo_name='text-summarization-finetuned-bart' \
    # --scale_embedding \
    # --resume_from_id='' \
    # --log_examples \
    # --logging_steps=10 \
    # --attach_text \
    # --show_eval_progress \