## Experiments with teacher and student network

In [2]:
import os
import pathlib
from dotenv import load_dotenv
from datasets import Dataset, DatasetDict
import pandas as pd
from src.data.s3_communication import S3Communication
import config
from torch import cuda
import transformers
from pathlib import Path
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
device = 'cuda' if cuda.is_available() else 'cpu'
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch

In [3]:
# Load credentials
dotenv_dir = os.environ.get(
    "CREDENTIAL_DOTENV_DIR", os.environ.get("PWD", "/opt/app-root/src")
)
dotenv_path = pathlib.Path(dotenv_dir) / "credentials.env"
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [4]:
# init s3 connector
s3c = S3Communication(
    s3_endpoint_url=os.getenv("S3_ENDPOINT"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.getenv("S3_BUCKET"),
)

## Process dataset for sparseml training

In [5]:
s3c.download_files_in_prefix_to_dir(
    config.BASE_TRAIN_TEST_DATASET_S3_PREFIX,
    config.BASE_PROCESSED_DATA)

In [6]:
s3c.download_files_in_prefix_to_dir(
    config.BASE_TRAIN_TEST_DATASET_S3_PREFIX,
    config.BASE_PROCESSED_DATA)

In [7]:
test_data_path = str(config.BASE_PROCESSED_DATA)+'/rel_test_split.csv'
test_data = pd.read_csv(test_data_path, index_col=0)
test_data.rename(columns={'text': 'question', 'text_b':'sentence'}, inplace=True)

train_data_path = str(config.BASE_PROCESSED_DATA)+'/rel_train_split.csv'
train_data = pd.read_csv(train_data_path, index_col=0)
train_data.rename(columns={'text': 'question', 'text_b':'sentence'}, inplace=True)

train_data.to_csv(train_data_path)
test_data.to_csv(test_data_path)

In [8]:
trds = Dataset.from_pandas(train_data)
teds = Dataset.from_pandas(test_data.drop('label', axis=1))

climate_dataset = DatasetDict()

climate_dataset['train'] = trds
climate_dataset['test'] = teds

In [9]:
stubs = ['12layer_pruned80-none',
         '12layer_pruned90-none',
         '6layer_pruned90-none',
         '3layer_pruned90-none',]
         
## Add more above

## Check if it works or not using the following code

from sparsezoo import Model
stub = "zoo:nlp/masked_language_modeling/bert-base/pytorch/huggingface/wikipedia_bookcorpus/3layer_pruned90-none"
download_directory = "model_download_directory"

model = Model(stub, download_path = download_directory)

ModuleNotFoundError: No module named 'sparsezoo'

# Teacher model

In [61]:
!sparseml.transformers.text_classification --help

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
usage: sparseml.transformers.text_classification [-h] --model_name_or_path
                                                 MODEL_NAME_OR_PATH
                                                 [--config_name CONFIG_NAME]
                                                 [--tokenizer_name TOKENIZER_NAME]
                                                 [--cache_dir CACHE_DIR]
                                                 [--use_fast_tokenizer [USE_FAST_TOKENIZER]]
                                                 [--no_use_fast_tokenizer]
                                                 [--use_teacher_tokenizer [USE_TEACHER_TOKENIZER]]
                                                 [--model_revision MODEL_R

In [45]:
!sparseml.transformers.text_classification \
--model_name_or_path bert-base-uncased \
--train_file '/opt/app-root/src/data/processed/rel_train_split.csv' \
--validation_file '/opt/app-root/src/data/processed/rel_test_split.csv' \
--label_column_name 'label' \
--input_column_name 'question,sentence' \
--do_train --do_eval --evaluation_strategy epoch \
--per_device_train_batch_size 32 \
--learning_rate 5e-5 \
--max_seq_length 128 \
--output_dir models/teacher \
--num_train_epochs 8 \
--metric_for_best_model 'f1' \
--overwrite_output_dir \
--seed 2021

# Sparse student model

#### Change the model_name_or_path in the next cell based on the stubs we want to try

In [7]:
!sparseml.transformers.text_classification \
--model_name_or_path zoo:nlp/masked_language_modeling/bert-base/pytorch/huggingface/bookcorpus_wikitext/12layer_pruned80-none \
--distill_teacher models/teacher \
--train_file '/opt/app-root/src/data/processed/rel_train_split.csv' \
--validation_file '/opt/app-root/src/data/processed/rel_test_split.csv' \
--label_column_name 'label' \
--input_column_name 'question,sentence' \
--do_train \
--per_device_train_batch_size 16 \
--learning_rate 1e-4 \
--warmup_steps 11000 \
--output_dir models/12layer_pruned80-none \
--seed 11712 \
--num_train_epochs 50 \
--save_strategy epoch \
--save_total_limit 1 \
--metric_for_best_model 'f1' \
--overwrite_output_dir \
--recipe zoo:nlp/masked_language_modeling/bert-base/pytorch/huggingface/bookcorpus_wikitext/12layer_pruned80-none?recipe_type=transfer-QQP

2022-12-01 17:35:26 sparseml.transformers.text_classification INFO     Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
best_model_after_epoch=None,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
distill_teacher=models/teacher,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
label_names=None,
label_smoothing_factor=0.0,
learning

In [10]:
trds = Dataset.from_pandas(train_data)
teds = Dataset.from_pandas(test_data.drop('label', axis=1))

climate_dataset = DatasetDict()

climate_dataset['train'] = trds
climate_dataset['test'] = teds

In [17]:
def create_batches(data_df, tokenizer, batch_size=32):
    encoded_dataset = list()
    batch = list()
    for df, row in data_df.iterrows():
        if len(batch) < batch_size:
            batch.append([row['question'], row['sentence']])
        else:
            encoded_dataset.append(tokenizer(batch,
                                             truncation=True,
                                             return_tensors='pt',
                                             padding=True))
            batch = [[row['question'], row['sentence']]]

    if batch:
        encoded_dataset.append(tokenizer(batch,
                                         truncation=True,
                                         return_tensors='pt',
                                         padding=True))
    return encoded_dataset

def predict(encoded_dataset, model):
    outputs = list()
    for batch in encoded_dataset:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
            outs = model(input_ids=input_ids, attention_mask=attention_mask)
            outputs.extend(outs.logits.argmax(axis=1).tolist())
    return outputs

def get_model_f1score(model_path, test_data):
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    
    encoded_dataset = create_batches(test_data, tokenizer)
    test_data["pred"] = predict(encoded_dataset, model)
    
    groups = test_data.groupby("question")
    scores = {}
    for group, data in groups:
        pred = data.pred
        true = data.label
        scores[group] = {}
        scores[group]["accuracy"] = accuracy_score(true, pred)
        scores[group]["f1_score"] = f1_score(true, pred)
        scores[group]["recall_score"] = recall_score(true, pred)
        scores[group]["precision_score"] = precision_score(true, pred)
        scores[group]["support"] = len(pred)

    # kpi wise performance metrics
    scores_df = pd.DataFrame(scores)
    return scores_df.loc['f1_score'].mean()

In [18]:
get_model_f1score('models/teacher', test_data)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.9124404257977322

In [19]:
get_model_f1score('models/12layer_pruned80-none', test_data)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.8040874504003634

#### The student network accuracy is not close to the teacher model. We need to figure out why. Is it because the transfer recipe is qqp?