<a href="https://colab.research.google.com/github/parmarsuraj99/10DaysofMLChallenge/blob/master/notebopoks/COMP8730_proposed_solution_author_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.18.0
!pip install datasets==2.0.0
!pip install apache_beam==2.37.0
!pip install sentencepiece==0.1.96
!pip install wandb

In [None]:
# Fetch the project repository; later cells glob for checkpoints inside this clone.
!git clone https://github.com/parmarsuraj99/COMP8730_research_project

In [None]:
# Work from the repository root (Colab path); relative paths in later cells resolve here.
%cd /content/COMP8730_research_project

## Author prediction training

In [None]:
import random
# Seeds Python's built-in RNG only; NumPy and PyTorch are not seeded in this cell.
random.seed(0)

In [None]:
import gdown
# Download the shared Google Drive folder.
# NOTE(review): presumably this provides the `inltk_sanskrit_shlokas_dataset`
# directory that prepare_csv reads by default — confirm.
gdown.download_folder("https://drive.google.com/drive/folders/1vdloyc7skwlIAN5bEG7JdI4Pyu6JaXBU")

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def prepare_csv(file_dir="inltk_sanskrit_shlokas_dataset"):
    """Write the processed train/test CSVs used by the classification cells.

    Reads ``train.csv`` and ``valid.csv`` from ``file_dir``, integer-encodes the
    ``Class`` column with a ``LabelEncoder`` fitted on the training split,
    renames ``Class`` -> ``label`` and ``Sloka`` -> ``text``, collapses runs of
    spaces in ``text`` to a single space, and writes ``train_processed.csv`` and
    ``test_processed.csv`` to the current working directory.

    Args:
        file_dir: Directory that contains ``train.csv`` and ``valid.csv``.

    Returns:
        None. The results are the two CSV files written to disk.
    """
    column_names = {"Class": "label", "Sloka": "text"}

    train_raw = pd.read_csv(f"{file_dir}/train.csv")
    valid_raw = pd.read_csv(f"{file_dir}/valid.csv")

    # Fit on the training labels only and reuse that mapping for the validation
    # split, so both files share the same integer ids.
    enc = LabelEncoder()
    train_ = train_raw.assign(Class=enc.fit_transform(train_raw["Class"])).rename(
        columns=column_names
    )
    test_ = valid_raw.assign(Class=enc.transform(valid_raw["Class"])).rename(
        columns=column_names
    )

    # regex=True must be explicit: since pandas 2.0 the pattern is otherwise
    # taken literally and " +" would no longer collapse repeated spaces.
    train_["text"] = train_["text"].str.replace(" +", " ", regex=True)
    test_["text"] = test_["text"].str.replace(" +", " ", regex=True)

    train_.to_csv("train_processed.csv", index=False)
    test_.to_csv("test_processed.csv", index=False)

prepare_csv()

In [None]:
from datasets import load_dataset

import glob, os, gc
import numpy as np
from IPython.display import clear_output

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

from transformers import TrainingArguments, Trainer
import transformers
import numpy as np
import torch
from datasets import load_metric
import json


# Switch off Weights & Biases reporting for the Trainer runs below.
os.environ["WANDB_DISABLED"] = "true"
# Selects how much of the ALBERT encoder the fine-tuning loop freezes
# (the same value is assigned again in the next cell).
freeze_all=False

# Load the processed CSVs as "train" and "test" splits.
dataset = load_dataset('csv', data_files={"train": 'train_processed.csv', "test":"test_processed.csv"})

In [None]:
# False: the fine-tuning loop freezes all but the last five encoder parameter tensors.
# True: it freezes the whole ALBERT encoder.
freeze_all = False

In [None]:
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the metric called ``name``, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(name)
    return _METRIC_CACHE[name]


def _checkpoint_step(path):
    """Return the step number that ends a ``.../checkpoint-<step>`` path."""
    # Split from the right so a hyphen in a parent directory name cannot be
    # mistaken for the separator in front of the step number.
    return int(path.rsplit("-", 1)[-1])


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: ``(logits, labels)`` pair handed over by the ``Trainer``.

    Returns:
        Dict with weighted ``precision``, ``recall`` and ``f1`` plus ``accuracy``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {
        metric_name: _get_metric(metric_name).compute(
            predictions=predictions, references=labels, average="weighted"
        )[metric_name]
        for metric_name in ("precision", "recall", "f1")
    }
    scores["accuracy"] = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    return scores


# Fine-tune a 3-label sequence classifier from every pre-training checkpoint of
# both runs ("True" / "False") and store each run's scores in "<config_name>.json".
for postfix_ in ["True", "False"]:
    sorted_files = sorted(
        glob.glob(
            f"/content/COMP8730_research_project/COMP8730_NLPU/*/results_scratch_{postfix_}/checkpoint*"
        ),
        key=_checkpoint_step,
    )

    for checkpoint_path in sorted_files:
        # "<parent dir>_<checkpoint dir>", e.g. "results_scratch_True_checkpoint-<step>".
        op_dir = "_".join(checkpoint_path.split("/")[-2:])
        # Same name with hyphens replaced; used for the JSON result file.
        config_name = op_dir.replace("-", "_")

        gc.collect()

        tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
        # NOTE(review): num_labels=3 presumably equals the number of distinct
        # `Class` values in the dataset — confirm.
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint_path, num_labels=3
        )

        # Drop the loading output of the previous iteration from the cell.
        clear_output()
        gc.collect()

        def tokenize_function(examples):
            """Tokenize one example after stripping newlines and tabs from its text."""
            return tokenizer(
                examples["text"].replace("\n", "").replace("\t", ""),
                padding="max_length",
                truncation=True,
                max_length=128,
            )

        tokenized_datasets = dataset.map(tokenize_function)

        if not freeze_all:
            # Partial fine-tuning: freeze every encoder parameter except the last five.
            for name, param in list(model.albert.named_parameters())[:-5]:
                param.requires_grad = False
        else:
            # Freeze the whole encoder.
            model.albert.requires_grad_(False)

        training_args = TrainingArguments(
            output_dir=op_dir,
            num_train_epochs=10,
            do_train=True,
            do_eval=True,
            logging_strategy="epoch",
            optim="adamw_torch",
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=1e-6,
            evaluation_strategy="epoch",
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            data_seed=0,
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"],
            eval_dataset=tokenized_datasets["test"],
            compute_metrics=compute_metrics,
        )
        gc.collect()

        train_stats = trainer.train()
        gc.collect()

        eval_scores = trainer.evaluate()
        train_scores = trainer.evaluate(tokenized_datasets["train"])
        gc.collect()

        results_dict = {
            "train_stats": train_stats,
            "eval_scores": eval_scores,
            "train_scores": train_scores,
        }

        # Not written to the JSON file; kept so the last run's predictions and
        # targets can still be inspected interactively after the loop.
        eval_preds = trainer.predict(tokenized_datasets["test"])
        eval_tgt = np.array(tokenized_datasets["test"]["label"])

        with open(f"{config_name}.json", "w") as fp:
            json.dump(results_dict, fp)


In [None]:
import zipfile

In [None]:
# Bundle every JSON file in the working directory into one deflate-compressed archive.
with zipfile.ZipFile("results.zip", mode="w") as archive:
    for json_path in glob.glob("*.json"):
        archive.write(
            json_path,
            arcname=os.path.basename(json_path),
            compress_type=zipfile.ZIP_DEFLATED,
        )

In [None]:
def collect_eval_f1(pattern="*.json"):
    """Map each result file's base name to ``{"eval_f1": <score>}``.

    Only JSON files that hold a dict with an ``eval_scores`` -> ``eval_f1``
    entry are used. Any other JSON file matched by ``pattern`` is skipped, so
    unrelated JSON files in the same directory do not raise ``KeyError``.

    Args:
        pattern: Glob pattern of the files to inspect.

    Returns:
        Dict keyed by file name without directory and extension, in sorted
        path order.
    """
    collected = {}
    for path in sorted(glob.glob(pattern)):
        with open(path, "r") as fp:
            res = json.load(fp)

        eval_scores = res.get("eval_scores") if isinstance(res, dict) else None
        if not isinstance(eval_scores, dict) or "eval_f1" not in eval_scores:
            continue  # not a result file written by the fine-tuning loop

        run_name = os.path.splitext(os.path.basename(path))[0]
        collected[run_name] = {"eval_f1": eval_scores["eval_f1"]}
    return collected


results_dict = collect_eval_f1()

In [None]:
# Display the collected evaluation F1 per result file.
results_dict

In [None]:
# Transpose so each result file becomes a row with a single "eval_f1" column.
res_df = pd.DataFrame.from_dict(results_dict).T

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Results whose name contains "False" (the upload section publishes the
# results_scratch_False checkpoints as "sanbert-from-indicbert").
false_runs = res_df.loc[[c for c in res_df.index.to_list() if "False" in c]]
print(false_runs.mean())

fig, ax = plt.subplots(figsize=(10, 6))
false_runs.plot.barh(ax=ax)
ax.set(
    title='Eval F1 per checkpoint (result names containing "False")',
    xlabel="eval F1 (weighted)",
    ylabel="result file",
)
plt.show()

In [None]:
# Results whose name contains "True" (the upload section publishes the
# results_scratch_True checkpoints as "sanbert-from-scratch").
true_runs = res_df.loc[[c for c in res_df.index.to_list() if "True" in c]]
print(true_runs.mean())

fig, ax = plt.subplots(figsize=(10, 6))
true_runs.plot.barh(ax=ax)
ax.set(
    title='Eval F1 per checkpoint (result names containing "True")',
    xlabel="eval F1 (weighted)",
    ylabel="result file",
)
plt.show()

## Uploading to HF Hub

Optional: run this section only if you'd like to share your trained models on the Hugging Face Hub.

In [None]:
!pip install huggingface_hub
!sudo apt-get install git-lfs

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): no visible cell reads from /content/drive — confirm the mount is still needed.
drive.mount('/content/drive')

In [None]:
import random
# Re-seed Python's built-in RNG for the upload section.
random.seed(0)
import glob

from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub so the push_to_hub calls below can authenticate.
notebook_login()

In [None]:
# Push the latest masked-LM checkpoint of each pre-training run to the Hub.
hub_names = {"False": "sanbert-from-indicbert", "True": "sanbert-from-scratch"}

for postfix_ in ["True", "False"]:
    # NOTE(review): this pattern expects results_scratch_* directly under the
    # repository root, whereas the fine-tuning cell globs
    # COMP8730_NLPU/*/results_scratch_*/checkpoint* — confirm which layout is right.
    checkpoint_glob = (
        f"/content/COMP8730_research_project/results_scratch_{postfix_}/*/checkpoint*"
    )
    sorted_files = sorted(
        glob.glob(checkpoint_glob),
        # Split from the right so hyphens in parent directories are ignored.
        key=lambda x: int(x.rsplit("-", 1)[-1]),
    )
    if not sorted_files:
        # Fail with the offending pattern instead of a bare IndexError below.
        raise FileNotFoundError(f"No checkpoints matched {checkpoint_glob}")

    # The highest step number is the most recent checkpoint.
    latest_checkpoint = sorted_files[-1]
    print(latest_checkpoint)

    model_hub_name = hub_names[postfix_]

    tokenizer = AutoTokenizer.from_pretrained(latest_checkpoint)
    model = AutoModelForMaskedLM.from_pretrained(latest_checkpoint)

    model.push_to_hub(model_hub_name, use_temp_dir=True)
    tokenizer.push_to_hub(model_hub_name, use_temp_dir=True)