#Installing Libraries

In [1]:
!pip install torchmetrics datasets transformers[sentencepiece]
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##Data Prepration

In [2]:
import pickle
import os
from google.cloud import storage
from google.cloud.storage.blob import Blob
from google.colab import auth
import pandas as pd
import numpy as np
import json
import glob
import sklearn
from datetime import datetime
from pathlib import Path
from sklearn.model_selection import train_test_split
from datasets import Dataset, Value, ClassLabel, Features
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import torch
from datasets import load_metric


from transformers import create_optimizer
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from transformers import TFAutoModelForSequenceClassification
import evaluate
from transformers import DataCollatorWithPadding



pd.options.mode.chained_assignment = None 


project_id = 'training-datasets-284316'
auth.authenticate_user()
!gcloud config set project {project_id}



def download_file_from_gcp( source_file, destination_folder, 
                            gcp_project = "training-datasets-284316",
                            gcp_bucket_name = "cs-dataset-classifier"
                        ):
    storage_client = storage.Client(gcp_project)
    bucket = storage_client.get_bucket(gcp_bucket_name)
    blob = bucket.blob(source_file)
    destination_file = Path(destination_folder) / Path(source_file).name

    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)
    blob.download_to_filename(destination_file)

    return str(destination_file)



def upload_blob(source_file_name, 
                destination_blob_name, 
                gcp_project = "training-datasets-284316", 
                gcp_bucket_name = "cs-dataset-classifier"):

    storage_client = storage.Client(gcp_project)
    bucket = storage_client.get_bucket(gcp_bucket_name)
    blob = Blob(destination_blob_name, bucket)
    blob.upload_from_filename(source_file_name)




def download_folder_from_gcp(
                             gcp_project = "training-datasets-284316",
                             gcp_bucket_name = "cs-dataset-classifier",
                             gcp_path = "dictionaries"
                            ):

    if not os.path.exists(gcp_path):
        os.makedirs(gcp_path)

    if gcp_path[-1] != "/":
        gcp_path += "/"

    storage_client = storage.Client(gcp_project)
    bucket = storage_client.get_bucket(gcp_bucket_name)
    blobs=list(bucket.list_blobs(prefix=gcp_path, delimiter="/"))
    
    print("Copying files from gcp bucket : " + gcp_bucket_name)
    
    for blob in blobs:
        if(not blob.name.endswith("/")):
            print (blob.name)
            blob.download_to_filename(blob.name)


def save_model(save_model_path):

    if not os.path.exists(save_model_path):
        os.mkdir(save_model_path)

    for f in os.listdir(save_model_path):
        os.remove(os.path.join(save_model_path, f))

    model.save_pretrained(save_model_path)


Updated property [core/project].


# Hyperparameters

In [3]:
SAVE_MODEL_PATH = "./best_model"

TEST_SIZE = 0.20
CHECK_POINT = "distilbert-base-uncased"
BATCH_SIZE = 16
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
SAMPLE_NUM = 50000

#Load `dataset_with_clusters.csv` gained from previous step

In [4]:
path_to_config = "tmp/dataset_with_clusters.csv"
gcp_project = "training-datasets-284316"
gcp_bucket_name = "cs-dataset-classifier"

config_file = download_file_from_gcp(
    source_file = path_to_config, 
    destination_folder = ".", 
    gcp_project=gcp_project, 
    gcp_bucket_name=gcp_bucket_name) 

#Reading CSV and Prepration

In [20]:
df = pd.read_csv("dataset_with_clusters.csv")
df = df[['abstract','cluster_label']]
df.columns = ["text", "label"]
df = df.replace(np.nan, "")
df["text"] = df["text"].apply(lambda x: x.strip())
df["label"] = df["label"].apply(lambda x: "cluster_"+str(x+1))
df = df.sample(frac=1)[:SAMPLE_NUM].reset_index(drop=True)

df

Unnamed: 0,text,label
0,In Split Supersymmetry scenarios the possibili...,cluster_19
1,We present a complete microscopic Fermi-liquid...,cluster_9
2,The paper determines the anomalous magnetic mo...,cluster_9
3,The superspace formulation of N=1 conformal su...,cluster_2
4,"Here, we demonstrate that polarization propert...",cluster_7
...,...,...
49995,We measure the electrical resistance of o-TaS3...,cluster_3
49996,We study a two-dimensional two-component Fermi...,cluster_9
49997,We propose a general mechanism for renormaliza...,cluster_12
49998,We construct a local on-shell invariant in D=1...,cluster_2


#

#Creating Classes

In [26]:
classes_names = list(df["label"].unique())
classes_dict = dict(enumerate(classes_names))
classes_names = list(classes_names)

features = Features({"text": Value("string"), "label": ClassLabel(num_classes=len(classes_names), names=classes_names)})

features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['cluster_19', 'cluster_9', 'cluster_2', 'cluster_7', 'cluster_8', 'cluster_15', 'cluster_5', 'cluster_20', 'cluster_3', 'cluster_18', 'cluster_11', 'cluster_6', 'cluster_13', 'cluster_17', 'cluster_14', 'cluster_1', 'cluster_4', 'cluster_12', 'cluster_16', 'cluster_10'], id=None)}

#Dataset Divition

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["label"], test_size = TEST_SIZE)

train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

data_files = {"train": "train.csv", "test":"test.csv"}
datasets = load_dataset("csv", data_files=data_files, features=features)

datasets

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-38fe106e88f767c5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-38fe106e88f767c5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

#Tokenization

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
if tokenizer.is_fast:
    print("Tokenizer is Fast")

Tokenizer is Fast


In [29]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [30]:
tokenized_datasets = datasets.map(preprocess_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [36]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

#Data Collator and Model Loading

In [43]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
accuracy = evaluate.load("accuracy")

batches_per_epoch = len(tokenized_datasets["train"]) // BATCH_SIZE
total_train_steps = int(batches_per_epoch * NUM_EPOCHS)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

id2label = {k:v for k,v in classes_dict.items()}
label2id = {v:k for k,v in classes_dict.items()}

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label2id), id2label=id2label, label2id=label2id
)

#Dataset Prepration and Compiling

In [50]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_datasets["test"],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator,
)


model.compile(optimizer=optimizer)
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
callbacks = [metric_callback]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


#Training the Model

In [55]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=NUM_EPOCHS, callbacks=callbacks)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f44342d0be0>