In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/news_tagging_model
# !mkdir news_tagging_model

/content/drive/MyDrive/news_tagging_model


In [3]:
# %cd news_tagging_model

In [4]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 29.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 7.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [5]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 31.8 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [6]:
import pandas as pd
df = pd.read_csv("prepared_data_selected_12_categories.csv")
df.category.value_counts()

news            80576
markets         45432
india           42177
sports          32252
world           30025
                ...  
tech-tips         266
cars              235
travel            152
brands            105
auto-special      103
Name: category, Length: 61, dtype: int64

## Run 2nd time

In [None]:
df

In [None]:
df_health = df[df['category'] == 'health']
df_health
df_not_health = df[df['category'] != 'health']
df_not_health_old = df_not_health.sample(n=len(df_health),random_state=1)
df_not_health = df_not_health[~df_not_health['Unnamed: 0'].isin(df_not_health_old['Unnamed: 0'].values)]
df_not_new = df_not_health.sample(n=len(df_health),random_state=1)
df_balanced = df_not_new.append(df_health)
df_balanced.to_csv("balanced.csv")

In [None]:
# df_balanced = df_not_health.append(df_health)
# df_balanced.to_csv("balanced.csv")

## Run training first time

In [14]:
df_health = df[df['category'] == 'health']
df_health
df_not_health = df[df['category'] != 'health']
df_not_health = df_not_health.sample(n=len(df_health),random_state=10)
df_balanced = df_not_health.append(df_health)
df_balanced.to_csv("balanced.csv")

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

import torch
from torch.utils.data import Dataset
# from omegaconf import DictConfig, OmegaConf
# import hydra
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# import wandb
import os
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [16]:
# print("Loading Model...")
# model = AutoModelForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels = 21)
# print("Loading Tokenizer...")
# tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

In [17]:
def compute_metrics(outs):
    predictions, labels = outs
    predictions = np.argmax(predictions, axis = -1)

    ## computes overall scores (accuracy, f1, recall, precision)
    accuracy = accuracy_score(labels, predictions) * 100
    f1 = f1_score(labels, predictions, average = "macro") * 100
    recall = recall_score(labels, predictions, average = "macro") * 100
    precision = precision_score(labels, predictions, average = "macro") * 100

    return {
        "accuracy" : float(accuracy),
        "f1" : float(f1),
        "recall" : float(recall),
        "precision" : float(precision),
    }



In [18]:
def encode_labels(labels):
  labels_set = set(labels)
  endcoded_labels = labels
  # counter = 0
  # for current_label in labels_set:
  for j in range(len(endcoded_labels)):
    # print(endcoded_labels[j] )
    if endcoded_labels[j] == 'health':
      endcoded_labels[j] = 1
    else:
      endcoded_labels[j] = 0
      # if endcoded_labels[j] == current_label:
      #   endcoded_labels[j] = counter
    # counter+=1
  return endcoded_labels

encode_labels(["x","health","y","x","z"])

[0, 1, 0, 0, 0]

In [19]:
def load_data(path):
    """
    read CSV file and return the tweets and labels lists
    """
    df = pd.read_csv(path)
    titles = df['title'].tolist()
    labels = encode_labels(df['category'].tolist())
    print("max(labels)")

    print(max(labels))
    return titles, labels

In [20]:
class MultiDialectDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [21]:
#load model and tokenizer
print("Loading Model...")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels = 21)
print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

# #freeze the model weights
# if cfg.freeze_all_model:
#   for p in  model.bert.parameters():
#       p.requires_grad = False

# elif cfg.freeze_only_embeddings:
#   for p in  model.bert.embeddings.parameters(): #freeze only the embeddings
#     p.requires_grad = False

#prepare data and dataset
print("Preparing Data...")
train_all_tweets, train_all_labels = load_data("balanced.csv")
test_tweets, test_labels = load_data("balanced.csv")


Loading Model...


Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Loading Tokenizer...


Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Preparing Data...
max(labels)
1
max(labels)
1


In [22]:
from sklearn.utils import class_weight
class_weights = dict(enumerate(class_weight.compute_class_weight('balanced',
                                                         classes=np.unique(train_all_labels),
                                                         y=train_all_labels)))
class_weights

{0: 1.0, 1: 1.0}

In [23]:
list(class_weights.values())

[1.0, 1.0]

In [24]:
# from torch import nn
# from transformers import Trainer

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# #device = 'cpu'
# device
# #class weights

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss (suppose one has 3 labels with different weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([0.5040613531845143, 62.05583832335329])).to(device)
#         loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)).to(device)
#         return (loss, outputs) if return_outputs else loss

In [None]:
#load model and tokenizer
print("Loading Model...")
model = AutoModelForSequenceClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels = 2)
print("Loading Tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("microsoft/mdeberta-v3-base")

# #freeze the model weights
# if cfg.freeze_all_model:
#   for p in  model.bert.parameters():
#       p.requires_grad = False

# elif cfg.freeze_only_embeddings:
#   for p in  model.bert.embeddings.parameters(): #freeze only the embeddings
#     p.requires_grad = False

#prepare data and dataset
# print("Preparing Data...")
# train_all_tweets, train_all_labels = load_data("/content/balanced.csv")
# test_tweets, test_labels = load_data("/content/balanced.csv")

#split the train_all to train and validation
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    train_all_tweets,
    train_all_labels,
    test_size=.25,
    random_state= 5)

#tokenize the data
print("Tokenizeing the inputs...")
train_encodings = tokenizer(train_tweets,
                            truncation=True,
                            padding=True,
                            #max_length=model.config.max_position_embeddings
                            )
val_encodings = tokenizer(val_tweets,
                          truncation=True,
                          padding=True,
                            #max_length=model.config.max_position_embeddings
                          )
# test_encodings = tokenizer(test_tweets,
#                               truncation=True,
#                               padding=True,
#                               #max_length=model.config.max_position_embeddings
#                               )


#creat the datasets
train_ds = MultiDialectDataset(train_encodings, train_labels)
val_ds = MultiDialectDataset(val_encodings, val_labels)
# test_ds = MultiDialectDataset(test_encodings, test_labels)

#training
print("Start the training...")
# training_args = TrainingArguments(
#     output_dir='./results',          # output directory
#     num_train_epochs=3,              # total number of training epochs
#     per_device_train_batch_size=8,  # batch size per device during training
#     per_device_eval_batch_size=20,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir='./logs',            # directory for storing logs
#     load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
#     # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
#     logging_steps=200,               # log & save weights each logging_steps
#     save_steps=200,
#     evaluation_strategy="steps",     # evaluate each `logging_steps`
# )
training_args = TrainingArguments(  
    output_dir = "./finetune_out/",
    overwrite_output_dir =  True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 8,
    learning_rate = 5e-5,
    dataloader_drop_last= True,
    load_best_model_at_end= True,
    metric_for_best_model = "eval_f1",
    greater_is_better = True,
    num_train_epochs= 10)

trainer = Trainer(
model=model,                         
args=training_args,                 
train_dataset=train_ds,         
eval_dataset=val_ds,
compute_metrics=compute_metrics        
)

# #Start Wandb
# if cfg.trainer_args.report_to == 'wandb':
#     wandb.init(**cfg.wandb)

trainer.train()
val_pred = trainer.predict(val_ds)

val_metrics = trainer.evaluate(val_ds)
# wandb.log(val_metrics)
print("val_metrics")
print(val_metrics)

# test_preds, test_out_ids, test_metrics = trainer.predict(test_ds)
# # wandb.log(test_metrics)
# print("test_metrics")
# print(test_metrics)
model.save_pretrained(os.path.join("./finetune_out/", 'best_ckpt'))
tokenizer.save_pretrained(os.path.join("./finetune_out/", 'best_ckpt'))

#Finish Wandb
# if cfg.trainer_args.report_to == 'wandb':
#     wandb.finish()




loading configuration file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/f65be399a70f256c97e2aa482762ad759aba17bc3e8061b0b182aa50cf4eb944.705d92ced627aac61154b0297e5afed9e440f848301a597859fad2335fb2739d
Model config DebertaV2Config {
  "_name_or_path": "microsoft/mdeberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_at

Loading Model...


Some weights of the model checkpoint at microsoft/mdeberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Loading Tokenizer...


loading file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/spm.model from cache at /root/.cache/huggingface/transformers/ac0b421fe9b4e93a88eec076f0f30566636e51fcb382d9b1a734b3a8d3a1b8cb.a89eb3c0add0e1b04b46be11a1bc1a65b92fdae1bbb04124701ff2e6acfccc75
loading file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/ac241776e043e9e864d27f9028ec7d7be9ff95a3ba243506edc8f02ccbdddcb3.df5a7f41459442f66bec27ac9352bba694cde109855024b3ae61be2f5734ee9a
loading configuration file https://huggingface.co/microsoft/mdeberta-v3-base/resolve/main/config.json from cac

Tokenizeing the inputs...


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Start the training...


***** Running training *****
  Num examples = 5010
  Num Epochs = 10
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 780


In [None]:
# # test_preds, test_out_ids, test_metrics = trainer.predict(val_ds)
# test_preds, test_out_ids, test_metrics = val_pred

In [None]:
# test_metrics

In [None]:
# print(confusion_matrix)
# confusion_matrix

In [None]:
# val_pred

In [None]:
# from sklearn import metrics
# confusion_matrix = metrics.confusion_matrix(val_pred, val_pred,y_type="binary", labels=[0, 1]) #labels=["neg", "pos"])
