# Load Data

In [None]:
import pandas as pd
# Read the question workbook (header=1: column names come from the second sheet row),
# keep only the question text and its category label, and take the first 58 rows.
train_data = (
    pd.read_excel("/content/question_builder.xlsx", header=1)
    [["Regquestion2", "Category"]]
    .iloc[0:58]
)

In [None]:
# Display the loaded question/category table as a quick check of the load step.
train_data

Unnamed: 0,Regquestion2,Category
0,Where was _person_ born?,0
1,Show me photos of _person_.,2
2,Show me horror movies like _movie_.,1
3,What are some drama movies such as _movie_?,1
4,Show me images of the cast of _movie_,0
5,Show me movies like _,1
6,Show me movies similar to _movie_,1
7,What is the occupation of_person_?,0
8,What is the genre of _movie_?,3
9,Who is the director of _movie_?,3


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a third of the labelled questions for evaluation; the fixed
# random_state keeps the split reproducible across kernel restarts.
X_train, X_test = train_test_split(
    train_data,
    test_size=0.33,
    random_state=42,
)

In [None]:
import torch
from datasets import Dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)

training_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_test)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


def encode_examples(examples):
    """Tokenize every question in ``examples`` and attach its class label.

    Args:
        examples: iterable of records exposing ``'Regquestion2'`` (question
            text) and ``'Category'`` (integer class id).

    Returns:
        A list with one tokenizer encoding per record. Each encoding is
        padded/truncated to 128 tokens and carries an extra ``'labels'``
        entry holding the category as a ``torch.LongTensor``.
    """
    encoded = []
    for example in examples:
        features = tokenizer(
            example['Regquestion2'],
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=128,
        )
        # return_tensors="pt" on a single string yields tensors with a leading
        # batch axis of size 1; drop it here so each example is already a flat
        # per-sample tensor and batching does not rely on a later cleanup cell.
        for key in list(features.keys()):
            features[key] = features[key].squeeze(0)
        features['labels'] = torch.LongTensor([example['Category']])
        encoded.append(features)
    return encoded


# Same encoding for both splits, so train and eval cannot drift apart.
encoded_dataset = encode_examples(training_data)
eval_dataset = encode_examples(eval_data)

model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)
training_args = TrainingArguments(
    num_train_epochs=10,
    per_device_train_batch_size=4,
    output_dir='class-sent',
    logging_dir='logs',
    no_cuda=False,  # defaults to false anyway, just to be explicit
)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embedd

# Run Model

In [None]:
# Bundle the model, tokenizer, training arguments and both encoded splits into
# a Trainer. No compute_metrics callback is supplied.
# NOTE(review): training_args sets no evaluation strategy, so eval_dataset is
# presumably only used if trainer.evaluate() is called explicitly — confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
    train_dataset=encoded_dataset,
)

In [None]:
# Remove size-1 dimensions from every tensor of every encoded example, in
# place, for both the training and the evaluation split. The examples were
# tokenized one at a time with return_tensors="pt", which presumably leaves a
# leading batch axis of size 1 on each tensor.
for encoded_split in (encoded_dataset, eval_dataset):
    for example in encoded_split:
        for field in example:
            example[field] = example[field].squeeze()

In [None]:
# Run fine-tuning with the model, data and arguments the Trainer was built with.
trainer.train()

***** Running training *****
  Num examples = 38
  Num Epochs = 10
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 108314117
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=100, training_loss=0.6328202056884765, metrics={'train_runtime': 12.1241, 'train_samples_per_second': 31.342, 'train_steps_per_second': 8.248, 'total_flos': 24996223534080.0, 'train_loss': 0.6328202056884765, 'epoch': 10.0})

# Pushing to HuggingFace

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Authenticate this notebook session with the Hugging Face Hub so the model
# push below can use the stored token.
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
# Trainer.push_to_hub takes the commit message as its first argument, not a
# repository id, so a repo-like string passed there only ends up as commit text.
# The destination repository comes from the TrainingArguments: with
# output_dir='class-sent' and no hub_model_id, the push goes to <user>/class-sent.
# Pass a real commit message explicitly to avoid implying a different target.
trainer.push_to_hub(commit_message="Upload fine-tuned question classifier")

Cloning https://huggingface.co/mkorob/class-sent into local empty directory.
Saving model checkpoint to class-sent
Configuration saved in class-sent/config.json
Model weights saved in class-sent/pytorch_model.bin
tokenizer config file saved in class-sent/tokenizer_config.json
Special tokens file saved in class-sent/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 3.30k/413M [00:00<?, ?B/s]

Upload file training_args.bin: 100%|#########9| 3.30k/3.31k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mkorob/class-sent
   acb16cb..d78ddbf  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/mkorob/class-sent
   acb16cb..d78ddbf  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Text Classification', 'type': 'text-classification'}}
To https://huggingface.co/mkorob/class-sent
   d78ddbf..5ab258d  main -> main

   d78ddbf..5ab258d  main -> main



'https://huggingface.co/mkorob/class-sent/commit/d78ddbf60e895119e5412aaa9d880530e84b145f'

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification

# Load the checkpoint from the repository the Trainer actually pushed to
# (<user>/<output_dir>, i.e. "mkorob/class-sent") so that the predictions below
# come from the model trained in this notebook rather than an older upload.
HUB_REPO_ID = "mkorob/class-sent"
tokenizerClass = AutoTokenizer.from_pretrained(HUB_REPO_ID)
modelClass = AutoModelForSequenceClassification.from_pretrained(HUB_REPO_ID)
classSent = pipeline("text-classification", model=modelClass, tokenizer=tokenizerClass)

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--mkorob--results/snapshots/8b1e02b765fbe34f3c23c45f363055355444b280/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--mkorob--results/snapshots/8b1e02b765fbe34f3c23c45f363055355444b280/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--mkorob--results/snapshots/8b1e02b765fbe34f3c23c45f363055355444b280/tokenizer_config.json


Downloading:   0%|          | 0.00/955 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--mkorob--results/snapshots/8b1e02b765fbe34f3c23c45f363055355444b280/config.json
Model config BertConfig {
  "_name_or_path": "mkorob/results",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_cla

Downloading:   0%|          | 0.00/433M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--mkorob--results/snapshots/8b1e02b765fbe34f3c23c45f363055355444b280/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at mkorob/results.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


# Testing Predictions

In [None]:
# Convert the predicted label name back to its integer class id through the
# model config's label2id mapping. Reading one character at a fixed position
# only works for single-digit "LABEL_<n>" names and silently mis-parses ids >= 10.
top_prediction = classSent("show me Angelina Jolie")[0]
int(classSent.model.config.label2id[top_prediction['label']])

2