# Imports and inits

In [1]:
'''Installations'''

!pip install evaluate
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.7.0
  Downloading huggingface_hub-0.13

In [2]:
import numpy as np
import pandas as pd
import evaluate
import transformers
from tqdm import tqdm
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, TextClassificationPipeline 

In [3]:
'''For Google colab '''

# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
path = 'wantwords&opted/wantwords&opted.csv'

# Data init

In [5]:
data = pd.read_csv(path, dtype="string")
data['Definition'] = data['Definition'].astype(str)

In [6]:
data = data[:100]

In [7]:
'''Convert classes to numbers'''
word_dict = {} 
i = 0
for w in data['Word'].unique():
    word_dict[w] = i
    i += 1

'''Convert numbers back to words'''
idx2word = {v:k for k,v in word_dict.items()}

In [8]:
df_train, df_test = train_test_split(data[['Definition','Word']], test_size=0.2)

# Model and tokenizer init

In [9]:
nli_model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli', num_labels = len(word_dict), ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([10, 1024]) in the model instantiated
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# Train and test dataset preperation

In [10]:
train_enc = tokenizer(df_train['Definition'].to_list(), truncation=True, padding=True)


In [11]:
test_enc = tokenizer(df_test['Definition'].to_list(), truncation=True, padding=True)

In [12]:
df_train['Word'] = df_train['Word'].apply(lambda x: word_dict[x])

In [13]:
df_train['Word'].unique()

array([0, 4, 9, 5, 2, 3, 8, 7, 6, 1])

In [14]:
class RevDictDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.labels = self.labels.to_list()
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

In [15]:
'''One hot encoding of classes'''
label_enum = {k:j for j, k in enumerate(df_train['Word'].unique())}
num_labels = len(label_enum)
df_train['labels'] = df_train['Word'].apply(lambda x: [1.0 if label_enum[x]==i else 0.0 for i in range(num_labels)])

In [16]:
train_dataset = RevDictDataset(train_enc, df_train['labels'])

In [17]:
df_train['labels'].iloc[1]

[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

# Model training

In [18]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10
)
nli_model.config.problem_type = 'multi_label_classification'

In [19]:
trainer = Trainer(
    model=nli_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset           # evaluation dataset
)


In [20]:
trainer.train()



Step,Training Loss
10,0.7068
20,0.6293
30,0.5231
40,0.3843
50,0.2832
60,0.2739


TrainOutput(global_step=60, training_loss=0.4667580445607503, metrics={'train_runtime': 24.0636, 'train_samples_per_second': 9.974, 'train_steps_per_second': 2.493, 'total_flos': 29038155531840.0, 'train_loss': 0.4667580445607503, 'epoch': 3.0})

# Model Inference

In [26]:
pipe = TextClassificationPipeline(model=nli_model, tokenizer=tokenizer, return_all_scores=True, device=0)



In [27]:
'''Inference on a single sentence'''

output = pipe(['too much'])

In [28]:
scores = [a['score'] for a in output[0]]

In [29]:
pred_scores = [(scores[i] ,idx2word[i]) for i in range(len(scores))]
pred_scores.sort(reverse=True)
pred_scores

[(0.44116687774658203, 'pardoned'),
 (0.3226940929889679, 'berried'),
 (0.21313296258449554, 'sodomize'),
 (0.20543833076953888, 'excess'),
 (0.16950136423110962, 'roadrunner'),
 (0.154733806848526, 'trichromatic'),
 (0.13076969981193542, 'deaminase'),
 (0.11031726747751236, 'toppings'),
 (0.08632007241249084, 'fingerpicking'),
 (0.061041492968797684, 'luxuriousness')]