In [None]:
!pip install pandas==1.3.4
!pip install transformers==4.12.5
!pip install datasets==1.15.1
!pip install ipywidgets

In [None]:
import os
import pickle

from collections import Counter

# import pandas as pd
from sklearn.metrics import classification_report

import numpy as np
import torch
import torch.nn as nn

import transformers
from transformers import Trainer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding

import datasets
from datasets import Dataset
from datasets import ClassLabel
from datasets import load_metric

In [3]:
dataset_mlc = torch.load(os.path.join("/notebooks/Notebooks/general_notebooks/datasets", 'pe_dataset_for_MLC.pt'))

In [4]:
dataset_mlc

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'text', 'link_label', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'structural_fts_as_text', 'joined_label', 'strct_fts_w_linked', 'mc', 'cl', 'prem', 'link'],
        num_rows: 3769
    })
    test: Dataset({
        features: ['Unnamed: 0', 'essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'text', 'link_label', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'structural_fts_as_text', 'joined_label', 'strct_fts_w_linked', 'mc', 'cl', 'prem', 'link'],
        num_rows: 1138
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'text', 'link_label', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'structural_fts_as_text', 'joined_label', 'strct_fts_w_linked', 'mc', 'cl', 'pr

In [5]:
# what do we need: text 'mc', 'cl', 'prem', 'link'
# that's enough

In [6]:
dataset_mlc = dataset_mlc.remove_columns(['Unnamed: 0', 'essay_nr', 'starting_idx', 'component_label', 'ending_idx', 'link_label', 'split', 'essay', 'argument_bound_1', 'argument_bound_2', 'argument_id', 'label', 'joined_label', 'strct_fts_w_linked'])

In [7]:
dataset_mlc

DatasetDict({
    train: Dataset({
        features: ['text', 'structural_fts_as_text', 'mc', 'cl', 'prem', 'link'],
        num_rows: 3769
    })
    test: Dataset({
        features: ['text', 'structural_fts_as_text', 'mc', 'cl', 'prem', 'link'],
        num_rows: 1138
    })
    validation: Dataset({
        features: ['text', 'structural_fts_as_text', 'mc', 'cl', 'prem', 'link'],
        num_rows: 943
    })
})

In [8]:
dataset_mlc['train'][100]

{'text': 'criminal minds that breed violence not stringent gun control',
 'structural_fts_as_text': 'Topic: Gun control and increasing violence, Sentence: To some extent, I do not agree with this assertion because I believe that criminal minds that breed violence not stringent gun control, Para Number: 1, First in Para: Yes, Last in Para: Yes, Is in Introduction: Yes, Is in Conclusion: No',
 'mc': 1,
 'cl': 0,
 'prem': 0,
 'link': 0}

In [9]:
labels = [label for label in dataset_mlc['train'].features.keys() if label not in ['text', 'structural_fts_as_text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['mc', 'cl', 'prem', 'link']

In [10]:
import transformers
#from transformers import Trainer
from transformers import AutoTokenizer
#from transformers import BertForSequenceClassification
#from transformers import Trainer, TrainingArguments
#from transformers.data.data_collator import DataCollatorWithPadding

In [11]:
#from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # take a batch of texts
    text = examples["structural_fts_as_text"]
    # encode them
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=256)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

        encoding["labels"] = labels_matrix.tolist()

    return encoding

In [12]:
encoded_dataset = dataset_mlc.map(preprocess_data, batched=True, remove_columns=dataset_mlc['train'].column_names)



  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [13]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['attention_mask', 'input_ids', 'labels', 'token_type_ids'])


In [14]:
tokenizer.decode(example['input_ids'])

'[CLS] topic : gender equality at university admission, sentence : they want female candidates for soft natured work like counseling, teaching, designing etc, para number : 3, first in para : no, last in para : no, is in introduction : no, is in conclusion : no [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [15]:
example['labels']

[0.0, 0.0, 1.0, 0.0]

In [16]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['prem']

In [17]:
encoded_dataset.set_format("torch")

In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
device

device(type='cuda')

In [22]:
batch_size = 16
metric_name = "f1"
nr_epochs = 6
results_folder = "/notebooks/Notebooks/general_notebooks/bert-finetuned-pe-mlc"

In [23]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    
    output_dir=results_folder,          
    
    # params
    num_train_epochs=nr_epochs,               # nb of epochs
    per_device_train_batch_size=batch_size,   # batch size per device during training
    per_device_eval_batch_size=batch_size,    # cf. paper Sun et al.
    learning_rate=1e-5,#2e-5,                 # cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay
    
    # eval
    evaluation_strategy="steps",              # cf. paper Sun et al.
    eval_steps=20,                            # cf. paper Sun et al.
    
    # log
#     logging_dir="/notebooks/Results/bert_sequence_classification/tb_logs",  
#     logging_strategy='steps',
#     logging_steps=20,
    
    # save
    save_strategy='steps',
    save_total_limit=2,
    # save_steps=20, # default 500
    load_best_model_at_end=True,              # cf. paper Sun et al.
    # metric_for_best_model='eval_loss' 
    metric_for_best_model=metric_name
    
    
    
#     f"bert-finetuned-pe-mlc",
#     evaluation_strategy = "epoch",
#     save_strategy = "epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=5,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model=metric_name,
#     #push_to_hub=True,
)

In [24]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_macro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [25]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [26]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  8476,  1024,  5907,  9945,  2012,  2118,  9634,  1010,  6251,
         1024,  2027,  2215,  2931,  5347,  2005,  3730,  3267,  2094,  2147,
         2066, 17041,  1010,  4252,  1010, 12697,  4385,  1010, 11498,  2193,
         1024,  1017,  1010,  2034,  1999, 11498,  1024,  2053,  1010,  2197,
         1999, 11498,  1024,  2053,  1010,  2003,  1999,  4955,  1024,  2053,
         1010,  2003,  1999,  7091,  1024,  2053,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [27]:
#forward pass
# outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
# outputs

In [28]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [29]:
trainer.train()

***** Running training *****
  Num examples = 3769
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1416


Step,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
20,No log,0.663298,0.291411,0.605123,0.183457
40,No log,0.605629,0.197342,0.695544,0.559915
60,No log,0.553885,0.19501,0.692099,0.559915
80,No log,0.523985,0.193433,0.68985,0.560976
100,No log,0.518359,0.193559,0.690041,0.560976
120,No log,0.489292,0.210887,0.71253,0.551432
140,No log,0.454917,0.222187,0.713965,0.54825
160,No log,0.43931,0.218878,0.699921,0.481442
180,No log,0.427837,0.251668,0.693655,0.484624
200,No log,0.416832,0.256777,0.700316,0.505832


***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evaluation *****
  Num examples = 943
  Batch size = 16
***** Running Evalua

TrainOutput(global_step=1416, training_loss=0.34909905686890336, metrics={'train_runtime': 1444.8202, 'train_samples_per_second': 15.652, 'train_steps_per_second': 0.98, 'total_flos': 2975050125545472.0, 'train_loss': 0.34909905686890336, 'epoch': 6.0})

In [30]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 943
  Batch size = 16


{'eval_loss': 0.32787325978279114,
 'eval_f1': 0.7149863383079997,
 'eval_roc_auc': 0.8226943285631305,
 'eval_accuracy': 0.7104984093319194,
 'eval_runtime': 10.2099,
 'eval_samples_per_second': 92.361,
 'eval_steps_per_second': 5.779,
 'epoch': 6.0}

### inference on the test set

In [33]:
dataset_mlc['test'][500]

{'text': 'Using public transportation has a lot of advantages for the modern society facing a lot of problems: the environmental population, the isolation in life, the depletion of natural resources',
 'structural_fts_as_text': 'Topic: Public transportation keeps society from the depletion of natural resources, Sentence: Using public transportation has a lot of advantages for the modern society facing a lot of problems: the environmental population, the isolation in life, the depletion of natural resources, Para Number: 5, First in Para: Yes, Last in Para: Yes, Is in Introduction: No, Is in Conclusion: Yes',
 'mc': 1,
 'cl': 0,
 'prem': 0,
 'link': 0}

In [34]:
text = "Using public transportation has a lot of advantages for the modern society facing a lot of problems: the environmental population, the isolation in life, the depletion of natural resources"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [35]:
logits = outputs.logits
logits.shape

torch.Size([1, 4])

In [36]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

[]


In [31]:
from transformers.data.data_collator import DataCollatorWithPadding

In [32]:
test_trainer = Trainer(model, data_collator=DataCollatorWithPadding(tokenizer))
test_raw_preds, test_labels, _ = test_trainer.predict(encoded_dataset["test"])
# test_preds = np.argmax(test_raw_preds, axis=0)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 1138
  Batch size = 8


In [33]:
test_raw_preds.shape

(1138, 4)

In [34]:
test_labels

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [36]:
sigmoid = torch.nn.Sigmoid()

In [58]:
test_raw_preds[100]

NameError: name 'test_raw_preds' is not defined

In [43]:
try_pred = sigmoid(torch.tensor(test_raw_preds[0]))

In [44]:
try_pred

tensor([0.0738, 0.6383, 0.1192, 0.1353])

In [37]:
test_preds = sigmoid(torch.tensor(test_raw_preds))

In [38]:
test_preds

tensor([[0.0807, 0.6484, 0.1614, 0.2986],
        [0.7678, 0.2760, 0.0419, 0.0443],
        [0.0057, 0.1876, 0.8571, 0.3283],
        ...,
        [0.0104, 0.2795, 0.7419, 0.3969],
        [0.0072, 0.1583, 0.8489, 0.2116],
        [0.9015, 0.1502, 0.0491, 0.0482]])

In [39]:
# nice! now do the thresholding

In [40]:
predictions = np.zeros(test_preds.shape)
predictions[np.where(test_preds >= 0.5)] = 1
# turn predicted id's into actual label names
#predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
#print(predicted_labels)

In [41]:
predictions

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [42]:
from sklearn.metrics import classification_report

In [43]:
print(classification_report(test_labels, predictions, digits=3))

              precision    recall  f1-score   support

           0      0.763     0.964     0.852       140
           1      0.666     0.709     0.687       275
           2      0.944     0.869     0.905       723
           3      0.607     0.596     0.601       267

   micro avg      0.800     0.795     0.797      1405
   macro avg      0.745     0.784     0.761      1405
weighted avg      0.808     0.795     0.799      1405
 samples avg      0.832     0.816     0.815      1405



  _warn_prf(average, modifier, msg_start, len(result))


## task: do separate classification reports for separate classes:

In [52]:
test_labels

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]], dtype=float32)

In [53]:
predictions

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [54]:
test_labels.shape

(1138, 4)

In [55]:
predictions.shape

(1138, 4)

In [77]:
# separate test labels into separate task classes

In [63]:
# first three classes

test_labels[:,0:3]

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [64]:
# linked/not_linked class

In [69]:
test_labels[:,3]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [72]:
test_labels[:,3].shape

(1138,)

In [44]:
test_labels_comp_classes = test_labels[:,0:3]

In [45]:
test_labels_link_class = test_labels[:,3]

In [46]:
test_labels_comp_classes.shape

(1138, 3)

In [47]:
test_labels_link_class.shape

(1138,)

In [None]:
# separate test predictions into separate task classes

In [78]:
predictions[:,0:3]

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [79]:
predictions[:,3]

array([0., 0., 0., ..., 0., 0., 0.])

In [48]:
test_predictions_comp_classes = predictions[:,0:3]

In [49]:
test_predictions_link_class = predictions[:,3]

In [82]:
test_predictions_comp_classes.shape

(1138, 3)

In [83]:
test_predictions_link_class.shape

(1138,)

In [84]:
# now do two separate classification reports

In [50]:
# for comp classes

print(classification_report(test_labels_comp_classes, test_predictions_comp_classes, digits=3))

              precision    recall  f1-score   support

           0      0.763     0.964     0.852       140
           1      0.666     0.709     0.687       275
           2      0.944     0.869     0.905       723

   micro avg      0.844     0.842     0.843      1138
   macro avg      0.791     0.847     0.814      1138
weighted avg      0.855     0.842     0.846      1138
 samples avg      0.839     0.842     0.840      1138



  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
# for link class

print(classification_report(test_labels_link_class, test_predictions_link_class, digits=3))

              precision    recall  f1-score   support

         0.0      0.877     0.882     0.879       871
         1.0      0.607     0.596     0.601       267

    accuracy                          0.815      1138
   macro avg      0.742     0.739     0.740      1138
weighted avg      0.813     0.815     0.814      1138



In [None]:
# on the combined strct fts

In [6]:
dataset_mlc = torch.load(os.path.join("/notebooks/Notebooks/general_notebooks/datasets", 'pe_dataset_for_MLC.pt'))

In [11]:
str_try = dataset_mlc['train']['structural_fts_as_text'][0]

In [12]:
str_try

'Topic: Gender Equality at university admission, Sentence: They want female candidates for soft natured work like counseling, teaching, designing etc, Para Number: 3, First in Para: No, Last in Para: No, Is in Introduction: No, Is in Conclusion: No'

In [20]:
para_number = str_try.split("Para Number: ")[1].split(",")[0]

In [21]:
para_number

'3'

In [26]:
first_in_para = str_try.split("Para Number: ")[1].split(",")[1].split(" ")[4]

In [27]:
first_in_para

'No'

In [31]:
last_in_para = str_try.split("Para Number: ")[1].split(",")[2].split(" ")[4]

In [32]:
last_in_para

'No'

In [35]:
is_in_intro = str_try.split("Para Number: ")[1].split(",")[3].split(" ")[4]

In [36]:
is_in_intro

'No'

In [39]:
is_in_concl = str_try.split("Para Number: ")[1].split(",")[4].split(" ")[4]

In [40]:
is_in_concl

'No'

In [41]:
# ok nice. now concat them to get the combined thing.
# see how to do the new dataset column thing in hugging face
# then do it for this dataset
# then run it on this new combined struct fts sentence representation

In [42]:
dataset_mlc_old = torch.load(os.path.join("/notebooks/ICANN/Datasets", 'dataset_persuasive_essays_icann.pt'))

In [47]:
dataset_mlc_old['train']['topic_full_sentence_structural_fts_combined'][10]

"Topic: Should students be taught to compete or to cooperate? Sentence: Consequently, no matter from the view of individual development or the relationship between competition and cooperation we can receive the same conclusion that a more cooperative attitudes towards life is more profitable in one's success. Structural features: Four. No. Yes. Yes. Yes."

In [48]:
!pip install inflect

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting inflect
  Downloading inflect-5.6.0-py3-none-any.whl (33 kB)
Installing collected packages: inflect
Successfully installed inflect-5.6.0


In [49]:
import inflect

In [51]:
inflect.engine().number_to_words(45)

'forty-five'

In [56]:
new_str = "Structural Features: " + inflect.engine().number_to_words(para_number).title() + ". " + first_in_para + ". " + last_in_para + ". " + is_in_intro + ". " + is_in_concl

In [57]:
new_str

'Structural Features: Three. No. No. No. No'

In [None]:
# create this combined column
# then, run the same joint business w 48 batchsize, epoch = 6, learning_rate = 1e-5
# check task 3.1 from that other notebook