In [1]:
import sys
# Record the interpreter this notebook was executed with (human-readable
# version string first, then the structured version tuple).
print("Python version", sys.version, sep="\n")
print("Version info.", sys.version_info, sep="\n")

Python version
3.9.12 (main, Jun  1 2022, 11:38:51) 
[GCC 7.5.0]
Version info.
sys.version_info(major=3, minor=9, micro=12, releaselevel='final', serial=0)


In [2]:
import torch, os
import pandas as pd
import transformers
from tqdm import tqdm
from transformers import AutoModel
from transformers import AutoConfig
from transformers import BertTokenizerFast
from SL_utils import *

# Restrict this process to physical GPU 5. Environment values must be strings;
# this has to be set before the first CUDA call for it to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

In [3]:
# Two-letter emotion codes mapped to their readable names. The insertion order
# of this dict fixes the order of the label vector / prediction columns below.
Coding_emotions = {
    "AN": "Anger",
    "AP": "Apprehension",
    "SD": "Sadness",
    "CO": "Confusion",
    "HA": "Happiness",
}

# Ordered list of emotion codes (one entry per class).
emotions_list = list(Coding_emotions)

# Example dream reports to run through the model.
test_sentences = [
    "In my dream I was follwed by the scary monster.",
    "I was walking in a forest, sorrounded by singing birds. I was calm and at peace."
]

# All-zero placeholder label vectors, one per report and one slot per emotion.
# Each row is built as its own list: `n * [[0, ...]]` would repeat a single
# list object, so mutating one row would silently change every row.
# NOTE(review): presumably only needed because the dataset class expects a
# label column even at inference time -- confirm against CustomDataset.
test_sentences_target = [[0] * len(emotions_list) for _ in test_sentences]

# One row per report: the raw text plus its (placeholder) multilabel target.
test_sentences_df = pd.DataFrame(
    {
        "report": test_sentences,
        "Report_as_Multilabel": test_sentences_target,
    }
)

In [4]:
# Display the input frame: one row per dream report with its all-zero
# placeholder label vector.
test_sentences_df

Unnamed: 0,report,Report_as_Multilabel
0,In my dream I was follwed by the scary monster.,"[0, 0, 0, 0, 0]"
1,"I was walking in a forest, sorrounded by singi...","[0, 0, 0, 0, 0]"


# Main Model 

In [5]:
# Base encoder: its config and (cased) tokenizer are loaded by name.
model_name   = "bert-large-cased"
model_config = AutoConfig.from_pretrained(model_name)
tokenizer    = BertTokenizerFast.from_pretrained(model_name, do_lower_case=False)

# Wrap the example reports in the project dataset (512 = max token length
# passed to the dataset).
testing_set  = CustomDataset(test_sentences_df, tokenizer, max_length=512)

# Inference loader: no shuffling, so predictions come back in the same order
# as the rows of `test_sentences_df` and runs are reproducible.
test_params = {
    'batch_size': 2,
    'shuffle': False,
    'num_workers': 0
}

testing_loader  = DataLoader(testing_set, **test_params)

# Classifier with one output per emotion code.
model = BERT_PTM(
    model_config,
    model_name=model_name, 
    n_classes=len(emotions_list), 
    freeze_BERT=False,
)

# Load the fine-tuned weights onto CPU first: without `map_location`, tensors
# are restored to the device they were saved from, which can fail when that
# GPU index is not visible to this process. The model is moved to GPU after.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load("model/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
model.to("cuda")

# Inference only: switch off dropout and other train-time behaviour.
model.eval()

print("Collecting Predictions")

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Collecting Predictions


In [6]:
# Run the model over the test loader; with return_inputs=True the helper also
# hands back the input ids so each prediction can be paired with its text.
# NOTE(review): `outputs` are presumably per-class scores in [0, 1] -- confirm
# against `validation` in SL_utils.
outputs, targets, ids = validation(
    model,
    testing_loader,
    device="cuda",
    return_inputs=True,
)

# Binarise the scores: an emotion counts as present when its score is >= 0.5.
corr_outputs = np.array(outputs) >= 0.5

# One 0/1 column per emotion code.
corr_outputs_df = pd.DataFrame(corr_outputs, columns=emotions_list).astype(int)

# Turn the returned token ids back into text and attach it to each row.
decoded_ids = [decode_clean(token_ids, tokenizer) for token_ids in tqdm(ids)]
corr_outputs_df["report"] = decoded_ids

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 863.11it/s]


In [9]:
# Display the thresholded predictions: one 0/1 column per emotion code plus
# the decoded report text.
corr_outputs_df

Unnamed: 0,AN,AP,SD,CO,HA,report
0,0,1,0,0,0,In my dream I was follwed by the scary monste...
1,0,0,0,0,1,"I was walking in a forest, sorrounded by sing..."


# 🤗 Models

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub checkpoint used for the pipeline-based example below.
# NOTE: this rebinds `model_name`, `tokenizer` and `model`, replacing the
# objects created in the "Main Model" section above.
model_name = "DReAMy-lib/bert-base-cased-DreamBank-emotion-presence"

# Tokenizer and sequence-classification model resolved from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [15]:
from transformers import pipeline

# Example dream reports for the pipeline demo.
test_dreams = [
    "In my dream, I was followed by the scary monster.",
    "I was walking in a forest, surrounded by singing birds. I was calm and at peace.",
    "I dreamed that a my mother was giving me a sad news, but I was confused by it. ",
]

# Text-classification pipeline around the model/tokenizer loaded above.
# top_k=None asks for the score of every label rather than only the best one.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
)

# One list of {label, score} dicts per input report.
predictions = classifier(test_dreams)

In [16]:
# Display the raw pipeline output: for each dream, a list of
# {'label', 'score'} dicts covering all five emotion codes.
predictions

[[{'label': 'AP', 'score': 0.8697441816329956},
  {'label': 'CO', 'score': 0.1245221346616745},
  {'label': 'HA', 'score': 0.025534192100167274},
  {'label': 'AN', 'score': 0.015074575319886208},
  {'label': 'SD', 'score': 0.010451494716107845}],
 [{'label': 'HA', 'score': 0.9519748091697693},
  {'label': 'AP', 'score': 0.07662183046340942},
  {'label': 'SD', 'score': 0.042797815054655075},
  {'label': 'CO', 'score': 0.02953989803791046},
  {'label': 'AN', 'score': 0.008983743377029896}],
 [{'label': 'CO', 'score': 0.9686605334281921},
  {'label': 'SD', 'score': 0.4479924738407135},
  {'label': 'AP', 'score': 0.06535966694355011},
  {'label': 'HA', 'score': 0.03700108453631401},
  {'label': 'AN', 'score': 0.015028676018118858}]]