In [1]:
!nvidia-smi

Fri Dec 16 12:13:01 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 30%   37C    P0   107W / 350W |      0MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 30%   34C    P0   109W / 350W |      0MiB / 24576MiB |      0%      Defaul

In [1]:
import os

cuda_device = 0
os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

import random, torch, itertools

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Device: {}".format(device))

import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
from nltk.tokenize import word_tokenize

from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

Device: cuda


In [2]:
Coding_emotions = {
    "AN": "anger",
    "AP": "apprehension",
    "SD": "sadness",
    "CO": "confusion",
    "HA": "happiness",
}

Coding_numerosity = {
    1: "one",
    2: "two",
    3: "three",
    4: "four",
    5: "five",
    6: "six",
    7: "seven",
    8: "eight",
}

Coding_number = {
    "1": "individual",
    "2": "group",
    "3": "individual dead",
    "4": "group dead",
    "5": "individual imaginary",
    "6": "group imaginary",
    "7": "original form",
    "8": "changed form",
}

Coding_gendr = {
    "F": "female",
    "M": "male",
    "J": "joint",
    "I": "indefinite",
}

Coding_identity = {
    "F": "father",
    "M": "mother",
    "X": "parents",
    "B": "brother",
    "T": "sister",
    "H": "husband",
    "W": "wife",
    "A": "son",
    "D": "daughter",
    "C": "child",
    "I": "infant",
    "Y": "family member",
    "R": "reative",
    "K": "known",
    "P": "prominent",
    "O": "occupational",
    "E": "ethnic",
    "S": "stranger",
    "U": "uncertian",
}

Coding_age = {
    "A": "adult",
    "T": "teenager",
    "C": "child",
    "B": "baby",
}

#### Get Dremer-Bssed emotion from XLM ####
def get_Emotions(file="coded_dreams.xml", decode_characters=False):
    tree = ET.parse(file)
    root = tree.getroot()

    lst = []

    for collection in tqdm(root):

        gender = collection.findtext("sex")
        age    = collection.findtext("age")
        typ    = collection.findtext("type")
        name   = collection.findtext("name")
        idd    = collection.findtext("id")
        time   = collection.findtext("time")

        for dream in collection.findall("dream"):
            date   = dream.findtext("date")
            date   =  date if date != None else "Missing"
            number = dream.findtext("number")
            report = dream.findtext("report")

            try:
                n_wrds = len(word_tokenize(report))
            except:
                n_wrds = 0
            
            Char_Emot = {}
            for emot in dream.find("codings").findall("emot"): 
                E   = emot[0].text
                Chr = emot[1].text
                lcl_emot_lst = Char_Emot.get(Chr, [])
                lcl_emot_lst.append(E)
                Char_Emot[Chr] = lcl_emot_lst

            lst.append(
                    [
                    gender, age, typ, name, idd, time, 
                    date, number, report, n_wrds, 
                    Char_Emot
                    ]
            )
            
    return lst

def dict_to_text(emot_dct, decode_mode="Char"):
    fnl_str = "[STR]"
    
    if decode_mode=="Char":
        for k, v in emot_dct.items():
            emtion_lst = map(lambda e : Coding_emotions[e], v)
            ch = "dreamer" if k == "D" else get_full_Char(k)
            fnl_str = fnl_str+" The {} experienced {}.".format(ch, " and ".join(c))
        return fnl_str+"[END]"
    
    elif decode_mode=="Emot":
        emtion_lst = list(emot_dct.values())
        emtion_lst = list(itertools.chain.from_iterable(emtion_lst))
        emtion_lst = map(lambda e : Coding_emotions[e], emtion_lst)
        fnl_str = fnl_str+" The report contains {}.".format(" and ".join(emtion_lst))
        return fnl_str+"[END]"
    
    elif decode_mode=="EmotNn":
        emtion_lst = list(emot_dct.values())
        emtion_lst = list(itertools.chain.from_iterable(emtion_lst))
        emtion_lst = [
            "{} {}".format( 
                Coding_numerosity[emtion_lst.count(emtn_crnm)],
                Coding_emotions[emtn_crnm]
            ) 
            for emtn_crnm in Coding_emotions.keys()
            if emtion_lst.count(emtn_crnm) != 0 
        ]
        emtion_lst = " and ".join(emtion_lst)
        fnl_str = fnl_str+" The report contains {}".format(emtion_lst)
        return fnl_str+".[END]"
    
    else:
        print("No such method")
    

def get_full_Char(ch):
    if "+" not in ch:
        n, g, i, a = ch
        try:
            cmps_ch = " ".join(
                    [Coding_number[n], 
                     Coding_gendr[g], 
                     Coding_identity[i], 
                     Coding_age[a]]
            )
        except: 
            cmps_ch = "unknown character" 
    else:
        cmps_ch = ""
        for nmbr, c in enumerate(ch.split("+")):
            if nmbr != 0:
                cmps_ch = cmps_ch+" and the "
                
            # remove the sapce if there was between "+" and char
            c = c.replace(" ", "")
            
            # if its the dreameer again just call it...
            if c == "D": 
                cmps_ch = cmps_ch+"dreamer"
                
            # otherwise decode it
            else:
                n, g, i, a = c
                lcl_cmps_ch = " ".join(
                            [Coding_number[n], 
                             Coding_gendr[g], 
                             Coding_identity[i], 
                             Coding_age[a]]
                )
                cmps_ch = cmps_ch+lcl_cmps_ch
            
    return cmps_ch

## Collect Data

In [3]:
DECODE_MODE = "EmotNn"

dream_records_lst = get_Emotions(file="coded_dreams.xml")
dream_records = pd.DataFrame(
                    dream_records_lst, 
                    columns=[
                            "gender", "age", "type", "collection", "id", 
                            "time", "date", "number", "report", "# words", 
                            "Emotion_Set" 
                    ]
)

dream_records = dream_records[~dream_records["Emotion_Set"].isin([{}])].reset_index(drop=True)
print("Collected Reports: {}".format(len(dream_records)))

dream_records["Emotion_Text"] = [
    dict_to_text(d, decode_mode=DECODE_MODE) for d in dream_records["Emotion_Set"]
]

100%|████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 12.37it/s]

Collected Reports: 924





In [4]:
dream_records.sample(5)

Unnamed: 0,gender,age,type,collection,id,time,date,number,report,# words,Emotion_Set,Emotion_Text
646,F,Y,set,Hall/VdC Norms: Female,norms-f,1940s-1950s,Missing,357,I dreamt I was supposed to go fishing on a fai...,0,"{'D': ['HA'], '1FSA': ['SD']}",[STR] The report contains one sadness and one ...
147,F,YA,series,Barb Sanders: baseline,b-baseline,1960-1997,1996-10-28,3061,"The first moment I laid eyes on this white, la...",0,{'D': ['AP']},[STR] The report contains one apprehension.[END]
223,F,T,series,Bea 1: a high school student,bea1,2003-2005,08/20/2004 (age 15),181,I dreamed that Pauline and I were in American ...,0,{'D': ['AP']},[STR] The report contains one apprehension.[END]
241,F,T,series,Bea 1: a high school student,bea1,2003-2005,08/03/2005 (age 16),213,I dreamed that I was in some hospital waiting ...,0,"{'1MFA': ['AN'], 'D': ['SD']}",[STR] The report contains one anger and one sa...
101,F,YA,series,Barb Sanders: baseline,b-baseline,1960-1997,1992-04-18,2139,We are told to line up around the room and sta...,0,{'D': ['AN']},[STR] The report contains one anger.[END]


In [5]:
dream_records["Emotion_Text"][520]

'[STR] The report contains one anger and four apprehension and two confusion and two happiness.[END]'

In [6]:
dream_records["Emotion_Text"][316]

'[STR] The report contains one anger.[END]'

In [7]:
dream_records["Emotion_Text"][901]

'[STR] The report contains one anger and one happiness.[END]'

In [8]:
class YourDataSetClass(Dataset):
    
    """
    Creating a custom dataset for reading the dataset and
    loading it into the dataloader to pass it to the
    neural network for finetuning the model

    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        
        self.tokenizer   = tokenizer
        self.data        = dataframe
        self.source_len  = source_len
        self.summ_len    = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the length of dataframe"""

        return len(self.target_text)

    def __getitem__(self, index):
        """return the input ids, attention masks and target ids"""

        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        # cleaning data so as to ensure data is in string type
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.summ_len,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Function to be called for training with the parameters passed from main function

    """

    model.train()
    for _, data in enumerate(loader, 0):
        y = data["target_ids"].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype=torch.long)
        mask = data["source_mask"].to(device, dtype=torch.long)

        outputs = model(
            input_ids=ids,
            attention_mask=mask,
            decoder_input_ids=y_ids,
            labels=lm_labels,
        )
        loss = outputs[0]

        if _ % 3000 == 0:
            print("Epoch  {} | Loss {}".format(str(epoch), str(loss.item())))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validate(
    epoch, tokenizer, model, device, loader, repetition_penalty=2.5, length_penalty=1
):
    
    """
    Function to evaluate model for predictions

    """
    
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                  input_ids = ids,
                  attention_mask = mask, 
                  max_length=150, 
                  num_beams=2,
                  repetition_penalty=repetition_penalty, 
                  length_penalty=length_penalty, 
                  early_stopping=True
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]

            predictions.extend(preds)
            actuals.extend(target)
    
    print("Evualuation Completed")
    return predictions, actuals

def T5Trainer(
    dataframe, source_text, target_text, model_params, train_size=.8, save_model=False, output_dir="./outputs/"
):

    """
    T5 trainer

    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary.
    # Further this model is sent to device (GPU/TPU) for using the hardware.
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # Importing the raw dataset
    dataframe = dataframe[[source_text, target_text]]

    # Creation of Dataset and Dataloader
    train_dataset = dataframe.sample(frac=train_size, random_state=model_params["SEED"])
    val_dataset   = dataframe.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print(f"FULL Dataset: {dataframe.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")
    print(f"TEST Dataset: {val_dataset.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = YourDataSetClass(
        train_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = YourDataSetClass(
        val_dataset,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
    training_loader = DataLoader(training_set, **train_params)
    val_loader      = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    print("Start Training")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
    
    if save_model:
        print("Saving Model")
        # Saving the model after training
        path = os.path.join(output_dir, "model_files")
        model.save_pretrained(path)
        tokenizer.save_pretrained(path)

    # evaluating test dataset
    print("Initiating Validation")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        model_params_lst = ["-".join([k, str(v)]) for (k, v) in model_params.items()]
        model_params_lst = "_".join(model_params_lst)
        fl_name = "predictions_{}.csv".format(model_params_lst)
        final_df.to_csv(os.path.join(output_dir, fl_name), index=False)


In [10]:
# let's define model parameters specific to T5
model_params = {
    "Reports":DECODE_MODE,         # 
    "MODEL": "t5-large",           # model_type: t5-small/t5-base/t5-large
    "TRAIN_BATCH_SIZE": 4,         # training batch size
    "VALID_BATCH_SIZE": 4,         # validation batch size
    "TRAIN_EPOCHS": 10,            # number of training epochs
    "VAL_EPOCHS": 1,               # number of validation epochs
    "LEARNING_RATE": 1e-4,         # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 512, # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 50,  # max length of target text
    "SEED": 41,                    # set seed for reproducibility
}

In [11]:
# T5 accepts prefix of the task to be performed:
# Since we are summarizing, let's add summarize to source text as a prefix
dream_records["Training_text"] = "Label characters and emotions: " + dream_records["report"]

T5Trainer(
    dataframe=dream_records,
    source_text="Training_text",
    target_text="Emotion_Text",
    model_params=model_params,
    output_dir="outputs",
)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


FULL Dataset: (924, 2)
TRAIN Dataset: (739, 2)
TEST Dataset: (185, 2)

Start Training
Epoch  0 | Loss 10.576644897460938
Epoch  1 | Loss 0.22693918645381927
Epoch  2 | Loss 0.036941658705472946
Epoch  3 | Loss 0.16742733120918274
Epoch  4 | Loss 0.08933419734239578
Epoch  5 | Loss 0.021012742072343826
Epoch  6 | Loss 0.06732145696878433
Epoch  7 | Loss 0.048490289598703384
Epoch  8 | Loss 0.01919695921242237
Epoch  9 | Loss 0.037357158958911896
Initiating Validation
Evualuation Completed


In [12]:
model_params_lst = ["-".join([k, str(v)]) for (k, v) in model_params.items()]
model_params_lst = "_".join(model_params_lst)
model_results = pd.read_csv(
    "outputs/predictions_{}.csv".format(model_params_lst), 
)

In [13]:
model_results.sample(10)

Unnamed: 0,Generated Text,Actual Text
173,surprise. I could not move. the door was open....,[STR] The report contains one confusion.[END]
107,The report contains one sadness and one happin...,[STR] The report contains one happiness.[END]
128,mother was wearing blue jeans. Label character...,[STR] The report contains one happiness.[END]
61,The report contains one anger.? I think she sa...,[STR] The report contains one anger.[END]
83,The report contains one anger. we look in at a...,[STR] The report contains one anger.[END]
29,The film contains one happiness. The report co...,[STR] The report contains two happiness.[END]
71,"a hard, or a happy life."" The report contains ...",[STR] The report contains one apprehension and...
160,the story contains one anger and one happiness...,[STR] The report contains one happiness.[END]
182,The report contains one sadness and one confus...,[STR] The report contains one sadness and two ...
176,The report contains one apprehension. I was to...,[STR] The report contains one apprehension.[END]
