In [1]:
! nvidia-smi # shell command to check the status of the GPUs

Fri Nov 11 09:12:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 30%   34C    P0   106W / 350W |      0MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 30%   32C    P0   110W / 350W |      0MiB / 24576MiB |      0%      Defaul

In [2]:
import os, random, xmltodict, json
import xml.etree.ElementTree as ET

# NOTE(review): the nvidia-smi listing above is cut off after GPU 1 — confirm that index 5 exists on this machine.
cuda_device = 5 # which GPU to use
# This is set before `torch` is imported (next cell).
os.environ["CUDA_VISIBLE_DEVICES"]= str(cuda_device) # restrict the visible GPU devices to the selected one

In [3]:
# "Basic" py library
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split 
from nltk.tokenize import word_tokenize

# visualisation
from tqdm import tqdm  # generats progress bar to controll steps

# ML py
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import torch # Pytorch, Meta's library for ML
import torch.nn as nn # Pt module for neural networks 

import transformers # HuggingFace library to use pretrained models
from transformers import BertTokenizer, BertModel

In [4]:
def set_seed(seed: int):
    """Helper function for reproducible behavior: seeds ``random``, ``numpy``,
    ``torch`` (CPU and every CUDA device) and, if it is installed, ``tf``.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    from importlib.util import find_spec

    random.seed(seed)
    np.random.seed(seed)
    # ``torch`` is imported unconditionally at the top of the notebook, so no
    # availability check is needed before seeding it.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available
    if find_spec("tensorflow") is not None:
        # Only import (and seed) TensorFlow when the package can be found.
        import tensorflow as tf

        tf.random.set_seed(seed)
        
def get_encoding(sq, tokenizer, model, idx=0, layer=-1, device="cuda", truncate=False):
    """
    Given a sequence, a model (and its tokenizer), extract the encoding of the sequence.

    idx 0 --> CLS
    idx 1 --> token 1
    idx 2 --> token 2

    Args:
        sq: the text handed to ``tokenizer``.
        tokenizer: called as ``tokenizer(sq, truncation=truncate, return_tensors='pt')``.
        model: called with the tokenizer output and ``output_hidden_states=True``.
            The model is NOT moved by this function, so it must already be on ``device``.
        idx: position (along the token axis) of the vector to return.
        layer: index into ``outputs.hidden_states`` (default ``-1``, the last entry).
        device: device the tokenized inputs are moved to.
        truncate: forwarded to the tokenizer as its ``truncation`` argument.

    Returns:
        A numpy array with the selected vector of the first sequence in the batch.
    """
    inputs = tokenizer(sq, truncation=truncate, return_tensors='pt')
    inputs = inputs.to(device)
    # Inference only: do not build an autograd graph that would be thrown away anyway.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[layer]
    vector = hidden_states[0, idx, :]  # [0, 0, :] is the CLS position
    return vector.detach().cpu().numpy()

def get_norm_reports():
    """Load the reports stored in the ``norms-m`` and ``norms-f`` folders.

    Both folders are looked up in the current working directory, and every
    entry they contain is read as a text file.

    Returns:
        A DataFrame with one row per file and two columns: "Report" (the file
        content) and "Gender" ("m" or "f", taken from the folder name).
    """
    records = []
    for gender in ["m", "f"]:
        folder = os.path.join(os.getcwd(), "norms-{}".format(gender))
        # os.listdir returns entries in arbitrary order; sort them so that the
        # row order is the same on every run and machine.
        for fl in sorted(os.listdir(folder)):
            # Context manager closes each file as soon as it has been read.
            with open(os.path.join(folder, fl), "r") as report_file:
                records.append([report_file.read(), gender])

    return pd.DataFrame(records, columns=["Report", "Gender"])

def underscore_label(ch_sq):
    """Insert an underscore between the first character of ``ch_sq`` and the rest.

    Example: ``"1MKA"`` becomes ``"1_MKA"``.
    """
    chars = list(ch_sq)
    head, tail = chars[0], chars[1:]
    return head + "_" + "".join(tail)

In [5]:
# set the random seed for `random`, `numpy` and `torch` (see set_seed)
seed = 31
set_seed(seed)

# Table of Contents

click on the titles to reach the described sections

---------
[Loading and Scraping Data](#intro)

[Dreams Encoding](#encoding)

[Get PCA / t-SNE](#dimred)

The notebook is limited to data collection. First, the data (i.e., the general and dreamer-based emotions) are scraped from the original `.xml` file, converted and saved as a `.csv` file. Afterwards, a `BERT-large-cased` model is used to collect the encodings of each report, which are then stored as a `numpy` array. Lastly, the encodings are used to obtain a reduction of the reports' space with the PCA and t-SNE methods.

## Loading and scraping data<a id='intro'></a>

In [6]:
#### Get Dreamer-Based emotions from XML ####
def get_Emotions(file="coded_dreams.xml"):
    """Extract one record per dream from the coded-dreams XML file.

    Every child of the XML root is treated as a collection; its ``sex``,
    ``age``, ``type``, ``name``, ``id`` and ``time`` fields are repeated on
    each of its ``dream`` entries.

    Args:
        file: path of the XML file to parse.

    Returns:
        A list with one 12-item list per dream:
        ``[gender, age, type, name, id, time, date, number, report, n_wrds,
        all_emotions_labels, dreamer_emotions_labels]``.
        ``date`` is "Missing" when the dream has no ``date`` element, ``n_wrds``
        is 0 when the dream has no (or an empty) report, and each emotion field
        is either the labels joined with "_" or "Missing" when there are none.
    """
    tree = ET.parse(file)
    root = tree.getroot()

    lst = []

    for collection in tqdm(root):

        gender = collection.findtext("sex")
        age    = collection.findtext("age")
        typ    = collection.findtext("type")
        name   = collection.findtext("name")
        idd    = collection.findtext("id")
        time   = collection.findtext("time")

        for dream in collection.findall("dream"):
            date   = dream.findtext("date")
            date   = date if date is not None else "Missing"
            number = dream.findtext("number")
            report = dream.findtext("report")

            # Only a missing/empty report counts as zero words. Any error raised
            # by the tokenizer itself is left to propagate instead of being
            # silently turned into a zero-word report.
            n_wrds = len(word_tokenize(report)) if report else 0

            all_emotions_labels     = []
            dreamer_emotions_labels = []

            # Character codings ("char") are no longer collected; only emotions are.
            codings  = dream.find("codings")
            emotions = codings.findall("emot") if codings is not None else []

            for emot in emotions: # collects emot(ions) of D(reamer)
                E   = emot[0].text  # first child: the emotion label
                Chr = emot[1].text  # second child: the character the emotion is coded for
                all_emotions_labels.append(E)
                if Chr == "D": # store if the character of the emotion is the D(reamer)
                    dreamer_emotions_labels.append(E)

            all_emotions_labels     = "_".join(all_emotions_labels) if all_emotions_labels else "Missing"
            dreamer_emotions_labels = "_".join(dreamer_emotions_labels) if dreamer_emotions_labels else "Missing"

            lst.append(
                    [
                    gender, age, typ, name, idd, time,
                    date, number, report, n_wrds,
                    all_emotions_labels, dreamer_emotions_labels
                    ]
            )

    return lst

In [7]:
# Parse the XML and tabulate one row per dream
dream_records_lst = get_Emotions()

record_columns = [
    "gender", "age", "type", "collection", "id",
    "time", "date", "number", "report", "# words",
    "All Emotions", "Dreamer Emotions",
]
dream_records = pd.DataFrame(dream_records_lst, columns=record_columns)

# Count the "_"-separated labels of each emotion column ("Missing" means no label)
for count_col, label_col in (
    ("# Dreamer Emotions", "Dreamer Emotions"),
    ("# General Emotions", "All Emotions"),
):
    dream_records[count_col] = [
        0 if labels == "Missing" else len(labels.split("_"))
        for labels in dream_records[label_col]
    ]

100%|█████████████████████████████████████████████| 7/7 [00:01<00:00,  4.84it/s]


In [8]:
# Save the table to .csv without the index column
# (zero-word reports are still included here; they are only filtered out in a later cell)
dream_records.to_csv("Reports_with_Dreamer_and_General_Emotions.csv", index=False)

Get a general idea of the final file!

In [12]:
# Drop the reports with a zero word count, then preview three random rows
has_words = dream_records["# words"] != 0
dream_records = dream_records[has_words]
dream_records.sample(3)

Unnamed: 0,gender,age,type,collection,id,time,date,number,report,# words,All Emotions,Dreamer Emotions,# Dreamer Emotions,# General Emotions
459,M,A,series,Ed: dreams of his late wife,ed,1980-2002,11/07/85,39,Mary and her sister Kathy are busy repairing a...,375,AP_HA_HA_HA_HA,AP_HA_HA,3,5
380,F,T,series,Bea 1: a high school student,bea1,2003-2005,08/17/2004 (age 15),180,I dreamed that the girls from my Spain program...,242,SD_AN,Missing,0,2
905,F,Y,set,Hall/VdC Norms: Female,norms-f,1940s-1950s,Missing,42,I dreamt that a friend of mine who graduated l...,146,HA,HA,1,1


## Dreams Encoding<a id='encoding'></a>

Collect the encodings (i.e., the vectors) for each report

In [13]:
# We need to identify the sequence length, so as not to lose items with padding
dream_records["# words"].describe()

count    1845.000000
mean      144.784282
std        78.240335
min         8.000000
25%        91.000000
50%       129.000000
75%       178.000000
max       610.000000
Name: # words, dtype: float64

In [14]:
# https://huggingface.co/bert-large-cased
# we use BERT as no transfer is needed
model_name = "bert-large-cased"
# max sequence length for each document/sentence sample
# NOTE(review): `ml` is not referenced in any of the cells visible here — confirm it is still needed
ml = 512

# Load the pretrained tokenizer and encoder that match `model_name`
tokenizer = BertTokenizer.from_pretrained(model_name)
model     = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# ditch the 9 missing items (reports with zero words) and renumber the rows from 0
dream_records = dream_records[dream_records["# words"] != 0].reset_index(drop=True)

In [16]:
# Tokenize every report with truncation enabled.
# NOTE(review): the stated goal was to find the sequences with more than 512 tokens and remove them,
# but truncation=True presumably shortens them instead, and `T_encoding` is overwritten by the
# embedding cell that follows — confirm whether this cell is still needed.
T_encoding = [tokenizer(sq, truncation=True) for sq in tqdm(dream_records["report"])]

100%|██████████████████████████████████████| 1845/1845 [00:02<00:00, 659.73it/s]


In [17]:
# Collect the embedding of each report according to the model:
# one vector per report, taken with get_encoding's defaults (idx=0, layer=-1), computed on the CPU
T_encoding = [get_encoding(sq, tokenizer, model, device="cpu", truncate=True) 
              for sq in tqdm(dream_records["report"])]

100%|███████████████████████████████████████| 1845/1845 [04:15<00:00,  7.21it/s]


In [18]:
# Stack the per-report vectors into a single array and display its shape
T_encoding = np.array(T_encoding)
T_encoding.shape

(1845, 1024)

Save the encodings (i.e., BERT's vectors) as a numpy array. This option is preferable to storing the encodings in the DataFrame, as Pandas will convert them into strings, making it difficult to use them when the file is loaded again in the future.

In [19]:
# Write the encodings array to disk in numpy's .npy format
np.save('BERT-Large-Cased_dream_records.npy', T_encoding)

## Get PCA / t-SNE<a id='dimred'></a>

In [20]:
# PCA reduction
# random_state is pinned because PCA's automatic solver selection can pick a randomized SVD;
# without it the projection would depend on how much of the global RNG earlier cells consumed.
pca     = PCA(n_components=2, random_state=seed)
TKN_PCA = pca.fit_transform(T_encoding)

In [21]:
# t-SNE reduction
# init='random' makes the embedding stochastic: pin random_state so that re-running
# the cell (or the whole notebook) reproduces the same coordinates.
TKN_TSNE = TSNE(
            n_components=2,
            init='random',
            random_state=seed,
).fit_transform(T_encoding)



In [22]:
# store in DF
# zip(*rows) regroups the 2-component rows of each reduction into one tuple per component (x, y).
# The rows line up with dream_records because T_encoding was built by iterating dream_records["report"].
dream_records["TSNE_x"], dream_records["TSNE_y"] = zip(*TKN_TSNE)
dream_records["PCA_x"], dream_records["PCA_y"]   = zip(*TKN_PCA)

In [23]:
# save the data collected so far (records plus t-SNE and PCA coordinates) as .csv, without the index column
dream_records.to_csv(
    "Reports_with_Dreamer_and_General_Emotions_PCAxy_tSNExy.csv",
    index=False,
)