In [1]:
! nvidia-smi # shell command to check the status of the GPUs

Fri Sep 30 15:16:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:03:00.0 Off |                  N/A |
| 30%   40C    P0   107W / 350W |      0MiB / 24576MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:04:00.0 Off |                  N/A |
| 30%   37C    P0   112W / 350W |      0MiB / 24576MiB |      0%      Default |
|       

In [1]:
import os, random, xmltodict, json
import xml.etree.ElementTree as ET

# Index of the GPU this notebook should use
cuda_device = 6
# Restrict the GPUs visible to CUDA to that one (environment values must be strings)
os.environ["CUDA_VISIBLE_DEVICES"] = f"{cuda_device}"

In [2]:
# "Basic" py library
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split 
from nltk.tokenize import word_tokenize

# Stats 
from mlxtend.evaluate import permutation_test

# visualisation
from matplotlib import pyplot as plt # basic visualisation in py
import seaborn as sns # great to interact with dataframes
import plotly.express as px # powerfull for interactive figures
from tqdm import tqdm  # generats progress bar to controll steps

# ML py
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import torch # Pytorch, Meta's library for ML
import torch.nn as nn # Pt module for neural networks 

import transformers # HuggingFace library to use pretrained models
from transformers import BertTokenizer, BertModel

In [3]:
def set_seed(seed: int):
    """Helper function for reproducible behavior: seeds ``random``, ``numpy``,
        ``torch`` and/or ``tf`` (if installed).

    Only the two availability helpers that are actually used are imported.
    They are looked up in ``transformers.utils`` first, then in the legacy
    ``transformers.file_utils``; if neither import works, availability is
    probed directly with ``importlib`` so that seeding never fails because of
    the helper import itself.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    try:
        from transformers.utils import is_tf_available, is_torch_available
    except ImportError:
        try:
            from transformers.file_utils import is_tf_available, is_torch_available
        except ImportError:
            from importlib.util import find_spec

            def is_torch_available():
                return find_spec("torch") is not None

            def is_tf_available():
                return find_spec("tensorflow") is not None

    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)
        
def get_encoding(sq, tokenizer, model, idx=0, layer=-1, device="cuda", truncate=False):
    """Extract the hidden-state vector of one token of a sequence.

    The sequence is tokenized, the tokenized inputs are moved to ``device``
    and passed through ``model`` with ``output_hidden_states=True``. The
    forward pass runs under ``torch.no_grad()`` because only the values of the
    hidden states are needed, not their gradients.

    Args:
        sq: the sequence handed to ``tokenizer``.
        tokenizer: callable returning the model inputs (must support ``.to``).
        model: the model to query. Only the inputs are moved by this
            function, so the model has to be on ``device`` already.
        idx: position of the token whose vector is returned
            (0 --> CLS, 1 --> token 1, 2 --> token 2, ...).
        layer: index into ``outputs.hidden_states`` (default -1: last entry).
        device: device the tokenized inputs are moved to.
        truncate: forwarded to the tokenizer as ``truncation``.

    Returns:
        The selected vector of the first (only) sequence in the batch, as a
        NumPy array on the CPU.
    """
    inputs = tokenizer(sq, truncation=truncate, return_tensors='pt')
    inputs = inputs.to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[layer]
    token_vector = hidden_states[0, idx, :]  # CLS = [0,0,:]
    return token_vector.to("cpu").detach().numpy()

def get_norm_reports():
    """Load the reports stored in the ``norms-m`` and ``norms-f`` directories.

    Both directories are looked up in the current working directory and every
    entry they contain is read as a text file. Each file is opened in a
    ``with`` block so its handle is closed as soon as it has been read.

    Returns:
        pandas.DataFrame: one row per file, with the file content in
        ``Report`` and the directory's gender code ("m" or "f") in ``Gender``.
    """
    fl_lst = []
    for gender in ["m", "f"]:
        norm_dir = os.path.join(os.getcwd(), "norms-{}".format(gender))
        for fl in os.listdir(norm_dir):
            with open(os.path.join(norm_dir, fl), "r") as report_file:
                fl_lst.append([report_file.read(), gender])

    return pd.DataFrame(fl_lst, columns=["Report", "Gender"])

def underscore_label(ch_sq):
    """Insert an underscore between the first element of ``ch_sq`` and the rest.

    Example: ``"1MSA"`` becomes ``"1_MSA"``.
    """
    chars = list(ch_sq)
    return chars[0] + "_" + "".join(chars[1:])

In [4]:
# Seaborn appearance: "talk" plotting context on a white grid
sns.set(context="talk")
sns.set_style(style="whitegrid")

# Fix the random seed for reproducibility
seed = 31
set_seed(seed=seed)

# Table of Contents

Click on a title to jump to the corresponding section.

---------
[Loading and Scraping Data](#intro)

[Dreams Encoding](#encoding)

## Loading and scraping data<a id='intro'></a>

In [6]:
#### Deprecated/Used code to convert XML to csv ####
# Flattens coded_dreams.xml into one row per dream, collected in `lst`.
tree = ET.parse('coded_dreams.xml')
root = tree.getroot()

lst = []

for collection in tqdm(root):

    # Collection-level metadata, repeated on every dream of the collection
    gender = collection.findtext("sex")
    age    = collection.findtext("age")
    typ    = collection.findtext("type")
    name   = collection.findtext("name")
    idd    = collection.findtext("id")
    time   = collection.findtext("time")

    for dream in collection.findall("dream"):
        date   = dream.findtext("date")
        date   = date if date is not None else "Missing"
        number = dream.findtext("number")
        report = dream.findtext("report")

        # A dream without report text counts as 0 words. The check is explicit
        # (instead of catching every exception) so that genuine tokenizer
        # failures are raised rather than silently recorded as 0 words.
        n_wrds = len(word_tokenize(report)) if report else 0

        lcl_labels = []

#         for ch in dream.find("codings").findall("char"): # collects Characters deprecated
#             lcl_labels.append(underscore_label(ch.text))

        # A dream without a <codings> element simply has no emotions to collect
        codings = dream.find("codings")
        emotions = codings.findall("emot") if codings is not None else []

        for emot in emotions: # collects emot(ions) of D(reamer)
            E   = emot[0].text  # first child of <emot>: the emotion
            Chr = emot[1].text  # second child of <emot>: the character
            if Chr == "D": # store if the character of the emotion is the D(reamer)
                lcl_labels.append(E)

        # Emotions are stored as one "_"-joined string, or "Missing" if none
        lcl_labels = "_".join(lcl_labels) if lcl_labels else "Missing"

        lst.append(
                [
                gender, age, typ, name, idd, time,
                date, number, report, n_wrds, lcl_labels
                ]
        )

100%|█████████████████████████████████████████████| 7/7 [00:01<00:00,  4.82it/s]


In [7]:
# Column names, in the same order as the values of each row in `lst`
record_columns = [
    "gender", "age", "type", "collection", "id",
    "time", "date", "number", "report", "# words", "Emotions",
]
dream_records = pd.DataFrame(lst, columns=record_columns)

# Number of emotion labels per dream ("Missing" means no label was collected)
emotion_counts = []
for e_lst in dream_records["Emotions"]:
    if e_lst == "Missing":
        emotion_counts.append(0)
    else:
        emotion_counts.append(len(e_lst.split("_")))
dream_records["# Emotions"] = emotion_counts

# Save to .csv
# dream_records.to_csv("Reports_with_Dreamer_Emotions.csv", index=False)

In [8]:
# Keep only the reports that contain at least one word, then peek at 3 random rows
has_words = dream_records["# words"].ne(0)
dream_records = dream_records[has_words]
dream_records.sample(3)

Unnamed: 0,gender,age,type,collection,id,time,date,number,report,# words,Emotions,# Emotions
1511,M,Y,set,Hall/VdC Norms: Male,norms-m,1940s-1950s,Missing,157,I walked out of the washroom at our country cl...,92,AP_SD,2
933,F,Y,set,Hall/VdC Norms: Female,norms-f,1940s-1950s,Missing,70,I had to take an exam in the psych lab. When I...,161,Missing,0
1830,M,Y,set,Hall/VdC Norms: Male,norms-m,1940s-1950s,Missing,476,In this dream I received a letter from my girl...,136,AN_SD,2


In [8]:
# Number of rows currently held in dream_records
dream_records.shape[0]

1855

## Dreams Encoding<a id='encoding'></a>

Collect the encodings (i.e., the vectors) for each report.

In [9]:
# Inspect the distribution of report lengths (in words) to get a sense of the
# sequence lengths the model will have to handle
dream_records.loc[:, "# words"].describe()

count    1845.000000
mean      144.784282
std        78.240335
min         8.000000
25%        91.000000
50%       129.000000
75%       178.000000
max       610.000000
Name: # words, dtype: float64

In [10]:
# https://huggingface.co/bert-large-cased
# NOTE(review): this comment used to link to bert-base-multilingual-cased,
# but the checkpoint loaded below is bert-large-cased.
# we use BERT as no transfer is needed
# (presumably "no cross-lingual transfer" — TODO confirm)
model_name = "bert-large-cased"
# max sequence length for each document/sentence sample
# NOTE(review): `ml` is not referenced by any other cell visible in this notebook
ml = 512

# Load the tokenizer and the encoder for the same checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)
model     = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Drop the reports with zero words and renumber the rows from 0
non_empty = dream_records["# words"] != 0
dream_records = dream_records.loc[non_empty].reset_index(drop=True)

In [12]:
# Tokenize every report with truncation enabled.
# NOTE(review): the earlier comment said that sequences with more than 512
# tokens are removed, but nothing is removed here; `truncation=True` asks the
# tokenizer to truncate instead. The resulting list is overwritten by the
# embeddings cell that follows, which reuses the name `T_encoding`.
T_encoding = [tokenizer(sq, truncation=True) for sq in tqdm(dream_records["report"])]

100%|██████████████████████████████████████| 1845/1845 [00:02<00:00, 681.08it/s]


In [13]:
# Collect one embedding per report from the model (run on the CPU, with
# truncation enabled; token position and layer are get_encoding's defaults)
T_encoding = []
for sq in tqdm(dream_records["report"]):
    report_vector = get_encoding(sq, tokenizer, model, device="cpu", truncate=True)
    T_encoding.append(report_vector)

100%|███████████████████████████████████████| 1845/1845 [05:28<00:00,  5.61it/s]


In [20]:
# Stack the per-report vectors into one 2-D array and display its shape
T_encoding = np.array(T_encoding)
T_encoding.shape

(1845, 1024)

In [21]:
# with open('BERT-Large-Cased_dream_records.npy', 'wb') as f:
#     np.save(f, T_encoding)

In [22]:
# PCA reduction to 2 components.
# random_state is pinned to the notebook seed so that a randomized SVD solver,
# if scikit-learn selects one, gives the same projection on every run instead
# of depending on the global RNG state (i.e. on the cell execution order).
pca     = PCA(n_components=2, random_state=seed)
TKN_PCA = pca.fit_transform(T_encoding)

In [23]:
# t-SNE reduction to 2 components.
# The embedding starts from a random initialisation, so random_state is pinned
# to the notebook seed to make the layout reproducible regardless of the
# global RNG state (i.e. of the cell execution order).
TKN_TSNE = TSNE(
            n_components=2,
            init='random',
            random_state=seed,
).fit_transform(T_encoding)



In [24]:
# Store the 2-D coordinates of both reductions as columns of the dataframe
tsne_x, tsne_y = zip(*TKN_TSNE)
dream_records["TSNE_x"] = tsne_x
dream_records["TSNE_y"] = tsne_y

pca_x, pca_y = zip(*TKN_PCA)
dream_records["PCA_x"] = pca_x
dream_records["PCA_y"] = pca_y

In [25]:
# Write the data collected so far (reports plus coordinates) to a .csv file
output_csv = "Reports_DreamerEmotions_PCAxy_tSNExy.csv"
dream_records.to_csv(output_csv, index=False)

# Deprecated Code 
Mostly sets up the new dataframe with the old coordinates and clusters, for consistency.

In [20]:
# old_reports = pd.read_csv("Dreams_with_embeddings.csv")
# old_reports.columns

# for clmn_nm in ['Kmean Cluster C', 'Kmean Cluster G', 'SA label', 'SA score', 'Emotions']:
#     if clmn_nm == 'SA label':
#         lcl_nm = "2Way_SA_lable"
#     elif clmn_nm == 'SA score':
#         lcl_nm = "2Way_SA_score"
#     elif clmn_nm == 'Emotions':
#         lcl_nm = "5Way_SA_dict"
#     else:
#         lcl_nm = clmn_nm
#     dream_records[lcl_nm] = old_reports[clmn_nm]

In [19]:
# dream_records.to_csv(
#     "Reports_DreamerEmotions_PCACho_tsneCho_KMCluster2_KMCluster6_2WSA_5WSA.csv",
#     index=False,
# )

In [5]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import pandas as pd 
import torch
from tqdm import tqdm
import numpy as np

In [2]:
# Reload the reports from disk.
# NOTE(review): no cell visible in this notebook writes this exact file name
# (the commented-out save above ends in "_5WSA.csv") — confirm the path.
reports_csv = "Reports_DreamerEmotions_PCACho_tsneCho_KMCluster2_KMCluster6_2WSA_6WSA.csv"
dream_records = pd.read_csv(reports_csv)

In [3]:
# Checkpoint shared by the tokenizer and the masked-language model
model_name = "bert-large-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModelForMaskedLM.from_pretrained(model_name)
model     = model.to("cuda")  # run the model on the GPU

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Pseudo-perplexity of every report under the masked language model.
# For a report of n token ids, each position except the first and the last is
# masked in turn (one masked copy per position); the score is the exponential
# of the mean loss over all masked positions.
ppl_scores = []
ppl_batch_size = 16  # masked copies per forward pass, to bound GPU memory
ppl_device = next(model.parameters()).device  # feed inputs to wherever the model lives

for report in tqdm(dream_records["report"].to_list()):

    # truncation=True keeps over-long reports within the tokenizer's maximum length
    tensor_input = tokenizer.encode(report, return_tensors='pt', truncation=True)
    n_tokens = tensor_input.size(-1)
    n_masked = n_tokens - 2
    if n_masked < 1:
        # Nothing to mask: keep ppl_scores aligned with the rows of dream_records
        ppl_scores.append(float("nan"))
        continue

    repeat_input = tensor_input.repeat(n_masked, 1)
    # Row i has a single 1 at column i + 1, so row i masks position i + 1
    mask         = torch.ones(n_tokens - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    # Positions that are not masked get label -100
    labels       = repeat_input.masked_fill(masked_input != tokenizer.mask_token_id, -100)

    loss_sum, n_scored_total = 0.0, 0
    with torch.no_grad():
        for start in range(0, n_masked, ppl_batch_size):
            stop = start + ppl_batch_size
            batch_input  = masked_input[start:stop].to(ppl_device)
            batch_labels = labels[start:stop].to(ppl_device)
            n_scored     = int((batch_labels != -100).sum())
            batch_loss   = model(batch_input, labels=batch_labels).loss
            # Weight each batch by its number of scored positions so the
            # result equals the mean over all masked copies of the report
            loss_sum       += batch_loss.item() * n_scored
            n_scored_total += n_scored

    # The score is a plain float: there is no tensor left to move off the GPU
    ppl_scores.append(np.exp(loss_sum / n_scored_total))

  0%|                                                  | 0/1845 [00:00<?, ?it/s]