# Personality Analysis using a Bimodel LSTM Network

In [1]:
!pip3 install torch skorch transformers pandas

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import skorch
import torch

from IPython.display import display
from skorch import NeuralNet
from torch import nn, optim, tensor
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from model.LstmModel import LstmModel
from utils import progress_bar

torch.manual_seed(0)
device = "cuda:0" if torch.cuda.is_available() else 'cpu'



## Prepare Dataset

In [3]:
# Path to the MyPersonality dataset
MY_PERSONALITY_PATH = "data/mypersonality.csv"

# Path to the Essays dataset
ESSAYS_PATH = "data/essays.csv"

# List of traits to analyse
TRAITS = ["cAGR", "cCON", "cEXT", "cOPN", "cNEU"]

# Max length of tokens when calculating embeddings
MAX_LENGTH = 300

# Specify what dataset to use, can be either ESSAYS or MY_PERSONALITY
DATASET_TO_USE = "ESSAYS"

# Specify was model from the Transformers library to use to calculate embeddings
EMBEDDINGS_MODEL = "bert-base-uncased"

In [19]:
from transformers import AutoModel, AutoTokenizer

from utils import progress_bar

class Dataset(torch.utils.data.Dataset):
    """Used to process the selected dataset for training and validation"""
    
    def __init__(self, texts, labels):
        self.embeddings = []
        self.embeddings_lengths = []
        self.generate_embeddings(texts)
        
        self.embeddings_lengths = torch.tensor(self.embeddings_lengths)
        self.labels = [
            torch.tensor([label.cAGR, label.cCON, label.cEXT, label.cOPN, label.cNEU])
            for label in labels.itertuples()
        ]

    def generate_embeddings(self, texts):
        """Generate word embeddings for all the texts"""
        
        with torch.no_grad():
            progress = display(progress_bar(0, 100), display_id=True)

            tokenizer = AutoTokenizer.from_pretrained(EMBEDDINGS_MODEL)

            embeddings_model = AutoModel.from_pretrained(EMBEDDINGS_MODEL).to(device)
            embeddings_model.eval()
        
            for i, text in enumerate(texts):
                encoded_text = tokenizer.encode_plus(
                    text.lower(),
                    add_special_tokens=True,
                    max_length=MAX_LENGTH,
                    pad_to_max_length=True,
                )
                input_ids = encoded_text["input_ids"]
                attention_mask = encoded_text["attention_mask"]

                input = torch.tensor(input_ids).to(device).unsqueeze(0)
                input_mask = torch.tensor(attention_mask).to(device).unsqueeze(0)
                
                output = embeddings_model(input, attention_mask=input_mask)[0]
                output = output.squeeze().to("cpu")

                self.embeddings.append(output)
                self.embeddings_lengths.append(len(input_ids))

                progress.update(progress_bar(i, len(texts)))
            

    def set_trait(self, trait):
        """Set the trait to use when fetching labels"""
        i = TRAITS.index(trait)
        self.trait_index = i

    def get_labels(self):
        """Get labels from the dataset given a selected trait"""
        return torch.tensor([label[self.trait_index] for label in self.labels])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return (
            self.embeddings[i],
            self.embeddings_lengths[i].item(),
            self.labels[i][self.trait_index],
        )

In [20]:
import pandas

def load_dataset():
    """Loads a dataset and returns a PyTorch Dataset with embeddings"""

    if DATASET_TO_USE == "MY_PERSONALITY":
        path = MY_PERSONALITY_PATH
        text_field = "STATUS"
    elif DATASET_TO_USE == "ESSAYS":
        path = ESSAYS_PATH
        text_field = "TEXT"

    df = pandas.read_csv(path, encoding="latin1")
    df[TRAITS] = df[TRAITS].replace(to_replace=["y","n"], value=[1.0,0.0])
    df[TRAITS] = df[TRAITS]
    df = df.rename(columns={text_field: "TEXT"})

    return Dataset(df["TEXT"], df[TRAITS])


def collate_fn(batch):
    """Pads the dataset so tokens are the same length"""

    (embeddings, embeddings_lengths, labels) = zip(*batch)

    embeddings_lengths, perm_indexes = torch.tensor(embeddings_lengths).sort(0, descending=True)
    
    embeddings = torch.stack(embeddings, dim=0)[perm_indexes]
    labels = torch.tensor(labels)[perm_indexes]

    input = {
        "embeddings": embeddings,
        "embeddings_lengths": embeddings_lengths
    }

    return input, labels

## Training

In [21]:
import csv

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from skorch.callbacks import Callback

def calculate_accuracy(net, dataset, y_true):
    """Calculates the accuracy of the model"""
    y_pred = np.rint(net.predict(dataset))
    return accuracy_score(y_true, y_pred)


def calculate_precision(net, dataset, y_true):
    """Calculates the precision of the model"""
    y_pred = net.predict(dataset)
    y_pred = np.rint(y_pred)
    return f1_score(y_true, y_pred)


def calculate_recall(net, dataset, y_true):
    """Calculates the recall of the model"""
    y_pred = net.predict(dataset)
    y_pred = np.rint(y_pred)
    return precision_score(y_true, y_pred)


def calculate_f1_score(net, dataset, y_true):
    """Calculates the F1 score of the model"""
    y_pred = net.predict(dataset)
    y_pred = np.rint(y_pred)
    return recall_score(y_true, y_pred)

class HiPlotLog(Callback):
    def on_epoch_end(self, net, **kwargs):
        current = net.history[-1]
        filename = DATASET_TO_USE.lower()

        with open(f"output/{filename}.csv", "a") as file:
            writer = csv.writer(file)
            writer.writerow([
                trait,
                current["epoch"],
                parameters["learning_rate"],
                parameters["dropout_input"],
                parameters["dropout_output"],
                parameters["weight_decay"],
                parameters["batch_size"],
                parameters["hidden_dim"],
                EMBEDDINGS_MODEL,
                "BCE",
                "sigmoid",
                current["train_loss"],
                current["valid_loss"],
                current["accuracy"],
                current["precision"],
                current["recall"],
                current["f1_score"]
            ])

class SaveBestModel(Callback):
    def on_epoch_end(self, net, **kwargs):
        current = net.history[-1]

        if current["accuracy_best"]:
            net.save_params(f_params=f"trained_models/{trait}.pt")

In [22]:
from skorch.callbacks import EpochScoring, ProgressBar

def train(parameters, trait, dataset):
    """Train the model and print the output"""
    
    print("Parameters")
    print(f"Batch Size: {parameters['batch_size']}")
    print(f"Learning Rate: {parameters['learning_rate']}")
    print(f"Max Epochs: {parameters['max_epochs']}")
    print(f"Input Dropout: {parameters['dropout_input']}")
    print(f"Output Dropout: {parameters['dropout_output']}")
    print(f"Weight Decay: {parameters['weight_decay']}")
    print(f"Hidden Dim: {parameters['hidden_dim']}")
    print(f"Cross Validation Split: {parameters['cross_validation_split']}")
    print(f"Embeddings Model: {EMBEDDINGS_MODEL}")
    print("\n")

    print(f"Training {trait}:")

    net = NeuralNet(
        module=LstmModel,
        module__dropout_input=parameters["dropout_input"],
        module__dropout_output=parameters["dropout_output"],
        module__hidden_dim=parameters["hidden_dim"],
        criterion=nn.BCELoss,
        optimizer=optim.Adam,
        optimizer__weight_decay=parameters["weight_decay"],
        optimizer__lr=parameters["learning_rate"],
        iterator_train__collate_fn=collate_fn,
        iterator_valid__collate_fn=collate_fn,
        iterator_train__shuffle=True,
        iterator_valid__shuffle=True,
        max_epochs=parameters["max_epochs"],
        batch_size=parameters["batch_size"],
        train_split=skorch.dataset.CVSplit(parameters["cross_validation_split"], stratified=True, random_state=0),
        callbacks=[
            EpochScoring(calculate_accuracy, name="accuracy", lower_is_better=False),
            EpochScoring(calculate_precision, name="precision", lower_is_better=False),
            EpochScoring(calculate_recall, name="recall", lower_is_better=False),
            EpochScoring(calculate_f1_score, name="f1_score", lower_is_better=False),
            EpochScoring(calculate_f1_score, name="f1_score", lower_is_better=False),
            HiPlotLog(),
            SaveBestModel(),
            ProgressBar(),
        ],
        device=device,
    )

    # Get labels for a specific trait
    dataset.set_trait(trait)
    y = dataset.get_labels()

    net.fit(dataset, y=y)

In [30]:
parameters = {
    "learning_rate": 0.0001,
    "max_epochs": 100,
    "batch_size": 128,
    "hidden_dim": 192,
    "dropout_input": 0.2,
    "dropout_output": 0,
    "weight_decay": 0.001,
    "cross_validation_split": 10
}

# Agreeableness
# trait = "cAGR"
# EMBEDDINGS_MODEL = "distilbert-base-uncased"
# dataset = load_dataset()
# train({**parameters, "max_epochs": 46}, trait, dataset)

# Conscientiousness
# trait = "cCON"
# EMBEDDINGS_MODEL = "roberta-base"
# dataset = load_dataset()
# train({**parameters, "max_epochs": 1}, trait, dataset)

# Extroversion
trait = "cEXT"
EMBEDDINGS_MODEL = "distilroberta-base"
train({**parameters, "max_epochs": 1}, trait, dataset)

# Openness
trait = "cOPN"
EMBEDDINGS_MODEL = "roberta-base"
train({**parameters, "max_epochs": 3}, trait, dataset)

# Neuroticism
trait = "cNEU"
EMBEDDINGS_MODEL = "bert-base-uncased"
train({**parameters, "max_epochs": 91}, trait, dataset)

Parameters
Batch Size: 128
Learning Rate: 0.0001
Max Epochs: 1
Input Dropout: 0.2
Output Dropout: 0
Weight Decay: 0.001
Hidden Dim: 192
Cross Validation Split: 10
Embeddings Model: distilroberta-base


Training cEXT:


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

  attention_score = torch.nn.functional.softmax(attention_score).view(


  epoch    accuracy    f1_score    precision    recall    train_loss    valid_loss     dur
-------  ----------  ----------  -----------  --------  ------------  ------------  ------
      1      [36m0.5101[0m      [32m0.9921[0m       [35m0.6756[0m    [31m0.5122[0m        [94m0.6952[0m        [36m0.6929[0m  4.9193
Parameters
Batch Size: 128
Learning Rate: 0.0001
Max Epochs: 3
Input Dropout: 0.2
Output Dropout: 0
Weight Decay: 0.001
Hidden Dim: 192
Cross Validation Split: 10
Embeddings Model: roberta-base


Training cOPN:


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

  attention_score = torch.nn.functional.softmax(attention_score).view(


Parameters
Batch Size: 128
Learning Rate: 0.0001
Max Epochs: 91
Input Dropout: 0.2
Output Dropout: 0
Weight Decay: 0.001
Hidden Dim: 192
Cross Validation Split: 10
Embeddings Model: bert-base-uncased


Training cNEU:


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

  attention_score = torch.nn.functional.softmax(attention_score).view(


## Analysis

In [None]:
import hiplot as hip

filename = DATASET_TO_USE.lower()
with open(f"output/{filename}.csv") as file:
    experiment = hip.Experiment.from_csv(file)
    experiment.parameters_definition["accuracy"].force_range(0.45, 0.75)
    experiment.display()