# Personality Analysis using a Bimodel LSTM Network

In [None]:
!pip install torch skorch transformers hiplot

In [None]:
import warnings

import numpy as np
import skorch
import torch

from IPython.display import display
from skorch import NeuralNet
from torch import nn, optim, tensor
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

from aida.user_analysis.model import LstmModel
from aida.user_analysis.utils import progress_bar

# Silence all warnings so the notebook output stays readable.
warnings.simplefilter("ignore")

# Seed PyTorch's random number generator and release cached GPU memory.
torch.manual_seed(0)
torch.cuda.empty_cache()

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Prepare Dataset

In [None]:
# Location of the MyPersonality dataset (CSV file)
MY_PERSONALITY_PATH = "data/mypersonality.csv"

# Location of the essays dataset (CSV file)
ESSAYS_PATH = "data/essays.csv"

# Names of the trait columns to analyse
TRAITS = [
    "cEXT",
    "cNEU",
    "cAGR",
    "cCON",
    "cOPN",
]

# Maximum number of tokens per text when calculating BERT embeddings
MAX_LENGTH = 400

# Dataset to use: either "ESSAYS" or "MY_PERSONALITY"
DATASET = "MY_PERSONALITY"

In [None]:
from transformers import BertModel, BertTokenizer

class Dataset(torch.utils.data.Dataset):
    """Dataset of BERT token embeddings paired with per-trait labels.

    Every text is embedded once, on construction, with ``bert-base-uncased``
    and the resulting hidden states are kept on the CPU. Items and labels are
    served for a single trait at a time: call ``set_trait`` before indexing
    the dataset or calling ``get_labels``.
    """

    def __init__(self, texts, labels):
        """Encode the labels and compute the embeddings for ``texts``.

        Args:
            texts: Sized iterable of strings to embed.
            labels: Object exposing ``itertuples()`` (e.g. a DataFrame) whose
                rows carry one attribute per entry of ``TRAITS``.
        """
        self.labels = [self.encode_label(label) for label in labels.itertuples()]
        self.embeddings_lengths = []
        self.embeddings = []

        self.generate_embeddings(texts)

    def generate_embeddings(self, texts):
        """Fill ``self.embeddings`` and ``self.embeddings_lengths``.

        Each text is lower-cased, tokenized, truncated/padded to
        ``MAX_LENGTH`` tokens and passed through BERT. The stored length is
        the number of real (non-padding) tokens.
        """
        progress = display(progress_bar(0, 100), display_id=True)

        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        bert = BertModel.from_pretrained("bert-base-uncased").to(device)
        bert.eval()

        total = len(texts)

        with torch.no_grad():
            for i, text in enumerate(texts):
                # Explicit padding/truncation replaces the deprecated
                # ``pad_to_max_length`` flag and guarantees that every
                # sequence is exactly MAX_LENGTH tokens long.
                encoded_text = tokenizer(
                    text.lower(),
                    add_special_tokens=True,
                    max_length=MAX_LENGTH,
                    padding="max_length",
                    truncation=True,
                )
                input_ids = encoded_text["input_ids"]
                attention_mask = encoded_text["attention_mask"]

                token_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
                token_mask = torch.tensor(attention_mask).to(device).unsqueeze(0)

                # Index instead of tuple-unpacking: works both when the model
                # returns a plain tuple and when it returns a ModelOutput.
                hidden_states = bert(token_ids, attention_mask=token_mask)[0]
                # Drop only the batch dimension added by unsqueeze(0) above.
                hidden_states = hidden_states.squeeze(0).to("cpu")

                self.embeddings.append(hidden_states)
                # input_ids is always padded to MAX_LENGTH, so count the
                # attended tokens to get the real sequence length.
                self.embeddings_lengths.append(sum(attention_mask))

                # Report completed items so the bar can reach the end.
                progress.update(progress_bar(i + 1, total))

        del bert

    def encode_label(self, label):
        """Return the trait values of one label row, ordered like ``TRAITS``."""
        # Derive the order from TRAITS so it always agrees with set_trait().
        return [getattr(label, trait) for trait in TRAITS]

    def set_trait(self, trait):
        """Select which trait ``__getitem__`` and ``get_labels`` return.

        Raises:
            ValueError: If ``trait`` is not listed in ``TRAITS``.
        """
        self.trait_index = TRAITS.index(trait)

    def get_labels(self):
        """Return a tensor with the selected trait's label for every sample."""
        return torch.tensor([label[self.trait_index] for label in self.labels])

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, i):
        """Return ``(embedding, length, label)`` for sample ``i``."""
        return (
            self.embeddings[i],
            self.embeddings_lengths[i],
            self.labels[i][self.trait_index],
        )

In [None]:
import pandas

def load_data_from_file(path, text_field):
    """Load a personality dataset from a CSV file into a DataFrame.

    Args:
        path: Path of the CSV file; it is read with ``latin1`` encoding.
        text_field: Name of the column holding the text. It is renamed to
            ``TEXT`` in the returned frame.

    Returns:
        A pandas DataFrame whose ``TRAITS`` columns hold floats
        (``"y"`` -> 1.0, ``"n"`` -> 0.0) and whose text column is ``TEXT``.

    Raises:
        ValueError: If a trait column contains a value that cannot be
            converted to float after the y/n replacement.
    """
    df = pandas.read_csv(path, encoding="latin1")

    # Map the y/n flags to numbers and make the float dtype explicit, so the
    # labels can be turned into tensors later on.
    df[TRAITS] = (
        df[TRAITS]
        .replace(to_replace=["y", "n"], value=[1.0, 0.0])
        .astype(float)
    )

    return df.rename(columns={text_field: "TEXT"})


def collate_fn(batch):
    """Collate samples into one batch ordered by descending length.

    Args:
        batch: Sequence of ``(embedding, length, label)`` triples. All
            embeddings must have the same shape so they can be stacked.

    Returns:
        A tuple ``(inputs, labels)``. ``inputs`` is a dict with the stacked
        embeddings under ``"embeddings"`` and the sorted lengths under
        ``"embeddings_lengths"``; ``labels`` is a tensor reordered the same
        way.
    """
    sample_embeddings, sample_lengths, sample_labels = zip(*batch)

    # Sort the lengths and keep the permutation so that the embeddings and
    # labels can be reordered consistently.
    sorted_lengths, order = torch.sort(
        torch.tensor(sample_lengths), dim=0, descending=True
    )

    stacked_embeddings = torch.stack(sample_embeddings, dim=0)
    label_tensor = torch.tensor(sample_labels)

    inputs = {
        "embeddings": stacked_embeddings[order],
        "embeddings_lengths": sorted_lengths,
    }

    return inputs, label_tensor[order]

In [None]:
# Load the configured dataset; the text column differs between the two files.
if DATASET == "MY_PERSONALITY":
    df = load_data_from_file(MY_PERSONALITY_PATH, "STATUS")
elif DATASET == "ESSAYS":
    df = load_data_from_file(ESSAYS_PATH, "TEXT")
else:
    # Fail with a clear message instead of a NameError on `df` below.
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected 'MY_PERSONALITY' or 'ESSAYS'"
    )

# Computes the BERT embeddings for every text up front.
dataset = Dataset(df["TEXT"], df[TRAITS])

## Training

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def calculate_accuracy(net, dataset, y_true):
    """Return the accuracy of the model's rounded predictions.

    Args:
        net: Model exposing ``predict``.
        dataset: Data passed to ``net.predict``.
        y_true: Ground-truth labels to compare against.
    """
    raw_predictions = net.predict(dataset)
    rounded_predictions = np.rint(raw_predictions)
    return accuracy_score(y_true, rounded_predictions)


def calculate_precision(net, dataset, y_true):
    """Return the precision of the model's rounded predictions.

    Args:
        net: Model exposing ``predict``.
        dataset: Data passed to ``net.predict``.
        y_true: Ground-truth labels to compare against.
    """
    # Round the raw predictions to the nearest integer class (0 or 1).
    y_pred = np.rint(net.predict(dataset))
    return precision_score(y_true, y_pred)


def calculate_recall(net, dataset, y_true):
    """Return the recall of the model's rounded predictions.

    Args:
        net: Model exposing ``predict``.
        dataset: Data passed to ``net.predict``.
        y_true: Ground-truth labels to compare against.
    """
    # Round the raw predictions to the nearest integer class (0 or 1).
    y_pred = np.rint(net.predict(dataset))
    return recall_score(y_true, y_pred)


def calculate_f1_score(net, dataset, y_true):
    """Return the F1 score of the model's rounded predictions.

    Args:
        net: Model exposing ``predict``.
        dataset: Data passed to ``net.predict``.
        y_true: Ground-truth labels to compare against.
    """
    # Round the raw predictions to the nearest integer class (0 or 1).
    y_pred = np.rint(net.predict(dataset))
    return f1_score(y_true, y_pred)

In [None]:
# Optimisation settings
learning_rate = 0.00001
max_epochs = 1000
batch_size = 128

# Model and regularisation settings
hidden_dim = 192
dropout_input = 0.2
dropout_output = 0
weight_decay = 0
cross_validation_split = 10

# Print a summary of the run configuration in one go; the trailing "\n"
# entry leaves two blank lines after the summary.
parameter_summary = [
    "Parameters",
    f"Batch Size: {batch_size}",
    f"Learning Rate: {learning_rate}",
    f"Max Epochs: {max_epochs}",
    f"Input Dropout: {dropout_input}",
    f"Output Dropout: {dropout_output}",
    f"Weight Decay: {weight_decay}",
    f"Hidden Dim: {hidden_dim}",
    f"Cross Validation Split: {cross_validation_split}",
    "\n",
]
print("\n".join(parameter_summary))

# Newer skorch releases renamed CVSplit to ValidSplit; use whichever class
# the installed version provides so the notebook runs on both.
valid_split_cls = getattr(skorch.dataset, "ValidSplit", None) or skorch.dataset.CVSplit

# Train one independent binary classifier per personality trait.
for trait in TRAITS:
    print(f"Training {trait}:")

    net = NeuralNet(
        module=LstmModel,
        module__dropout_input=dropout_input,
        module__dropout_output=dropout_output,
        module__hidden_dim=hidden_dim,
        criterion=nn.BCELoss,
        optimizer=optim.Adam,
        optimizer__weight_decay=weight_decay,
        optimizer__lr=learning_rate,
        iterator_train__collate_fn=collate_fn,
        iterator_valid__collate_fn=collate_fn,
        iterator_train__shuffle=True,
        iterator_valid__shuffle=True,
        max_epochs=max_epochs,
        batch_size=batch_size,
        # Stratified hold-out split, seeded so every trait uses a
        # reproducible partition.
        train_split=valid_split_cls(
            cross_validation_split, stratified=True, random_state=0
        ),
        # Callbacks are created per trait so that no scoring state is shared
        # between the separately trained networks.
        callbacks=[
            skorch.callbacks.EpochScoring(
                calculate_accuracy, name="accuracy", lower_is_better=False
            ),
            skorch.callbacks.EpochScoring(
                calculate_precision, name="precision", lower_is_better=False
            ),
            skorch.callbacks.EpochScoring(
                calculate_recall, name="recall", lower_is_better=False
            ),
            skorch.callbacks.EpochScoring(
                calculate_f1_score, name="f1 score", lower_is_better=False
            ),
            skorch.callbacks.ProgressBar(),
        ],
        device=device,
    )

    # Get labels for a specific trait; they are passed to fit() as `y` so
    # the stratified split can use them.
    dataset.set_trait(trait)
    y = dataset.get_labels()

    net.fit(dataset, y=y)