# Personality Analysis using a Bimodel LSTM Network

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ee/fc/bd726a15ab2c66dc09306689d04da07a3770dad724f0883f0a4bfb745087/transformers-2.4.1-py3-none-any.whl (475kB)
[K     |▊                               | 10kB 28.2MB/s eta 0:00:01[K     |█▍                              | 20kB 1.7MB/s eta 0:00:01[K     |██                              | 30kB 2.5MB/s eta 0:00:01[K     |██▊                             | 40kB 1.7MB/s eta 0:00:01[K     |███▍                            | 51kB 1.9MB/s eta 0:00:01[K     |████▏                           | 61kB 2.3MB/s eta 0:00:01[K     |████▉                           | 71kB 2.5MB/s eta 0:00:01[K     |█████▌                          | 81kB 2.7MB/s eta 0:00:01[K     |██████▏                         | 92kB 3.0MB/s eta 0:00:01[K     |██████▉                         | 102kB 2.7MB/s eta 0:00:01[K     |███████▋                        | 112kB 2.7MB/s eta 0:00:01[K     |████████▎                       | 122kB 2.7M

In [11]:
import warnings

import pandas
import torch
from google.colab import drive, files
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from torch import nn, optim, tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils import data
from transformers import BertModel, BertTokenizer

warnings.simplefilter("ignore")

drive.mount("/content/drive")
DRIVE_PATH = "/content/drive/My Drive/aida"

torch.manual_seed(0)
device = "cuda:0" if torch.cuda.is_available() else 'cpu'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Prepare dataset

In [3]:
MY_PERSONALITY_PATH = DRIVE_PATH + "/mypersonality.csv"
TRAITS = ["sEXT", "sNEU", "sAGR", "sCON", "sOPN"]

# Portion of the dataset to use for testing
TEST_SPLIT_SIZE = 0.2

# Portion of the training set to use for validation
VALIDATION_SPLIT_SIZE = 0.1

def load_my_personality_dataset(path=MY_PERSONALITY_PATH):
    """Loads the texts and traits from the MyPersonality dataset"""

    data = pandas.read_csv(path, encoding="latin1")
    return data[["STATUS", *TRAITS]]

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
class MyPersonalityDataset(data.Dataset):
    """Stores the MyPersonality dataset"""

    MIN_LABEL_VALUE = 1
    MAX_LABEL_VALUE = 5

    def __init__(self, texts, labels):
        self.texts = [torch.tensor(self.tokenize(text)) for text in texts]
        self.labels = [torch.tensor(self.normalize(label)) for label in labels]

    def tokenize(self, text):
        text = text.lower()
        return tokenizer.encode(text, add_special_tokens=True)

    def normalize(self, label):
        return (label - self.MIN_LABEL_VALUE) / (self.MAX_LABEL_VALUE - self.MIN_LABEL_VALUE)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        return self.texts[index], self.labels[index]

In [0]:
dataset = load_my_personality_dataset(MY_PERSONALITY_PATH)

def pad_collate(batch):
    """Pads the dataset so tokens are the same length"""

    (tokens, labels) = zip(*batch)
    
    padded_tokens = pad_sequence(tokens, batch_first=True, padding_value=0)
    token_lengths = [len(token) for token in tokens]
    labels = torch.tensor(labels)

    return padded_tokens, token_lengths, labels

def generate_dataset(trait, params):
    """Generates the training and validation sets for a given trait in the MyPersonality dataset"""

    text, labels = dataset["STATUS"], dataset[trait]

    # Split dataset into training and test sets
    train_text, test_text, train_labels, test_labels = train_test_split(
        text, labels, test_size=TEST_SPLIT_SIZE, random_state=0
    )

    # Split training into training and validation sets
    train_text, val_text, train_labels, val_labels = train_test_split(
        text, labels, test_size=VALIDATION_SPLIT_SIZE, random_state=0
    )

    training_set = MyPersonalityDataset(train_text[:500], train_labels[:500])
    training_generator = data.DataLoader(training_set, collate_fn=pad_collate, **params)

    validation_set = MyPersonalityDataset(val_text, val_labels)
    validation_generator = data.DataLoader(validation_set, collate_fn=pad_collate, **params)

    test_set = MyPersonalityDataset(test_text, test_labels)
    test_generator = data.DataLoader(test_set, collate_fn=pad_collate, **params)
    
    return training_generator, validation_generator, test_generator

## LSTM Model

In [0]:
class Attention(nn.Module):
    """An attention layer used by the LSTM"""
    def __init__(self, attention_size):
        super(Attention, self).__init__()
        self.attention = self.generate_attention_vector(attention_size, 1)

        # Use GPU if available
        self.to(device)

    def generate_attention_vector(self, *size):
        out = torch.FloatTensor(*size).to(device)
        torch.nn.init.xavier_normal_(out)
        return out
        
    def forward(self, x_in):
        attention_score = torch.matmul(x_in, self.attention).squeeze()
        attention_score = nn.functional.softmax(attention_score).view(x_in.size(0), x_in.size(1), 1)
        scored_x = x_in * attention_score

        condensed_x = torch.sum(scored_x, dim=1)

        return condensed_x

In [0]:
class LstmModel(nn.Module):
    """LSTM model to predict personality"""

    def __init__(self, emedding_dim=768, hidden_dim=1536, output_dim=1):
        super(LstmModel, self).__init__()
        
        # Model structure
        self.word_embeddings = BertModel.from_pretrained('bert-base-uncased').to(device)
        self.lstm_1 = nn.LSTM(emedding_dim, hidden_dim, batch_first=True)
        self.attention = Attention(hidden_dim)
        self.output = nn.Sequential(nn.Linear(hidden_dim, output_dim), nn.Sigmoid())
        
        # Use GPU if available
        self.to(device)
    
    def forward(self, tokens):
        emeddings = self.word_embeddings(tokens)[0]
        lstm_output = self.lstm_1(emeddings)[0]
        attention_output = self.attention(lstm_output)
        result = self.output(attention_output)

        return result

## Training

In [0]:
def train(model, loss_function, optimizer, data_generator):
    """Trains the model given a training set"""

    for epoch in range(max_epochs):
        losses = []

        for padded_tokens, token_lengths, labels in data_generator:
            model.zero_grad()

            padded_tokens = padded_tokens.to(device)
            labels = labels.to(device)

            model_trait_scores = model(padded_tokens)
            
            loss = loss_function(model_trait_scores, labels)
            losses.append(float(loss))

            loss.backward()
            optimizer.step()
        
        average_loss = np.mean(losses)

        print(f"Epoch: {epoch} | Avg Loss: {average_loss}")

In [0]:
"""Traings the models for all traits"""

# Hyperparameters
learning_rate = 0.0001
max_epochs = 50
dataloader_params = {
    "batch_size": 16,
    "shuffle": False,
    "num_workers": 6
}

# Saves models if set to true
save_models = True

print("Begin Training...")

for trait in TRAITS:
    model = LstmModel()
    loss_function = nn.BCELoss()
    optimizer = optim.Adam((p for p in model.parameters() if p.requires_grad), lr=learning_rate)

    training_generator, validation_generator, test_generator = generate_dataset(trait, dataloader_params)

    print(f"\nTraining {trait}:")

    train(model, loss_function, optimizer, training_generator)
    evaluate(model, test_generator)

    if save_models:
        torch.save(model.state_dict(), f"{DRIVE_PATH}/{trait}_model_state.pth")

Begin Training...

Training sEXT:
Epoch: 0 | Avg Loss: 0.6757059395313263
Epoch: 1 | Avg Loss: 0.6767847668379545
Epoch: 2 | Avg Loss: 0.6711772419512272
Epoch: 3 | Avg Loss: 0.6713369693607092
Epoch: 4 | Avg Loss: 0.6683808341622353
Epoch: 5 | Avg Loss: 0.6566351186484098
Epoch: 6 | Avg Loss: 0.6355478540062904
Epoch: 7 | Avg Loss: 0.6085198409855366
Epoch: 8 | Avg Loss: 0.6021594144403934
Epoch: 9 | Avg Loss: 0.6012177541851997
Epoch: 10 | Avg Loss: 0.5923305070027709
Epoch: 11 | Avg Loss: 0.5847394037991762
Epoch: 12 | Avg Loss: 0.5801328886300325
Epoch: 13 | Avg Loss: 0.5804642252624035
Epoch: 14 | Avg Loss: 0.5813030395656824
Epoch: 15 | Avg Loss: 0.5806224681437016
Epoch: 16 | Avg Loss: 0.5848505068570375
Epoch: 17 | Avg Loss: 0.5822806693613529
Epoch: 18 | Avg Loss: 0.582785808481276
Epoch: 19 | Avg Loss: 0.5795560879632831


## Evaluation

In [0]:
def evaluate(model, data_generator):
    """Evaluates a model given a validation/test set"""

    errors = []

    with torch.no_grad():
        for padded_tokens, token_lengths, labels in data_generator:
            padded_tokens = padded_tokens.to(device)
            
            model_trait_scores = model(padded_tokens)            
            model_trait_scores = model_trait_scores.cpu()

            errors = mean_absolute_error(labels, model_trait_scores)
        
    accuracy = 1 - np.mean(errors)
    print("Accuracy: " + str(accuracy) + '\n')

In [11]:
"""Evaluates saved models"""

# Hyperparameters
dataloader_params = {
    "batch_size": 16,
    "shuffle": False,
    "num_workers": 6
}

print("Begin Evaluation...\n")

training_generator, validation_generator, test_generator = generate_dataset(trait, dataloader_params)

for trait in TRAITS:
    model = LstmModel()
    model.load_state_dict(torch.load(f"{DRIVE_PATH}/{trait}_model_state.pth"))
    model.eval()

    print("Evaluating " + trait)

    evaluate(model, test_generator)

Begin Evaluation...



KeyboardInterrupt: ignored