This notebook was downloaded and adapted from the GitHub repository of the MoralBERT authors (https://github.com/vjosapreniqi/MoralBERT/tree/main).

# Predicting Moral Values in Text
### This code predicts moral values in text using the MoralBERT weights deployed on Hugging Face.

In [None]:
# Libraries:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by this notebook
# all live under that mount point.
from google.colab import drive
# NOTE(review): `os` is not referenced by any of the visible cells.
import os
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Start from the CPU and switch to the GPU only when PyTorch reports one.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

'cuda'

### Function to create documents for analyses

In [None]:
# Paths of the three input CSV files on the mounted Drive, one per corpus.
# Each file is later read by create_documents(), which uses its `score` and
# `cleaned_text` columns.
selfimprovement = "/content/drive/My Drive/UChicago/Tesis/cleaned_data1.csv"
investing = "/content/drive/My Drive/UChicago/Tesis/investing_cleaned.csv"
homeowners = "/content/drive/My Drive/UChicago/Tesis/homeowners_cleaned.csv"

In [None]:
def create_documents(path, n):
    """Return the texts of the ``n`` highest-scoring rows of a CSV file.

    Parameters
    ----------
    path : str or path-like
        CSV file containing a ``score`` column and a ``cleaned_text`` column.
    n : int
        Maximum number of documents to return.

    Returns
    -------
    list of str
        ``cleaned_text`` values ordered by descending ``score``. Rows whose
        text is missing are skipped, so the list can be shorter than ``n``.

    Raises
    ------
    KeyError
        If the file lacks the ``score`` or ``cleaned_text`` column.
    """
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type dtype warnings.
    df = pd.read_csv(path, low_memory=False)

    # An empty text cell is read as NaN (a float), which is not usable as
    # text input downstream; drop those rows *before* picking the top n so
    # they do not take up slots.
    with_text = df.dropna(subset=['cleaned_text'])

    # A stable sort keeps rows with equal scores in file order, so the
    # selection is reproducible from run to run.
    ranked = with_text.sort_values(by='score', ascending=False, kind='stable')

    # Cast to str so every returned document is a plain Python string.
    return ranked['cleaned_text'].head(n).astype(str).tolist()

In [None]:
# Build one document list per corpus: the 200 top-scoring texts of each CSV.
selfimprovement_list = create_documents(path=selfimprovement, n=200)
investing_list = create_documents(path=investing, n=200)
homeowners_list = create_documents(path=homeowners, n=200)

  df = pd.read_csv(path)
  df = pd.read_csv(path)


In [None]:
# BERT model and tokenizer:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
_BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(_BERT_CHECKPOINT)

In [None]:
class MyModel(
    nn.Module,
    PyTorchModelHubMixin,
    # optionally, you can add metadata which gets pushed to the model card
    # repo_url="your-repo-url",
    pipeline_tag="text-classification",
    license="mit",
):
    """Encoder followed by a linear transform and a two-layer classification head.

    The attribute names (``bert``, ``invariant_trans``, ``moral_classification``)
    and the order of the layers determine the parameter names in the state
    dict, so they must stay as they are for previously saved weights to load.
    """

    def __init__(self, bert_model, moral_label=2):
        """
        Args:
            bert_model: encoder module whose output exposes ``last_hidden_state``.
            moral_label: number of output classes of the classification head.
        """
        super().__init__()
        self.bert = bert_model

        # Width of the encoder's hidden states. Read from the encoder's config
        # when it has one; otherwise fall back to 768, the value that was
        # previously hard-coded for every layer.
        bert_dim = getattr(getattr(bert_model, "config", None), "hidden_size", 768)

        self.invariant_trans = nn.Linear(bert_dim, bert_dim)
        self.moral_classification = nn.Sequential(
            nn.Linear(bert_dim, bert_dim),
            nn.ReLU(),
            nn.Linear(bert_dim, moral_label),
        )

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Return the raw (pre-softmax) class logits for a batch of encoded texts.

        Args:
            input_ids: token ids, passed to the encoder unchanged.
            token_type_ids: optional segment ids, forwarded to the encoder.
            attention_mask: optional padding mask, forwarded to the encoder.

        Returns:
            Tensor of logits with ``moral_label`` entries per input row.
        """
        encoder_output = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )

        # Keep only the hidden state at sequence position 0 as the summary of
        # the whole input.
        pooled_output = encoder_output.last_hidden_state[:, 0, :]

        pooled_output = self.invariant_trans(pooled_output)

        return self.moral_classification(pooled_output)

In [None]:
def preprocessing(input_text, tokenizer, max_length=150):
    '''
    Encode one text as fixed-length PyTorch tensors.

    Args:
        input_text: the text to encode.
        tokenizer: tokenizer object exposing `encode_plus`.
        max_length: length every encoding is padded or truncated to
            (defaults to 150, the value this notebook has always used).

    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
    - input_ids: list of token ids
    - token_type_ids: list of token type ids
    - attention_mask: list of indices (0,1) specifying which tokens should be considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',       # pad shorter inputs up to max_length
        truncation=True,            # cut longer inputs down to max_length
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

In [None]:
# Moral Foundations Theory (MFT) labels, one pair per line.
# Each entry is also the name fragment used to pick a MoralBERT checkpoint.
mft_values = [
    "care", "harm",
    "fairness", "cheating",
    "loyalty", "betrayal",
    "authority", "subversion",
    "purity", "degradation",
]

def _load_moral_model(mft):
    """Load the MoralBERT classifier for one moral value, ready for inference."""
    repo_name = f"vjosap/moralBERT-predict-{mft}-in-text"

    # Every classifier is built around the same shared `bert_model` object, so
    # loading a checkpoint is expected to overwrite that encoder's weights in
    # place: only the most recently loaded classifier should be used.
    model = MyModel.from_pretrained(repo_name, bert_model=bert_model)

    model.to(device)   # run on the device selected at the top of the notebook
    model.eval()       # inference mode (e.g. disables dropout)
    return model


def _score_sentence(model, sentence):
    """Return the softmax probability of class index 1 for one sentence."""
    encodeds = preprocessing(sentence, tokenizer)
    # The inputs have to live on the same device as the model.
    inputs = {name: tensor.to(device) for name, tensor in encodeds.items()}

    # Pure inference: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = model(**inputs)

    score = F.softmax(output, dim=1)
    return score[0, 1].item()


# function to load the model, predict the score, and return the second value
def get_model_score(sentence, mft, model=None):
    """Score one sentence for one moral value.

    Args:
        sentence: text to score.
        mft: moral value name; selects the Hugging Face checkpoint
            ``vjosap/moralBERT-predict-{mft}-in-text``.
        model: optional classifier that is already loaded. When given it is
            used as is and nothing is loaded; when omitted the checkpoint for
            ``mft`` is loaded on every call (the original behaviour).

    Returns:
        float: the second value (index 1) of the softmax over the model output.
    """
    if model is None:
        model = _load_moral_model(mft)
    return _score_sentence(model, sentence)


def analyze_corpus(sentences, corpus_name):
    """Score every sentence for every moral value and save the table as CSV.

    The output has one row per sentence: a ``sentence`` column followed by one
    column per entry of ``mft_values``. It is written to
    ``MoralBERTresults-{corpus_name}.csv`` in the Drive folder.

    Args:
        sentences: iterable of texts to score.
        corpus_name: label inserted into the output file name.
    """
    # one result row per sentence; the score columns are filled in below
    results = [{"sentence": sentence} for sentence in sentences]

    if results:  # nothing to score -> do not load any checkpoint
        # Loop over the moral values on the outside so that each checkpoint is
        # loaded once per corpus instead of once per sentence.
        for mft in mft_values:
            model = _load_moral_model(mft)
            for row in results:
                row[mft] = get_model_score(row["sentence"], mft, model=model)

    results_df = pd.DataFrame(results)

    # save the DataFrame to a CSV file
    results_df.to_csv("/content/drive/My Drive/UChicago/Tesis/MoralBERTresults-{}.csv".format(corpus_name), index=False)


In [None]:
analyze_corpus(selfimprovement_list, "selfimprovement")

In [None]:
analyze_corpus(investing_list, "investing")

In [None]:
analyze_corpus(homeowners_list, "homeowners")